# Preprocessing data Youtube Rewind 2016

#### Import library

In [1]:
from dotenv import load_dotenv
from pymongo import MongoClient
import os
import pandas as pd
import re

In [2]:
# Memuat value dari file .env
load_dotenv()

mongodb_url = os.getenv('URL_SANDY')
local_url = os.getenv('URL_LOCAL')

In [3]:
database_name = "youtube_rewind_indonesia"
collection_from = "cleansing_2016"
collection_name = "preprocessing_2016"

#### Import dataset dari MongoDB

In [4]:
# Koneksi ke MongoDB
client = MongoClient(local_url)

# Pilih database dan koleksi
db = client[database_name]
collection = db[collection_from]

# Query data dari koleksi
cursor = collection.find()

# Konversi cursor ke list dan kemudian ke DataFrame
data = list(cursor)
df = pd.DataFrame(data)

# Menutup koneksi
client.close()

In [5]:
df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,Rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,3:30 BeaconCream🗿🗿
2,66515ed9d882d1f847df4471,"Seru banget liat ulang"" dari yang 2023"
3,66515ed9d882d1f847df4472,Haii aku dari 2024🥹🇮🇩
4,66515ed9d882d1f847df4473,Halo dari 2024
...,...,...
57701,66515f99d882d1f847e05655,t
57702,66515f99d882d1f847e05659,firt
57703,66515f99d882d1f847e0565f,firstttt
57704,66515f99d882d1f847e05662,Firs


#### Mengubah data text menjadi lowercase

In [6]:
df['textOriginal'] = df['textOriginal'].str.lower()

df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,3:30 beaconcream🗿🗿
2,66515ed9d882d1f847df4471,"seru banget liat ulang"" dari yang 2023"
3,66515ed9d882d1f847df4472,haii aku dari 2024🥹🇮🇩
4,66515ed9d882d1f847df4473,halo dari 2024
...,...,...
57701,66515f99d882d1f847e05655,t
57702,66515f99d882d1f847e05659,firt
57703,66515f99d882d1f847e0565f,firstttt
57704,66515f99d882d1f847e05662,firs


#### Menghapus data dengan atribut text null

In [7]:
df = df[df['textOriginal'] != '']
df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,3:30 beaconcream🗿🗿
2,66515ed9d882d1f847df4471,"seru banget liat ulang"" dari yang 2023"
3,66515ed9d882d1f847df4472,haii aku dari 2024🥹🇮🇩
4,66515ed9d882d1f847df4473,halo dari 2024
...,...,...
57701,66515f99d882d1f847e05655,t
57702,66515f99d882d1f847e05659,firt
57703,66515f99d882d1f847e0565f,firstttt
57704,66515f99d882d1f847e05662,firs


#### Menghapus baris dengan nilai yang hanya berupa numeric pada kolom 'textOriginal'

In [8]:
df = df[~df['textOriginal'].str.isnumeric()]

df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,3:30 beaconcream🗿🗿
2,66515ed9d882d1f847df4471,"seru banget liat ulang"" dari yang 2023"
3,66515ed9d882d1f847df4472,haii aku dari 2024🥹🇮🇩
4,66515ed9d882d1f847df4473,halo dari 2024
...,...,...
57701,66515f99d882d1f847e05655,t
57702,66515f99d882d1f847e05659,firt
57703,66515f99d882d1f847e0565f,firstttt
57704,66515f99d882d1f847e05662,firs


#### Fungsi untuk membersihkan teks dari beberapa pola yang umum ditemukan dalam data teks

In [9]:
def clean_data_text(text):
    # Menghapus username
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Menghapus hashtag
    text = re.sub(r'#\w+', '', text)
    # Menghapus angka
    text = re.sub(r'\d+', '', text)
    # Menghapus 'RT' (retweet)
    text = re.sub(r'RT[\s]+', '', text)
    # Menghapus URL
    text = re.sub(r'https?://\S+', '', text)
    # Menghapus karakter non-alfanumerik
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    # Menghapus spasi ekstra di awal dan akhir teks, serta spasi berlebih di antara kata
    text = re.sub(r'\s+', ' ', text).strip()
    # Mencari dan menghapus karakter tunggal yang terpisah dari kata-kata
    text = re.sub(r'\b\w\b', '', text)
    # Menghapus angka tunggal
    text = re.sub(r'\b\d+\b', '', text)
    # Menghapus karakter berulang, misalnya: 'aa', 'bbb', 'ccc'
    text = re.sub(r'(.)\1+', r'\1', text)
    # Menghapus emotikon
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    return text

In [10]:
# Apply clean_data_text pada dataframe
# Menggunakan .loc untuk mengakses DataFrame
df.loc[:, 'textOriginal'] = df['textOriginal'].apply(clean_data_text)

df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,beaconcream
2,66515ed9d882d1f847df4471,seru banget liat ulang dari yang
3,66515ed9d882d1f847df4472,hai aku dari
4,66515ed9d882d1f847df4473,halo dari
...,...,...
57701,66515f99d882d1f847e05655,
57702,66515f99d882d1f847e05659,firt
57703,66515f99d882d1f847e0565f,first
57704,66515f99d882d1f847e05662,firs


#### Fungsi untuk membersihkan teks dari kata yang hanya 1 huruf

In [11]:
def remove_single_letter_words(text):
    return re.sub(r'\b\w\b', '', text).strip()

In [12]:
# Apply remove_single_letter_words pada dataframe
# Menggunakan .loc untuk mengakses DataFrame
df.loc[:, 'textOriginal'] = df['textOriginal'].apply(remove_single_letter_words)

df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,beaconcream
2,66515ed9d882d1f847df4471,seru banget liat ulang dari yang
3,66515ed9d882d1f847df4472,hai aku dari
4,66515ed9d882d1f847df4473,halo dari
...,...,...
57701,66515f99d882d1f847e05655,
57702,66515f99d882d1f847e05659,firt
57703,66515f99d882d1f847e0565f,first
57704,66515f99d882d1f847e05662,firs


#### Memeriksa panjang teks pada kolom 'textOriginal' dan menyimpan hanya baris dengan panjang antara 5 sampai 200 karakter

In [13]:
df = df[(df['textOriginal'].str.len() >= 5) & (df['textOriginal'].str.len() <= 200)]

df

Unnamed: 0,_id,textOriginal
0,66515ed9d882d1f847df446f,rewind paling bnyk memori soalnya di jaman itu...
1,66515ed9d882d1f847df4470,beaconcream
2,66515ed9d882d1f847df4471,seru banget liat ulang dari yang
3,66515ed9d882d1f847df4472,hai aku dari
4,66515ed9d882d1f847df4473,halo dari
...,...,...
57697,66515f98d882d1f847e05641,first viewer
57699,66515f99d882d1f847e05646,wakakakak
57700,66515f99d882d1f847e0564c,frist
57703,66515f99d882d1f847e0565f,first


### Memilih hanya 10000 data sebagai sampel dari keseluruhan dataset yang telah bersih

In [14]:
# Memilih 10.000 data secara acak
df_sampled = df.sample(n=10000, random_state=42)

df_sampled

Unnamed: 0,_id,textOriginal
14304,66515f00d882d1f847df833b,eka gustiwana pengaruh besar di ytrewind musik...
39741,66515f57d882d1f847dffab3,this felt so much beter to watch compared to y...
19641,66515f0fd882d1f847df9a57,stil man teman
24441,66515f22d882d1f847dfb084,semua indah sebelum si pengacau datang
5388,66515ee7d882d1f847df5b69,menit yg tengah rachel venya bukan ya
...,...,...
56751,66515f94d882d1f847e04edd,oh may god job keren
57235,66515f96d882d1f847e051cd,mantap lah ini yt rewind the bigest manequin c...
51246,66515f7fd882d1f847e03241,edhozel gak ada
28159,66515f2dd882d1f847dfc282,pengen ikut di tahun


### Normalisasi

In [15]:
norm = {
    " yg ": " yang ",
    " kren ": " keren",
    " sdh ": " sudah ",
    " dgn ": " dengan ",
    " tdk ": " tidak ",
    " dlm ": " dalam ",
    " dl ": " dulu ",
    " gw ": " saya ",
    " kok ": " kenapa ",
    " bs ": " bisa ",
    " bgt ": " sangat ",
    " njir ": " kagum ",
    " anjir ": " kagum ",
    " g ": " tidak ",
    " lg ": " lagi ",
    " dr ": " dari ",
    " tp ": " tapi ",
    " jg ": " juga ",
    " sm ": " sama ",
    " krn ": " karena ",
    " spt ": " seperti ",
    " yg ": " yang ",
    " pd ": " pada ",
    " dn ": " dan ",
    " d ": " di ",
    " kt ": " kita ",
    " km ": " kamu ",
    " ms ": " masih ",
    " tlg ": " tolong ",
    " sja ": " saja ",
    " sy ": " saya ",
    " sdikit ": " sedikit ",
    " dket ": " dekat ",
    " lbh ": " lebih ",
    " sj ": " saja ",
    " bnyk ": " banyak ",
    " dgnnya ": " dengannya ",
    " sgala ": " segala ",
    " skrg ": " sekarang ",
    " nmr ": " nomor ",
    " srg ": " sering ",
    " ttg ": " tentang ",
    " smua ": " semua ",
    " trus ": " terus ",
    " pdhl ": " padahal ",
    " kt ": " kita ",
    " dkt ": " dekat ",
    " jg ": " juga ",
    " dlm ": " dalam ",
    " bhw ": " bahwa ",
    " sja ": " saja ",
    " mrk ": " mereka ",
    " sgth ": " sangat ",
    " dn ": " dan ",
    " trlalu ": " terlalu ",
    " msing2 ": " masing-masing ",
    " brp ": " berapa ",
    " aq ": " aku ",
    " smpe ": " sampai ",
    " kalo ": " kalau ",
    " bkn ": " bukan ",
    " jd ": " jadi ",
    " dket ": " dekat ",
    " ntr ": " nanti ",
    " klo ": " kalau ",
    " sda ": " sudah ",
    " sy ": " saya ",
    " kpn ": " kapan ",
    " dmn ": " dimana ",
    " lgsng ": " langsung ",
    " tdk ": " tidak ",
    " pgi ": " pagi ",
    " ngga ": " tidak ",
    " gak ": " tidak ",
    " lbih ": " lebih ",
    " cm ": " cuman ",
    " sgt ": " sangat ",
    " aj ": " saja ",
    " apa2 ": " apa-apa ",
    " krg ": " kurang ",
    " ngerti ": " mengerti ",
    " slalu ": " selalu ",
    " bbrp ": " beberapa ",
    " smpe ": " sampai ",
    " krja ": " kerja ",
    " bbrp ": " beberapa ",
    " jln ": " jalan ",
    " hr ": " hari ",
    " kok ": " kenapa ",
    " mgkin ": " mungkin "
    # Tambahkan aturan normalisasi lainnya di sini sesuai kebutuhan
}

#### Fungsi normalisasi

In [16]:
def normalisasi(text):
    for i in norm:
        text = text.replace(i, norm[i])
    return text

In [17]:
df_sampled.loc[:, 'textOriginal'] = df_sampled['textOriginal'].apply(normalisasi)

df_sampled

Unnamed: 0,_id,textOriginal
14304,66515f00d882d1f847df833b,eka gustiwana pengaruh besar di ytrewind musik...
39741,66515f57d882d1f847dffab3,this felt so much beter to watch compared to y...
19641,66515f0fd882d1f847df9a57,stil man teman
24441,66515f22d882d1f847dfb084,semua indah sebelum si pengacau datang
5388,66515ee7d882d1f847df5b69,menit yang tengah rachel venya bukan ya
...,...,...
56751,66515f94d882d1f847e04edd,oh may god job keren
57235,66515f96d882d1f847e051cd,mantap lah ini yt rewind the bigest manequin c...
51246,66515f7fd882d1f847e03241,edhozel tidak ada
28159,66515f2dd882d1f847dfc282,pengen ikut di tahun


## Stopwords

In [18]:
import Sastrawi

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

more_stop_words = ["tidak"]

stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(more_stop_words)

new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

def stopword(str_text):
  str_text = stop_words_remover_new.remove(str_text)
  return str_text

In [19]:
df_sampled.loc[:, 'textOriginal'] = df_sampled['textOriginal'].apply(stopword)

df_sampled

Unnamed: 0,_id,textOriginal
14304,66515f00d882d1f847df833b,eka gustiwana pengaruh besar ytrewind musik ny...
39741,66515f57d882d1f847dffab3,this felt so much beter to watch compared to y...
19641,66515f0fd882d1f847df9a57,stil man teman
24441,66515f22d882d1f847dfb084,semua indah si pengacau datang
5388,66515ee7d882d1f847df5b69,menit tengah rachel venya bukan
...,...,...
56751,66515f94d882d1f847e04edd,may god job keren
57235,66515f96d882d1f847e051cd,mantap lah yt rewind the bigest manequin chale...
51246,66515f7fd882d1f847e03241,edhozel ada
28159,66515f2dd882d1f847dfc282,pengen ikut tahun


## Tokenisasi
#### Memisahkan kalimat menjadi kata-kata

In [20]:
tokenized = df_sampled['textOriginal'].apply(lambda x:x.split())
tokenized

14304    [eka, gustiwana, pengaruh, besar, ytrewind, mu...
39741    [this, felt, so, much, beter, to, watch, compa...
19641                                   [stil, man, teman]
24441                 [semua, indah, si, pengacau, datang]
5388                 [menit, tengah, rachel, venya, bukan]
                               ...                        
56751                               [may, god, job, keren]
57235    [mantap, lah, yt, rewind, the, bigest, manequi...
51246                                       [edhozel, ada]
28159                                [pengen, ikut, tahun]
36113                                        [gak, edozel]
Name: textOriginal, Length: 10000, dtype: object

## Stemming
#### Mengubah kata berimbuhan menjadi kata dasar

In [21]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Inisialisasi counter
counter = 1

def stemming(text_cleaning):
    global counter
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    do = []
    for w in text_cleaning:
        dt = stemmer.stem(w)
        do.append(dt)
    d_clean = []
    d_clean = " ".join(do)
    print(f"{counter}: {d_clean}") 
    counter += 1
    return d_clean

Proses berikut agak lama karena data besar

In [22]:
tokenized = tokenized.apply(stemming)
tokenized

1: eka gustiwana pengaruh besar ytrewind musik nya ngalahin ytrewind ytrewind efek editanya mantul
2: this felt so much beter to watch compared to youtube rewind and didnt even understand what they were saying
3: stil man teman
4: semua indah si acau datang
5: menit tengah rachel venya bukan
6: tidak rewind
7: kesini karna liat podcast om dedy sama chandra liow
8: anjir keren merinding cikay cantik pis
9: best scene pap masuk manequin nya
10: god job
11: apa cuma sini nonton ulang cuma pgn liat scene rich brian
12: oktober masih nonton
13: tak tanding
14: bagus lagu satu jadi lagu youtube rewind
15: sumpah rindu
16: musicaly ga ya
17: keren banget sumpah
18: stil the best
19: where this old youtube
20: yg akhir miaw aug bukan rich chiga
21: siapa kecewa ytr gada ka nesie judge sama arap
22: is the best year ever
23: ewing hd
24: jir juga mau ikut kali
25: kok ga young lex ama awkarin
26: ga make tempat besar gbk aja udah keren gin apa make gbk
27: menit pake helm harald arkan apa bukan

14304    eka gustiwana pengaruh besar ytrewind musik ny...
39741    this felt so much beter to watch compared to y...
19641                                       stil man teman
24441                           semua indah si acau datang
5388                       menit tengah rachel venya bukan
                               ...                        
56751                                    may god job keren
57235    mantap lah yt rewind the bigest manequin chale...
51246                                          edhozel ada
28159                                       ken ikut tahun
36113                                           gak edozel
Name: textOriginal, Length: 10000, dtype: object

In [23]:
tokenized

14304    eka gustiwana pengaruh besar ytrewind musik ny...
39741    this felt so much beter to watch compared to y...
19641                                       stil man teman
24441                           semua indah si acau datang
5388                       menit tengah rachel venya bukan
                               ...                        
56751                                    may god job keren
57235    mantap lah yt rewind the bigest manequin chale...
51246                                          edhozel ada
28159                                       ken ikut tahun
36113                                           gak edozel
Name: textOriginal, Length: 10000, dtype: object

#### Mengimpan Tokenized Comments ke MongoDB Local

In [24]:
# Koneksi ke MongoDB
client = MongoClient(local_url)
db = client[database_name]
collection = db[collection_name]

# Konversi tokenized menjadi list of dictionaries
data_to_insert = [{'textOriginal': text} for text in tokenized]

# Masukkan data ke dalam koleksi MongoDB
collection.insert_many(data_to_insert)

print("Data berhasil diimpor ke MongoDB Local.")

Data berhasil diimpor ke MongoDB Local.
