# Preprocessing data Youtube Rewind 2021

#### Import library

In [1]:
from dotenv import load_dotenv
from pymongo import MongoClient
import os
import pandas as pd
import re

In [2]:
# Memuat value dari file .env
load_dotenv()

mongodb_url = os.getenv('URL_SANDY')
local_url = os.getenv('URL_LOCAL')

In [3]:
database_name = "youtube_rewind_indonesia"
collection_from = "cleansing_2021"
collection_name = "preprocessing_2021"

#### Import dataset dari MongoDB

In [4]:
# Koneksi ke MongoDB
client = MongoClient(local_url)

# Pilih database dan koleksi
db = client[database_name]
collection = db[collection_from]

# Query data dari koleksi
cursor = collection.find()

# Konversi cursor ke list dan kemudian ke DataFrame
data = list(cursor)
df = pd.DataFrame(data)

# Menutup koneksi
client.close()

In [5]:
df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,REWIND INDONESIA 2021.#RI2021\nYuk bikin video...
1,6651666d9da7c08a9d7f3bbf,.
2,6651666d9da7c08a9d7f3bc0,❤❤❤❤❤😊😊😊😢😢😢😢
3,6651666d9da7c08a9d7f3bc2,easter egg tapi rewind indonesia 2021
4,6651666d9da7c08a9d7f3bc3,"Anjirrrr , Nonton lagi dan ternyata ada Lord U..."
...,...,...
59536,6651673e9da7c08a9d804a6c,MasyaAllah keren sekaliiiii i love u🥺🤍🤍🤍🤍!!!!
59537,6651673e9da7c08a9d804a6d,MERINDING👏
59538,6651673e9da7c08a9d804a6e,KERENN🫂
59539,6651673e9da7c08a9d804a6f,Keyennnnnn


#### Mengubah data text menjadi lowercase

In [6]:
df['textOriginal'] = df['textOriginal'].str.lower()

df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,rewind indonesia 2021.#ri2021\nyuk bikin video...
1,6651666d9da7c08a9d7f3bbf,.
2,6651666d9da7c08a9d7f3bc0,❤❤❤❤❤😊😊😊😢😢😢😢
3,6651666d9da7c08a9d7f3bc2,easter egg tapi rewind indonesia 2021
4,6651666d9da7c08a9d7f3bc3,"anjirrrr , nonton lagi dan ternyata ada lord u..."
...,...,...
59536,6651673e9da7c08a9d804a6c,masyaallah keren sekaliiiii i love u🥺🤍🤍🤍🤍!!!!
59537,6651673e9da7c08a9d804a6d,merinding👏
59538,6651673e9da7c08a9d804a6e,kerenn🫂
59539,6651673e9da7c08a9d804a6f,keyennnnnn


#### Menghapus data dengan atribut text null

In [7]:
df = df[df['textOriginal'] != '']
df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,rewind indonesia 2021.#ri2021\nyuk bikin video...
1,6651666d9da7c08a9d7f3bbf,.
2,6651666d9da7c08a9d7f3bc0,❤❤❤❤❤😊😊😊😢😢😢😢
3,6651666d9da7c08a9d7f3bc2,easter egg tapi rewind indonesia 2021
4,6651666d9da7c08a9d7f3bc3,"anjirrrr , nonton lagi dan ternyata ada lord u..."
...,...,...
59536,6651673e9da7c08a9d804a6c,masyaallah keren sekaliiiii i love u🥺🤍🤍🤍🤍!!!!
59537,6651673e9da7c08a9d804a6d,merinding👏
59538,6651673e9da7c08a9d804a6e,kerenn🫂
59539,6651673e9da7c08a9d804a6f,keyennnnnn


#### Menghapus baris dengan nilai yang hanya berupa numeric pada kolom 'textOriginal'

In [8]:
df = df[~df['textOriginal'].str.isnumeric()]

df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,rewind indonesia 2021.#ri2021\nyuk bikin video...
1,6651666d9da7c08a9d7f3bbf,.
2,6651666d9da7c08a9d7f3bc0,❤❤❤❤❤😊😊😊😢😢😢😢
3,6651666d9da7c08a9d7f3bc2,easter egg tapi rewind indonesia 2021
4,6651666d9da7c08a9d7f3bc3,"anjirrrr , nonton lagi dan ternyata ada lord u..."
...,...,...
59536,6651673e9da7c08a9d804a6c,masyaallah keren sekaliiiii i love u🥺🤍🤍🤍🤍!!!!
59537,6651673e9da7c08a9d804a6d,merinding👏
59538,6651673e9da7c08a9d804a6e,kerenn🫂
59539,6651673e9da7c08a9d804a6f,keyennnnnn


#### Fungsi untuk membersihkan teks dari beberapa pola yang umum ditemukan dalam data teks

In [9]:
def clean_data_text(text):
    # Menghapus username
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Menghapus hashtag
    text = re.sub(r'#\w+', '', text)
    # Menghapus angka
    text = re.sub(r'\d+', '', text)
    # Menghapus 'RT' (retweet)
    text = re.sub(r'RT[\s]+', '', text)
    # Menghapus URL
    text = re.sub(r'https?://\S+', '', text)
    # Menghapus karakter non-alfanumerik
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    # Menghapus spasi ekstra di awal dan akhir teks, serta spasi berlebih di antara kata
    text = re.sub(r'\s+', ' ', text).strip()
    # Mencari dan menghapus karakter tunggal yang terpisah dari kata-kata
    text = re.sub(r'\b\w\b', '', text)
    # Menghapus angka tunggal
    text = re.sub(r'\b\d+\b', '', text)
    # Menghapus karakter berulang, misalnya: 'aa', 'bbb', 'ccc'
    text = re.sub(r'(.)\1+', r'\1', text)
    # Menghapus emotikon
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    return text

In [10]:
# Apply clean_data_text pada dataframe
# Menggunakan .loc untuk mengakses DataFrame
df.loc[:, 'textOriginal'] = df['textOriginal'].apply(clean_data_text)

df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,rewind indonesia yuk bikin video nobarnontonre...
1,6651666d9da7c08a9d7f3bbf,
2,6651666d9da7c08a9d7f3bc0,
3,6651666d9da7c08a9d7f3bc2,easter eg tapi rewind indonesia
4,6651666d9da7c08a9d7f3bc3,anjir nonton lagi dan ternyata ada lord ulti n...
...,...,...
59536,6651673e9da7c08a9d804a6c,masyalah keren sekali love
59537,6651673e9da7c08a9d804a6d,merinding
59538,6651673e9da7c08a9d804a6e,keren
59539,6651673e9da7c08a9d804a6f,keyen


#### Fungsi untuk membersihkan teks dari kata yang hanya 1 huruf

In [11]:
def remove_single_letter_words(text):
    return re.sub(r'\b\w\b', '', text).strip()

In [12]:
# Apply remove_single_letter_words pada dataframe
# Menggunakan .loc untuk mengakses DataFrame
df.loc[:, 'textOriginal'] = df['textOriginal'].apply(remove_single_letter_words)

df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,rewind indonesia yuk bikin video nobarnontonre...
1,6651666d9da7c08a9d7f3bbf,
2,6651666d9da7c08a9d7f3bc0,
3,6651666d9da7c08a9d7f3bc2,easter eg tapi rewind indonesia
4,6651666d9da7c08a9d7f3bc3,anjir nonton lagi dan ternyata ada lord ulti n...
...,...,...
59536,6651673e9da7c08a9d804a6c,masyalah keren sekali love
59537,6651673e9da7c08a9d804a6d,merinding
59538,6651673e9da7c08a9d804a6e,keren
59539,6651673e9da7c08a9d804a6f,keyen


#### Memeriksa panjang teks pada kolom 'textOriginal' dan menyimpan hanya baris dengan panjang antara 5 sampai 200 karakter

In [13]:
df = df[(df['textOriginal'].str.len() >= 5) & (df['textOriginal'].str.len() <= 200)]

df

Unnamed: 0,_id,textOriginal
0,6651666d9da7c08a9d7f3bbe,rewind indonesia yuk bikin video nobarnontonre...
3,6651666d9da7c08a9d7f3bc2,easter eg tapi rewind indonesia
4,6651666d9da7c08a9d7f3bc3,anjir nonton lagi dan ternyata ada lord ulti n...
5,6651666d9da7c08a9d7f3bc4,wow mantap kali nih rewind nya indonesia nih p...
6,6651666d9da7c08a9d7f3bc5,masyalah rewind terkeren sementara ini cuma re...
...,...,...
59536,6651673e9da7c08a9d804a6c,masyalah keren sekali love
59537,6651673e9da7c08a9d804a6d,merinding
59538,6651673e9da7c08a9d804a6e,keren
59539,6651673e9da7c08a9d804a6f,keyen


### Memilih hanya 10000 data sebagai sampel dari keseluruhan dataset yang telah bersih

In [14]:
# Memilih 10.000 data secara acak
df_sampled = df.sample(n=10000, random_state=42)

df_sampled

Unnamed: 0,_id,textOriginal
27872,665166cb9da7c08a9d7fb6d5,semua cerita dan pesan sangat mudah di cerna g...
49426,6651671f9da7c08a9d801b1f,ada salam dari binjai wkwk
20535,665166b69da7c08a9d7f95c6,keren banget banget banget plis dari audio kua...
46606,665167149da7c08a9d800dbb,the best ni
11586,6651668e9da7c08a9d7f6df2,keren tidak keren sangat keren
...,...,...
45019,6651670f9da7c08a9d80062a,dari awal sampai selesai gk berhenti berlinang...
16840,665166a19da7c08a9d7f8543,ada bryan furan anjay
41932,665167059da7c08a9d7ff7b9,wagelaseh fel nya bikin gabisa berkata ikut se...
30347,665166d69da7c08a9d7fc22f,halilingis mana si number one wkwowkwk


### Normalisasi

In [15]:
norm = {
    " yg ": " yang ",
    " kren ": " keren",
    " sdh ": " sudah ",
    " dgn ": " dengan ",
    " tdk ": " tidak ",
    " dlm ": " dalam ",
    " dl ": " dulu ",
    " gw ": " saya ",
    " kok ": " kenapa ",
    " bs ": " bisa ",
    " bgt ": " sangat ",
    " njir ": " kagum ",
    " anjir ": " kagum ",
    " g ": " tidak ",
    " lg ": " lagi ",
    " dr ": " dari ",
    " tp ": " tapi ",
    " jg ": " juga ",
    " sm ": " sama ",
    " krn ": " karena ",
    " spt ": " seperti ",
    " yg ": " yang ",
    " pd ": " pada ",
    " dn ": " dan ",
    " d ": " di ",
    " kt ": " kita ",
    " km ": " kamu ",
    " ms ": " masih ",
    " tlg ": " tolong ",
    " sja ": " saja ",
    " sy ": " saya ",
    " sdikit ": " sedikit ",
    " dket ": " dekat ",
    " lbh ": " lebih ",
    " sj ": " saja ",
    " bnyk ": " banyak ",
    " dgnnya ": " dengannya ",
    " sgala ": " segala ",
    " skrg ": " sekarang ",
    " nmr ": " nomor ",
    " srg ": " sering ",
    " ttg ": " tentang ",
    " smua ": " semua ",
    " trus ": " terus ",
    " pdhl ": " padahal ",
    " kt ": " kita ",
    " dkt ": " dekat ",
    " jg ": " juga ",
    " dlm ": " dalam ",
    " bhw ": " bahwa ",
    " sja ": " saja ",
    " mrk ": " mereka ",
    " sgth ": " sangat ",
    " dn ": " dan ",
    " trlalu ": " terlalu ",
    " msing2 ": " masing-masing ",
    " brp ": " berapa ",
    " aq ": " aku ",
    " smpe ": " sampai ",
    " kalo ": " kalau ",
    " bkn ": " bukan ",
    " jd ": " jadi ",
    " dket ": " dekat ",
    " ntr ": " nanti ",
    " klo ": " kalau ",
    " sda ": " sudah ",
    " sy ": " saya ",
    " kpn ": " kapan ",
    " dmn ": " dimana ",
    " lgsng ": " langsung ",
    " tdk ": " tidak ",
    " pgi ": " pagi ",
    " ngga ": " tidak ",
    " gak ": " tidak ",
    " lbih ": " lebih ",
    " cm ": " cuman ",
    " sgt ": " sangat ",
    " aj ": " saja ",
    " apa2 ": " apa-apa ",
    " krg ": " kurang ",
    " ngerti ": " mengerti ",
    " slalu ": " selalu ",
    " bbrp ": " beberapa ",
    " smpe ": " sampai ",
    " krja ": " kerja ",
    " bbrp ": " beberapa ",
    " jln ": " jalan ",
    " hr ": " hari ",
    " kok ": " kenapa ",
    " mgkin ": " mungkin "
    # Tambahkan aturan normalisasi lainnya di sini sesuai kebutuhan
}

#### Fungsi normalisasi

In [16]:
def normalisasi(text):
    for i in norm:
        text = text.replace(i, norm[i])
    return text

In [17]:
df_sampled.loc[:, 'textOriginal'] = df_sampled['textOriginal'].apply(normalisasi)

df_sampled

Unnamed: 0,_id,textOriginal
27872,665166cb9da7c08a9d7fb6d5,semua cerita dan pesan sangat mudah di cerna g...
49426,6651671f9da7c08a9d801b1f,ada salam dari binjai wkwk
20535,665166b69da7c08a9d7f95c6,keren banget banget banget plis dari audio kua...
46606,665167149da7c08a9d800dbb,the best ni
11586,6651668e9da7c08a9d7f6df2,keren tidak keren sangat keren
...,...,...
45019,6651670f9da7c08a9d80062a,dari awal sampai selesai gk berhenti berlinang...
16840,665166a19da7c08a9d7f8543,ada bryan furan anjay
41932,665167059da7c08a9d7ff7b9,wagelaseh fel nya bikin gabisa berkata ikut se...
30347,665166d69da7c08a9d7fc22f,halilingis mana si number one wkwowkwk


## Stopwords

In [18]:
import Sastrawi

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

more_stop_words = ["tidak"]

stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(more_stop_words)

new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

def stopword(str_text):
  str_text = stop_words_remover_new.remove(str_text)
  return str_text

In [19]:
df_sampled.loc[:, 'textOriginal'] = df_sampled['textOriginal'].apply(stopword)

df_sampled

Unnamed: 0,_id,textOriginal
27872,665166cb9da7c08a9d7fb6d5,semua cerita pesan sangat mudah cerna god job ...
49426,6651671f9da7c08a9d801b1f,salam binjai wkwk
20535,665166b69da7c08a9d7f95c6,keren banget banget banget plis audio kualitas...
46606,665167149da7c08a9d800dbb,the best ni
11586,6651668e9da7c08a9d7f6df2,keren keren sangat keren
...,...,...
45019,6651670f9da7c08a9d80062a,awal selesai gk berhenti berlinang air mata su...
16840,665166a19da7c08a9d7f8543,bryan furan anjay
41932,665167059da7c08a9d7ff7b9,wagelaseh fel nya bikin gabisa berkata ikut se...
30347,665166d69da7c08a9d7fc22f,halilingis mana si number one wkwowkwk


## Tokenisasi
#### Memisahkan kalimat menjadi kata-kata

In [20]:
tokenized = df_sampled['textOriginal'].apply(lambda x:x.split())
tokenized

27872    [semua, cerita, pesan, sangat, mudah, cerna, g...
49426                                [salam, binjai, wkwk]
20535    [keren, banget, banget, banget, plis, audio, k...
46606                                      [the, best, ni]
11586                        [keren, keren, sangat, keren]
                               ...                        
45019    [awal, selesai, gk, berhenti, berlinang, air, ...
16840                                [bryan, furan, anjay]
41932    [wagelaseh, fel, nya, bikin, gabisa, berkata, ...
30347        [halilingis, mana, si, number, one, wkwowkwk]
9590     [chandra, liow, yt, rewind, di, directed, eric...
Name: textOriginal, Length: 10000, dtype: object

## Stemming
#### Mengubah kata berimbuhan menjadi kata dasar

In [21]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Inisialisasi counter
counter = 1

def stemming(text_cleaning):
    global counter
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    do = []
    for w in text_cleaning:
        dt = stemmer.stem(w)
        do.append(dt)
    d_clean = []
    d_clean = " ".join(do)
    print(f"{counter}: {d_clean}") 
    counter += 1
    return d_clean

Proses berikut agak lama karena data besar

In [22]:
tokenized = tokenized.apply(stemming)
tokenized

1: semua cerita pesan sangat mudah cerna god job merinding
2: salam binjai wkwk
3: keren banget banget banget plis audio kualitas vidionya bagus banget
4: the best ni
5: keren keren sangat keren
6: kecewa daoa ngak masuk
7: chandra top
8: maf gk berkatakata lagisebuah karya penuh totalitas penuh arti
9: bagus youtube rewind
10: rehan bogor mana indonesia mana
11: jelek ah sponsor makan tu sponsor kwkwkwkw
12: serius merinding
13: rio ajg gadav
14: rewind indonesia jelek pernah ada iklan kocak dek wkwkwkw
15: bagus video maksud jalan cerita apayah
16: keren asli keren
17: merinding kalian keren banget buat video dan semua libat so proud to be indonesians
18: keren bet
19: binjai kece
20: the best lah
21: kece keren sangat so proud of you guys
22: keren pol
23: aku kira brando
24: keren parah ga tau lah
25: sangar
26: keren
27: jadi inget kata chandra waktu podcastnya om dedy
28: jos terimakasih matursuwun thanks
29: tidak yah rewind aku bareng
30: jerome out of expectation
31: keren kec

27872    semua cerita pesan sangat mudah cerna god job ...
49426                                    salam binjai wkwk
20535    keren banget banget banget plis audio kualitas...
46606                                          the best ni
11586                             keren keren sangat keren
                               ...                        
45019    awal selesai gk henti linang air mata sumpah t...
16840                                    bryan furan anjay
41932    wagelaseh fel nya bikin gabisa kata ikut sesak...
30347               halilingis mana si number one wkwowkwk
9590     chandra liow yt rewind di directed ericko lim ...
Name: textOriginal, Length: 10000, dtype: object

In [23]:
tokenized

27872    semua cerita pesan sangat mudah cerna god job ...
49426                                    salam binjai wkwk
20535    keren banget banget banget plis audio kualitas...
46606                                          the best ni
11586                             keren keren sangat keren
                               ...                        
45019    awal selesai gk henti linang air mata sumpah t...
16840                                    bryan furan anjay
41932    wagelaseh fel nya bikin gabisa kata ikut sesak...
30347               halilingis mana si number one wkwowkwk
9590     chandra liow yt rewind di directed ericko lim ...
Name: textOriginal, Length: 10000, dtype: object

#### Mengimpan Tokenized Comments ke MongoDB Local

In [24]:
# Koneksi ke MongoDB
client = MongoClient(local_url)
db = client[database_name]
collection = db[collection_name]

# Konversi tokenized menjadi list of dictionaries
data_to_insert = [{'textOriginal': text} for text in tokenized]

# Masukkan data ke dalam koleksi MongoDB
collection.insert_many(data_to_insert)

print("Data berhasil diimpor ke MongoDB Local.")

Data berhasil diimpor ke MongoDB Local.
