# Preprocessing data Youtube Rewind 2015

#### Import library

In [1]:
from dotenv import load_dotenv
from pymongo import MongoClient
import os
import pandas as pd
import re

In [2]:
# Memuat value dari file .env
load_dotenv()

mongodb_url = os.getenv('URL_SANDY')
local_url = os.getenv('URL_LOCAL')

In [3]:
database_name = "youtube_rewind_indonesia"
collection_from = "cleansing_2015"
collection_name = "preprocessing_2015"

#### Import dataset dari MongoDB

In [4]:
# Koneksi ke MongoDB
client = MongoClient(local_url)

# Pilih database dan koleksi
db = client[database_name]
collection = db[collection_from]

# Query data dari koleksi
cursor = collection.find()

# Konversi cursor ke list dan kemudian ke DataFrame
data = list(cursor)
df = pd.DataFrame(data)

# Menutup koneksi
client.close()

In [6]:
df = df[['_id', 'textOriginal']]
df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,Masih the best di hati oe
1,66515de230ea1caf68a1005d,2024🎉
2,66515de230ea1caf68a1005e,3:34 scene yg gw suka
3,66515de230ea1caf68a1005f,"Diantara semua YouTube rewind, tahun ini yg pa..."
4,66515de230ea1caf68a10060,ada yang balik kesini di 2024 ini rewind terba...
...,...,...
24312,66515e2230ea1caf68a16f4d,woeee
24313,66515e2230ea1caf68a16f4e,1
24314,66515e2230ea1caf68a16f54,=))) kerenn
24315,66515e2230ea1caf68a16f55,Mantep :v


#### Mengubah data text menjadi lowercase

In [31]:
df['textOriginal'] = df['textOriginal'].str.lower()

df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,masih the best di hati oe
1,66515de230ea1caf68a1005d,2024🎉
2,66515de230ea1caf68a1005e,3:34 scene yg gw suka
3,66515de230ea1caf68a1005f,"diantara semua youtube rewind, tahun ini yg pa..."
4,66515de230ea1caf68a10060,ada yang balik kesini di 2024 ini rewind terba...
...,...,...
24312,66515e2230ea1caf68a16f4d,woeee
24313,66515e2230ea1caf68a16f4e,1
24314,66515e2230ea1caf68a16f54,=))) kerenn
24315,66515e2230ea1caf68a16f55,mantep :v


#### Menghapus data dengan atribut text null

In [32]:
df = df[df['textOriginal'] != '']
df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,masih the best di hati oe
1,66515de230ea1caf68a1005d,2024🎉
2,66515de230ea1caf68a1005e,3:34 scene yg gw suka
3,66515de230ea1caf68a1005f,"diantara semua youtube rewind, tahun ini yg pa..."
4,66515de230ea1caf68a10060,ada yang balik kesini di 2024 ini rewind terba...
...,...,...
24312,66515e2230ea1caf68a16f4d,woeee
24313,66515e2230ea1caf68a16f4e,1
24314,66515e2230ea1caf68a16f54,=))) kerenn
24315,66515e2230ea1caf68a16f55,mantep :v


#### Menghapus baris dengan nilai yang hanya berupa numeric pada kolom 'textOriginal'

In [33]:
df = df[~df['textOriginal'].str.isnumeric()]

df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,masih the best di hati oe
1,66515de230ea1caf68a1005d,2024🎉
2,66515de230ea1caf68a1005e,3:34 scene yg gw suka
3,66515de230ea1caf68a1005f,"diantara semua youtube rewind, tahun ini yg pa..."
4,66515de230ea1caf68a10060,ada yang balik kesini di 2024 ini rewind terba...
...,...,...
24311,66515e2230ea1caf68a16f4c,first?
24312,66515e2230ea1caf68a16f4d,woeee
24314,66515e2230ea1caf68a16f54,=))) kerenn
24315,66515e2230ea1caf68a16f55,mantep :v


#### Fungsi untuk membersihkan teks dari beberapa pola yang umum ditemukan dalam data teks

In [34]:
def clean_data_text(text):
    # Menghapus username
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Menghapus hashtag
    text = re.sub(r'#\w+', '', text)
    # Menghapus angka
    text = re.sub(r'\d+', '', text)
    # Menghapus 'RT' (retweet)
    text = re.sub(r'RT[\s]+', '', text)
    # Menghapus URL
    text = re.sub(r'https?://\S+', '', text)
    # Menghapus karakter non-alfanumerik
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    # Menghapus spasi ekstra di awal dan akhir teks, serta spasi berlebih di antara kata
    text = re.sub(r'\s+', ' ', text).strip()
    # Mencari dan menghapus karakter tunggal yang terpisah dari kata-kata
    text = re.sub(r'\b\w\b', '', text)
    # Menghapus angka tunggal
    text = re.sub(r'\b\d+\b', '', text)
    # Menghapus karakter berulang, misalnya: 'aa', 'bbb', 'ccc'
    text = re.sub(r'(.)\1+', r'\1', text)
    # Menghapus emotikon
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    return text

In [35]:
# Apply clean_data_text pada dataframe
# Menggunakan .loc untuk mengakses DataFrame
df.loc[:, 'textOriginal'] = df['textOriginal'].apply(clean_data_text)

df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,masih the best di hati oe
1,66515de230ea1caf68a1005d,
2,66515de230ea1caf68a1005e,scene yg gw suka
3,66515de230ea1caf68a1005f,diantara semua youtube rewind tahun ini yg pal...
4,66515de230ea1caf68a10060,ada yang balik kesini di ini rewind terbagus n...
...,...,...
24311,66515e2230ea1caf68a16f4c,first
24312,66515e2230ea1caf68a16f4d,woe
24314,66515e2230ea1caf68a16f54,keren
24315,66515e2230ea1caf68a16f55,mantep


#### Fungsi untuk membersihkan teks dari kata yang hanya 1 huruf

In [36]:
def remove_single_letter_words(text):
    return re.sub(r'\b\w\b', '', text).strip()

In [37]:
# Apply remove_single_letter_words pada dataframe
# Menggunakan .loc untuk mengakses DataFrame
df.loc[:, 'textOriginal'] = df['textOriginal'].apply(remove_single_letter_words)

df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,masih the best di hati oe
1,66515de230ea1caf68a1005d,
2,66515de230ea1caf68a1005e,scene yg gw suka
3,66515de230ea1caf68a1005f,diantara semua youtube rewind tahun ini yg pal...
4,66515de230ea1caf68a10060,ada yang balik kesini di ini rewind terbagus n...
...,...,...
24311,66515e2230ea1caf68a16f4c,first
24312,66515e2230ea1caf68a16f4d,woe
24314,66515e2230ea1caf68a16f54,keren
24315,66515e2230ea1caf68a16f55,mantep


#### Memeriksa panjang teks pada kolom 'text' dan menyimpan hanya baris dengan panjang antara 5 sampai 200 karakter

In [38]:
df = df[(df['textOriginal'].str.len() >= 5) & (df['textOriginal'].str.len() <= 200)]

df

Unnamed: 0,_id,textOriginal
0,66515de230ea1caf68a1005c,masih the best di hati oe
2,66515de230ea1caf68a1005e,scene yg gw suka
3,66515de230ea1caf68a1005f,diantara semua youtube rewind tahun ini yg pal...
4,66515de230ea1caf68a10060,ada yang balik kesini di ini rewind terbagus n...
5,66515de230ea1caf68a10061,masih ada bryan disana
...,...,...
24306,66515e2230ea1caf68a16f45,anjas
24311,66515e2230ea1caf68a16f4c,first
24314,66515e2230ea1caf68a16f54,keren
24315,66515e2230ea1caf68a16f55,mantep


### Memilih hanya 10000 data sebagai sampel dari keseluruhan dataset yang telah bersih

In [39]:
# Memilih 10.000 data secara acak
df_sampled = df.sample(n=10000, random_state=42)

df_sampled

Unnamed: 0,_id,textOriginal
16564,66515e0730ea1caf68a149f5,judul lagu di menit apa si ka
4413,66515de830ea1caf68a11339,rewind ter the best ini sih nice
16300,66515e0630ea1caf68a14896,ini lagu nya apa aja ada yang tau
5631,66515deb30ea1caf68a1188b,andai
23371,66515e1f30ea1caf68a16aa2,anjir keren gilak youtubers indonesia
...,...,...
13942,66515dfb30ea1caf68a13d7a,rachel godard mana
1565,66515de430ea1caf68a10705,kangen banget gua
16185,66515e0530ea1caf68a14812,kak chandra dan kak laurentius beatbox nya ker...
10095,66515df230ea1caf68a12c26,we mis lord rando


### Normalisasi

In [40]:
norm = {
    " yg ": " yang ",
    " kren ": " keren",
    " sdh ": " sudah ",
    " dgn ": " dengan ",
    " tdk ": " tidak ",
    " dlm ": " dalam ",
    " dl ": " dulu ",
    " gw ": " saya ",
    " kok ": " kenapa ",
    " bs ": " bisa ",
    " bgt ": " sangat ",
    " njir ": " kagum ",
    " anjir ": " kagum ",
    " g ": " tidak ",
    " lg ": " lagi ",
    " dr ": " dari ",
    " tp ": " tapi ",
    " jg ": " juga ",
    " sm ": " sama ",
    " krn ": " karena ",
    " spt ": " seperti ",
    " yg ": " yang ",
    " pd ": " pada ",
    " dn ": " dan ",
    " d ": " di ",
    " kt ": " kita ",
    " km ": " kamu ",
    " ms ": " masih ",
    " tlg ": " tolong ",
    " sja ": " saja ",
    " sy ": " saya ",
    " sdikit ": " sedikit ",
    " dket ": " dekat ",
    " lbh ": " lebih ",
    " sj ": " saja ",
    " bnyk ": " banyak ",
    " dgnnya ": " dengannya ",
    " sgala ": " segala ",
    " skrg ": " sekarang ",
    " nmr ": " nomor ",
    " srg ": " sering ",
    " ttg ": " tentang ",
    " smua ": " semua ",
    " trus ": " terus ",
    " pdhl ": " padahal ",
    " kt ": " kita ",
    " dkt ": " dekat ",
    " jg ": " juga ",
    " dlm ": " dalam ",
    " bhw ": " bahwa ",
    " sja ": " saja ",
    " mrk ": " mereka ",
    " sgth ": " sangat ",
    " dn ": " dan ",
    " trlalu ": " terlalu ",
    " msing2 ": " masing-masing ",
    " brp ": " berapa ",
    " aq ": " aku ",
    " smpe ": " sampai ",
    " kalo ": " kalau ",
    " bkn ": " bukan ",
    " jd ": " jadi ",
    " dket ": " dekat ",
    " ntr ": " nanti ",
    " klo ": " kalau ",
    " sda ": " sudah ",
    " sy ": " saya ",
    " kpn ": " kapan ",
    " dmn ": " dimana ",
    " lgsng ": " langsung ",
    " tdk ": " tidak ",
    " pgi ": " pagi ",
    " ngga ": " tidak ",
    " gak ": " tidak ",
    " lbih ": " lebih ",
    " cm ": " cuman ",
    " sgt ": " sangat ",
    " aj ": " saja ",
    " apa2 ": " apa-apa ",
    " krg ": " kurang ",
    " ngerti ": " mengerti ",
    " slalu ": " selalu ",
    " bbrp ": " beberapa ",
    " smpe ": " sampai ",
    " krja ": " kerja ",
    " bbrp ": " beberapa ",
    " jln ": " jalan ",
    " hr ": " hari ",
    " kok ": " kenapa ",
    " mgkin ": " mungkin "
    # Tambahkan aturan normalisasi lainnya di sini sesuai kebutuhan
}

#### Fungsi normalisasi

In [41]:
def normalisasi(text):
    for i in norm:
        text = text.replace(i, norm[i])
    return text

In [42]:
df_sampled.loc[:, 'textOriginal'] = df_sampled['textOriginal'].apply(normalisasi)

df_sampled

Unnamed: 0,_id,textOriginal
16564,66515e0730ea1caf68a149f5,judul lagu di menit apa si ka
4413,66515de830ea1caf68a11339,rewind ter the best ini sih nice
16300,66515e0630ea1caf68a14896,ini lagu nya apa aja ada yang tau
5631,66515deb30ea1caf68a1188b,andai
23371,66515e1f30ea1caf68a16aa2,anjir keren gilak youtubers indonesia
...,...,...
13942,66515dfb30ea1caf68a13d7a,rachel godard mana
1565,66515de430ea1caf68a10705,kangen banget gua
16185,66515e0530ea1caf68a14812,kak chandra dan kak laurentius beatbox nya ker...
10095,66515df230ea1caf68a12c26,we mis lord rando


## Stopwords

In [43]:
import Sastrawi

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

more_stop_words = ["tidak"]

stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(more_stop_words)

new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

def stopword(str_text):
  str_text = stop_words_remover_new.remove(str_text)
  return str_text

In [44]:
df_sampled.loc[:, 'textOriginal'] = df_sampled['textOriginal'].apply(stopword)

df_sampled

Unnamed: 0,_id,textOriginal
16564,66515e0730ea1caf68a149f5,judul lagu menit apa si ka
4413,66515de830ea1caf68a11339,rewind ter the best sih nice
16300,66515e0630ea1caf68a14896,lagu nya apa aja yang tau
5631,66515deb30ea1caf68a1188b,andai
23371,66515e1f30ea1caf68a16aa2,anjir keren gilak youtubers indonesia
...,...,...
13942,66515dfb30ea1caf68a13d7a,rachel godard mana
1565,66515de430ea1caf68a10705,kangen banget gua
16185,66515e0530ea1caf68a14812,kak chandra kak laurentius beatbox nya keren b...
10095,66515df230ea1caf68a12c26,we mis lord rando


## Tokenisasi
#### Memisahkan kalimat menjadi kata-kata

In [45]:
tokenized = df_sampled['textOriginal'].apply(lambda x:x.split())
tokenized

16564                    [judul, lagu, menit, apa, si, ka]
4413                   [rewind, ter, the, best, sih, nice]
16300                     [lagu, nya, apa, aja, yang, tau]
5631                                               [andai]
23371          [anjir, keren, gilak, youtubers, indonesia]
                               ...                        
13942                               [rachel, godard, mana]
1565                                 [kangen, banget, gua]
16185    [kak, chandra, kak, laurentius, beatbox, nya, ...
10095                               [we, mis, lord, rando]
2790     [anjir, jaman, jamanya, maen, warnet, ampe, gk...
Name: textOriginal, Length: 10000, dtype: object

## Stemming
#### Mengubah kata berimbuhan menjadi kata dasar

In [46]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Inisialisasi counter
counter = 1

def stemming(text_cleaning):
    global counter
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    do = []
    for w in text_cleaning:
        dt = stemmer.stem(w)
        do.append(dt)
    d_clean = []
    d_clean = " ".join(do)
    print(f"{counter}: {d_clean}") 
    counter += 1
    return d_clean

Proses berikut agak lama karena data besar

In [47]:
tokenized = tokenized.apply(stemming)
# tokenized.to_csv("tokenized_comments.csv", index=False)
tokenized

1: judul lagu menit apa si ka
2: rewind ter the best sih nice
3: lagu nya apa aja yang tau
4: andai
5: anjir keren gilak youtubers indonesia
6: harus charlie charlie pencil
7: sini azka jadi bocah tk sekarang udah besar bgt
8: lagu detik
9: yang punya lagu gk
10: nungu youtube rewind pas nonton baru nyadar kalau bang arief muhamad
11: wow seru banget video kompak
12: am only one who always watching this video al the time
13: wow kerensnjshshsjsh aku suka banget
14: gw kyk lihat rich brian
15: love this rewind nice work guys
16: sukses terus buat kalian ea kak bangun semangat anak muda sekarang salam saya lestary cilacap
17: judul musik apa nama suka banget ama musik tau tolomg comen
18: baju pink nesie nama chanelnya nasie judge
19: nonton di bulan november mana nie gimana yak fix bakal backsound entah apa rasuk
20: baju pink mana anying
21: udah nonton ga bosen
22: yg baru nonton tahun
23: amazing wonderful guys
24: baru nyadar briab imanuel wkwkw
25: wo mantep sekale
26: siapa nonton

16564                           judul lagu menit apa si ka
4413                          rewind ter the best sih nice
16300                            lagu nya apa aja yang tau
5631                                                 andai
23371                anjir keren gilak youtubers indonesia
                               ...                        
13942                                   rachel godard mana
1565                                     kangen banget gua
16185    kak chandra kak laurentius beatbox nya keren b...
10095                                    we mis lord rando
2790     anjir jaman jamanya maen warnet ampe gk inget ...
Name: textOriginal, Length: 10000, dtype: object

In [48]:
tokenized

16564                           judul lagu menit apa si ka
4413                          rewind ter the best sih nice
16300                            lagu nya apa aja yang tau
5631                                                 andai
23371                anjir keren gilak youtubers indonesia
                               ...                        
13942                                   rachel godard mana
1565                                     kangen banget gua
16185    kak chandra kak laurentius beatbox nya keren b...
10095                                    we mis lord rando
2790     anjir jaman jamanya maen warnet ampe gk inget ...
Name: textOriginal, Length: 10000, dtype: object

#### Mengimpan Tokenized Comments ke MongoDB Local

In [49]:
# Koneksi ke MongoDB
client = MongoClient(local_url)
db = client[database_name]
collection = db[collection_name]

# Konversi tokenized menjadi list of dictionaries
data_to_insert = [{'textOriginal': text} for text in tokenized]

# Masukkan data ke dalam koleksi MongoDB
collection.insert_many(data_to_insert)

print("Data berhasil diimpor ke MongoDB Local.")

Data berhasil diimpor ke MongoDB Local.
