# Preprocessing data Youtube Rewind 2020

#### Import library

In [1]:
from dotenv import load_dotenv
from pymongo import MongoClient
import os
import pandas as pd
import re

In [2]:
# Load the variables defined in the local .env file into the process environment
load_dotenv()

# Connection strings come from the environment; a missing variable yields None
mongodb_url = os.environ.get('URL_SANDY')
local_url = os.environ.get('URL_LOCAL')

In [3]:
# MongoDB database used by this notebook
database_name = "youtube_rewind_indonesia"
# Collection the input documents are read from
collection_from = "cleansing_2020"
# Collection the processed comments are written to
collection_name = "preprocessing_2020"

#### Import dataset dari MongoDB

In [4]:
# Connect to MongoDB; the context manager closes the client even if the query fails
with MongoClient(local_url) as client:
    # Select the database and the source collection
    db = client[database_name]
    collection = db[collection_from]

    # Query every document and materialise the cursor while the connection is open
    cursor = collection.find()
    data = list(cursor)

# Build the DataFrame from the fetched documents
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,_id,textOriginal
0,66516226b0b542363152a0f6,"We, from the Rewind Indonesia 2020 team respec..."
1,66516226b0b542363152a0f7,Afa ada
2,66516226b0b542363152a0f8,❤❤❤❤❤
3,66516226b0b542363152a0f9,Gw kangen 2020
4,66516226b0b542363152a0fa,Kerja bagus ini sunggnh bagus woh semoga di baca
...,...,...
244029,665165f5b0b542363157b7ad,6:40 merinding😶
244030,665165f5b0b542363157b7ae,WOII KEREN BANGEETTTTT
244031,665165f5b0b542363157b7af,Masih best rewind 2018
244032,665165f5b0b542363157b7b0,The epic moments #RI2020


#### Mengubah data text menjadi lowercase

In [6]:
# Overwrite the column with its lowercase form using the vectorised string accessor
df['textOriginal'] = df['textOriginal'].str.lower()

df

Unnamed: 0,_id,textOriginal
0,66516226b0b542363152a0f6,"we, from the rewind indonesia 2020 team respec..."
1,66516226b0b542363152a0f7,afa ada
2,66516226b0b542363152a0f8,❤❤❤❤❤
3,66516226b0b542363152a0f9,gw kangen 2020
4,66516226b0b542363152a0fa,kerja bagus ini sunggnh bagus woh semoga di baca
...,...,...
244029,665165f5b0b542363157b7ad,6:40 merinding😶
244030,665165f5b0b542363157b7ae,woii keren bangeettttt
244031,665165f5b0b542363157b7af,masih best rewind 2018
244032,665165f5b0b542363157b7b0,the epic moments #ri2020


#### Menghapus data dengan atribut text null

In [7]:
# Drop rows whose text is missing (NaN/None) or blank. Comparing against '' alone
# keeps missing values, which then break the string-based steps further down.
df = df[df['textOriginal'].notna() & (df['textOriginal'].str.strip() != '')]
df

Unnamed: 0,_id,textOriginal
0,66516226b0b542363152a0f6,"we, from the rewind indonesia 2020 team respec..."
1,66516226b0b542363152a0f7,afa ada
2,66516226b0b542363152a0f8,❤❤❤❤❤
3,66516226b0b542363152a0f9,gw kangen 2020
4,66516226b0b542363152a0fa,kerja bagus ini sunggnh bagus woh semoga di baca
...,...,...
244029,665165f5b0b542363157b7ad,6:40 merinding😶
244030,665165f5b0b542363157b7ae,woii keren bangeettttt
244031,665165f5b0b542363157b7af,masih best rewind 2018
244032,665165f5b0b542363157b7b0,the epic moments #ri2020


#### Menghapus baris yang nilainya hanya berupa angka pada kolom 'textOriginal'

In [8]:
# Keep only the rows whose text is not made up entirely of numeric characters
df = df.loc[~df['textOriginal'].str.isnumeric()]

df

Unnamed: 0,_id,textOriginal
0,66516226b0b542363152a0f6,"we, from the rewind indonesia 2020 team respec..."
1,66516226b0b542363152a0f7,afa ada
2,66516226b0b542363152a0f8,❤❤❤❤❤
3,66516226b0b542363152a0f9,gw kangen 2020
4,66516226b0b542363152a0fa,kerja bagus ini sunggnh bagus woh semoga di baca
...,...,...
244029,665165f5b0b542363157b7ad,6:40 merinding😶
244030,665165f5b0b542363157b7ae,woii keren bangeettttt
244031,665165f5b0b542363157b7af,masih best rewind 2018
244032,665165f5b0b542363157b7b0,the epic moments #ri2020


#### Fungsi untuk membersihkan teks dari beberapa pola yang umum ditemukan dalam data teks

In [9]:
def clean_data_text(text):
    """Strip noise from a single comment and return the cleaned string.

    Steps, in order: remove URLs, emoticons, @mentions, hashtags, digits and
    the retweet marker 'RT'; drop every character outside [A-Za-z0-9 ];
    remove one-character words; collapse runs of a repeated character to a
    single character; finally normalise whitespace.

    Args:
        text: The comment to clean (a str).

    Returns:
        The cleaned text with no leading, trailing or doubled spaces.

    Note:
        Collapsing repeated characters also shortens legitimate double
        letters (e.g. 'all' becomes 'al'); this matches the original behaviour.
    """
    # URLs go first: stripping digits/mentions/hashtags beforehand can empty the
    # URL body (e.g. 'https://123') so the pattern no longer matches it
    text = re.sub(r'https?://\S+', '', text)
    # Emoticons must be removed before punctuation is stripped, otherwise the
    # ':' / ';' / '=' they start with is already gone. The lookahead keeps
    # 'note:Penting' from being read as the emoticon ':P'.
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|[DP](?!\w))', '', text)
    # Remove @username mentions
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove the retweet marker; the word boundary keeps words that merely end
    # in 'RT' (e.g. 'SMART tv') intact
    text = re.sub(r'\bRT\s+', '', text)
    # Remove every character that is not an ASCII letter, digit or space
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    # Collapse whitespace so word boundaries are clean for the next step
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove words that consist of a single character
    text = re.sub(r'\b\w\b', '', text)
    # Collapse any run of the same character ('aa', 'bbb') to one occurrence
    text = re.sub(r'(.)\1+', r'\1', text)
    # Removing single-character words can leave stray spaces at the edges
    return re.sub(r'\s+', ' ', text).strip()

In [10]:
# Apply clean_data_text to every comment.
# df is the result of boolean filtering, so writing into it through .loc is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
# assign() builds a new frame instead of writing into the filtered slice.
df = df.assign(textOriginal=df['textOriginal'].apply(clean_data_text))

df

Unnamed: 0,_id,textOriginal
0,66516226b0b542363152a0f6,we from the rewind indonesia team respect al t...
1,66516226b0b542363152a0f7,afa ada
2,66516226b0b542363152a0f8,
3,66516226b0b542363152a0f9,gw kangen
4,66516226b0b542363152a0fa,kerja bagus ini sungnh bagus woh semoga di baca
...,...,...
244029,665165f5b0b542363157b7ad,merinding
244030,665165f5b0b542363157b7ae,woi keren banget
244031,665165f5b0b542363157b7af,masih best rewind
244032,665165f5b0b542363157b7b0,the epic moments


#### Fungsi untuk membersihkan teks dari kata yang hanya 1 huruf

In [11]:
def remove_single_letter_words(text):
    """Remove one-character words from ``text``.

    A one-character word is any single word character (letter, digit or
    underscore) with a word boundary on both sides. The gaps left behind are
    collapsed so the result has no doubled, leading or trailing spaces.

    Args:
        text: The string to clean.

    Returns:
        The text without one-character words, with whitespace normalised.
    """
    without_singles = re.sub(r'\b\w\b', '', text)
    # Deleting an interior word leaves two adjacent spaces; collapse them
    return re.sub(r'\s+', ' ', without_singles).strip()

In [12]:
# Apply remove_single_letter_words to every comment.
# assign() returns a new frame rather than writing into the boolean-filtered
# slice through .loc, which is the pattern behind SettingWithCopyWarning.
df = df.assign(textOriginal=df['textOriginal'].apply(remove_single_letter_words))

df

Unnamed: 0,_id,textOriginal
0,66516226b0b542363152a0f6,we from the rewind indonesia team respect al t...
1,66516226b0b542363152a0f7,afa ada
2,66516226b0b542363152a0f8,
3,66516226b0b542363152a0f9,gw kangen
4,66516226b0b542363152a0fa,kerja bagus ini sungnh bagus woh semoga di baca
...,...,...
244029,665165f5b0b542363157b7ad,merinding
244030,665165f5b0b542363157b7ae,woi keren banget
244031,665165f5b0b542363157b7af,masih best rewind
244032,665165f5b0b542363157b7b0,the epic moments


#### Memeriksa panjang teks pada kolom 'textOriginal' dan hanya menyimpan baris dengan panjang antara 5 sampai 200 karakter

In [13]:
# Keep comments whose length is within 5..200 characters (both bounds inclusive)
df = df[df['textOriginal'].str.len().between(5, 200)]

df

Unnamed: 0,_id,textOriginal
1,66516226b0b542363152a0f7,afa ada
3,66516226b0b542363152a0f9,gw kangen
4,66516226b0b542363152a0fa,kerja bagus ini sungnh bagus woh semoga di baca
5,66516226b0b542363152a0fb,tahun sudah terlewati
6,66516226b0b542363152a0fc,ga kerasa udah tahun lewat
...,...,...
244029,665165f5b0b542363157b7ad,merinding
244030,665165f5b0b542363157b7ae,woi keren banget
244031,665165f5b0b542363157b7af,masih best rewind
244032,665165f5b0b542363157b7b0,the epic moments


### Memilih hanya 10000 data sebagai sampel dari keseluruhan dataset yang telah bersih

In [14]:
# Draw up to 10,000 comments at random with a fixed seed for reproducibility.
# sample() raises ValueError when n exceeds the number of rows, so the size is
# capped at the number of rows that survived cleaning.
df_sampled = df.sample(n=min(10000, len(df)), random_state=42)

df_sampled

Unnamed: 0,_id,textOriginal
193354,66516508b0b542363156a91d,keren ending nya sedih
49227,665162eab0b542363153b8bb,apakah komen gw bisa kalian baca di antara rat...
29011,6651629cb0b542363153479d,kep it trending guys
16836,66516265b0b542363152f262,ngueri cok
160101,6651647fb0b542363155f59c,banget keren
...,...,...
134211,6651641fb0b5423631556e03,bagus banget
30559,665162a2b0b54236315350ed,days times
234890,665165c1b0b5423631578674,oke bagus ngak kayak yang taun kemaren mantap
70743,66516332b0b5423631542515,keren banget


### Normalisasi

In [15]:
# Slang/abbreviation -> standard word lookup used by the normalisation step.
# Keys and values are padded with one space on each side so that replacements
# act on whole words and keep neighbouring words separated.
norm = {
    " yg ": " yang ",
    " kren ": " keren ",  # trailing space was missing, which glued the next word on
    " sdh ": " sudah ",
    " dgn ": " dengan ",
    " tdk ": " tidak ",
    " dlm ": " dalam ",
    " dl ": " dulu ",
    " gw ": " saya ",
    " kok ": " kenapa ",
    " bs ": " bisa ",
    " bgt ": " sangat ",
    " njir ": " kagum ",
    " anjir ": " kagum ",
    " g ": " tidak ",
    " lg ": " lagi ",
    " dr ": " dari ",
    " tp ": " tapi ",
    " jg ": " juga ",
    " sm ": " sama ",
    " krn ": " karena ",
    " spt ": " seperti ",
    " pd ": " pada ",
    " dn ": " dan ",
    " d ": " di ",
    " kt ": " kita ",
    " km ": " kamu ",
    " ms ": " masih ",
    " tlg ": " tolong ",
    " sja ": " saja ",
    " sy ": " saya ",
    " sdikit ": " sedikit ",
    " dket ": " dekat ",
    " lbh ": " lebih ",
    " sj ": " saja ",
    " bnyk ": " banyak ",
    " dgnnya ": " dengannya ",
    " sgala ": " segala ",
    " skrg ": " sekarang ",
    " nmr ": " nomor ",
    " srg ": " sering ",
    " ttg ": " tentang ",
    " smua ": " semua ",
    " trus ": " terus ",
    " pdhl ": " padahal ",
    " dkt ": " dekat ",
    " bhw ": " bahwa ",
    " mrk ": " mereka ",
    " sgth ": " sangat ",
    " trlalu ": " terlalu ",
    " msing2 ": " masing-masing ",
    " brp ": " berapa ",
    " aq ": " aku ",
    " smpe ": " sampai ",
    " kalo ": " kalau ",
    " bkn ": " bukan ",
    " jd ": " jadi ",
    " ntr ": " nanti ",
    " klo ": " kalau ",
    " sda ": " sudah ",
    " kpn ": " kapan ",
    " dmn ": " dimana ",
    " lgsng ": " langsung ",
    " pgi ": " pagi ",
    " ngga ": " tidak ",
    " gak ": " tidak ",
    " lbih ": " lebih ",
    " cm ": " cuman ",
    " sgt ": " sangat ",
    " aj ": " saja ",
    " apa2 ": " apa-apa ",
    " krg ": " kurang ",
    " ngerti ": " mengerti ",
    " slalu ": " selalu ",
    " bbrp ": " beberapa ",
    " krja ": " kerja ",
    " jln ": " jalan ",
    " hr ": " hari ",
    " mgkin ": " mungkin "
    # Add further normalisation rules here as needed
}

#### Fungsi normalisasi

In [16]:
def normalisasi(text):
    """Replace slang/abbreviated words in ``text`` using the ``norm`` table.

    The text is split on whitespace and each word is looked up on its own.
    This also normalises words at the very start or end of the text and
    back-to-back repeats of the same slang word, both of which the
    space-padded substring replacement missed.

    Args:
        text: The comment to normalise.

    Returns:
        The words of ``text`` joined by single spaces, with every word that
        has an entry in ``norm`` replaced by its standard form.
    """
    # The table stores entries padded with spaces; strip them for word lookup
    lookup = {slang.strip(): formal.strip() for slang, formal in norm.items()}
    return ' '.join(lookup.get(word, word) for word in text.split())

In [17]:
# Run the slang normalisation over every sampled comment
df_sampled['textOriginal'] = df_sampled['textOriginal'].map(normalisasi)

df_sampled

Unnamed: 0,_id,textOriginal
193354,66516508b0b542363156a91d,keren ending nya sedih
49227,665162eab0b542363153b8bb,apakah komen saya bisa kalian baca di antara r...
29011,6651629cb0b542363153479d,kep it trending guys
16836,66516265b0b542363152f262,ngueri cok
160101,6651647fb0b542363155f59c,banget keren
...,...,...
134211,6651641fb0b5423631556e03,bagus banget
30559,665162a2b0b54236315350ed,days times
234890,665165c1b0b5423631578674,oke bagus ngak kayak yang taun kemaren mantap
70743,66516332b0b5423631542515,keren banget


## Stopwords

In [18]:
import Sastrawi

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Extra words to treat as stop words on top of the Sastrawi defaults
more_stop_words = ["tidak"]

# Default Sastrawi stop words followed by the extra ones
stop_words = StopWordRemoverFactory().get_stop_words() + more_stop_words

# Remover backed by the combined word list
new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

def stopword(str_text):
    """Return ``str_text`` with the configured stop words removed."""
    return stop_words_remover_new.remove(str_text)

In [19]:
# Remove stop words from every sampled comment
df_sampled['textOriginal'] = df_sampled['textOriginal'].map(stopword)

df_sampled

Unnamed: 0,_id,textOriginal
193354,66516508b0b542363156a91d,keren ending nya sedih
49227,665162eab0b542363153b8bb,komen bisa kalian baca antara ratusan ribu kom...
29011,6651629cb0b542363153479d,kep it trending guys
16836,66516265b0b542363152f262,ngueri cok
160101,6651647fb0b542363155f59c,banget keren
...,...,...
134211,6651641fb0b5423631556e03,bagus banget
30559,665162a2b0b54236315350ed,days times
234890,665165c1b0b5423631578674,oke bagus ngak kayak taun kemaren mantap
70743,66516332b0b5423631542515,keren banget


## Tokenisasi
#### Memisahkan kalimat menjadi kata-kata

In [20]:
# Split each comment on whitespace into a list of words
tokenized = df_sampled['textOriginal'].map(str.split)
tokenized

193354                          [keren, ending, nya, sedih]
49227     [komen, bisa, kalian, baca, antara, ratusan, r...
29011                             [kep, it, trending, guys]
16836                                         [ngueri, cok]
160101                                      [banget, keren]
                                ...                        
134211                                      [bagus, banget]
30559                                         [days, times]
234890     [oke, bagus, ngak, kayak, taun, kemaren, mantap]
70743                                       [keren, banget]
158441                                         [chefs, kis]
Name: textOriginal, Length: 10000, dtype: object

## Stemming
#### Mengubah kata berimbuhan menjadi kata dasar

In [21]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the stemmer once. Creating a new factory and stemmer inside the
# function repeated that setup for every single comment.
_stemmer = StemmerFactory().create_stemmer()

# Running number of the comment being processed; used only in the progress log
counter = 1

def stemming(text_cleaning):
    """Stem every word of a tokenised comment and join the result.

    Args:
        text_cleaning: An iterable of words (one tokenised comment).

    Returns:
        A single string with the stemmed words separated by spaces.

    Side effects:
        Prints the stemmed text prefixed by the global ``counter`` and then
        increments ``counter``.
    """
    global counter
    d_clean = " ".join(_stemmer.stem(w) for w in text_cleaning)
    print(f"{counter}: {d_clean}")
    counter += 1
    return d_clean

Proses berikut agak lama karena data besar

In [22]:
# Stem each tokenised comment; every row becomes one space-joined string
tokenized = tokenized.map(stemming)
# tokenized.to_csv("tokenized_comments.csv", index=False)
tokenized

1: keren ending nya sedih
2: komen bisa kalian baca antara ratus ribu komentarv
3: kep it trending guys
4: ngueri cok
5: banget keren
6: ini keren banget
7: ga bobon santoso
8: merinding nontonya awal akhir god job
9: nih video keren parah sih
10: mah rekor hari udah jt viewers
11: keren haru
12: wowbanyak pesan makna gua dapet gua rindu konten konten youtube sehat waras kaya dulu
13: tadi liat oto gledek auto dislike terus cok muslim auto ga jadi dislike malah like
14: merinding banget sih the best lah pokoknyadari youtubers fav selebgram bahkan tiktokers adaluv
15: gua kangen si arap
16: petcah
17: bagus banget merinding
18: rakyat malaysia dukung indonesia iaitu shabat sejati malaysia moga indonesia tetap maju
19: apa keren banget
20: trendingin lah
21: aku banga indonesia
22: singkat jelas banga jadi bagi indonesia
23: lagu nya apa
24: owmaygwad
25: sukses selalu buat kalian semua skinyindonesian
26: emg ga salah kalau project garap mantan member eclipse
27: dabest lah
28: amazing 

193354                               keren ending nya sedih
49227     komen bisa kalian baca antara ratus ribu komen...
29011                                  kep it trending guys
16836                                            ngueri cok
160101                                         banget keren
                                ...                        
134211                                         bagus banget
30559                                            days times
234890             oke bagus ngak kayak taun kemaren mantap
70743                                          keren banget
158441                                            chefs kis
Name: textOriginal, Length: 10000, dtype: object

In [23]:
tokenized

193354                               keren ending nya sedih
49227     komen bisa kalian baca antara ratus ribu komen...
29011                                  kep it trending guys
16836                                            ngueri cok
160101                                         banget keren
                                ...                        
134211                                         bagus banget
30559                                            days times
234890             oke bagus ngak kayak taun kemaren mantap
70743                                          keren banget
158441                                            chefs kis
Name: textOriginal, Length: 10000, dtype: object

#### Menyimpan Tokenized Comments ke MongoDB Local

In [24]:
# Koneksi ke MongoDB
client = MongoClient(local_url)
db = client[database_name]
collection = db[collection_name]

# Konversi tokenized menjadi list of dictionaries
data_to_insert = [{'textOriginal': text} for text in tokenized]

# Masukkan data ke dalam koleksi MongoDB
collection.insert_many(data_to_insert)

print("Data berhasil diimpor ke MongoDB Local.")

Data berhasil diimpor ke MongoDB Local.


## Translate

import data dari file csv

In [25]:
# from translate import Translator

# # Inisialisasi counter
# counter = 1

# def convert_eng(text):
#     global counter  # Menggunakan counter sebagai variabel global
#     translator = Translator(to_lang="en", from_lang="id")
#     translation = translator.translate(text)
#     print(f"{counter}: {translation}")  # Mencetak teks terjemahan dengan nomor urut
#     counter += 1  # Menambah nilai counter setiap kali fungsi dipanggil
#     return translation

In [26]:
# # data["tweet_english"] = data["text"].apply(convert_eng)
# data.to_csv("translate_comments.csv", index=False)