### **Import Libraries**

In [2]:
import re
import nltk
import time
import spacy
import string
import swifter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from wordcloud import WordCloud
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectPercentile, mutual_info_classif
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Silence library warnings so they do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides pandas' SettingWithCopyWarning and
# FutureWarning messages; consider narrowing it while developing.
import warnings
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed (no column truncation).
# Uses the documented top-level pd.set_option instead of the incidental
# `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [5]:
# Load the labelled tweet dataset from CSV; the file is decoded as Latin-1.
filename = '../database/datasetpemilu2024.csv'
df = pd.read_csv(filename, encoding = 'latin-1')
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,label,username,tweet
0,positif,UmumTahun,siapa calon presiden dan wakil presiden 2024
1,positif,NurlinaPermata,Sepertinya sinyal dukungan untuk Pak Prabowo s...
2,positif,SintaDamay12,"Keren banget, Pak Prabowo didampingi juga sama..."
3,positif,aziz_jordan,"@cnbcindonesia Yg penting , ekonomi nasional a..."
4,positif,tvOneNews,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...
...,...,...,...
995,negatif,AD1T_RG,"Mau sistem terbuka at tertutup sama saja, wong..."
996,negatif,VirgoMan333,Jelas sekali si Kodok Bau udah bersiap curang ...
997,negatif,CarloLian3,@conan_idn @bawaslu_RI Kakitangan rezim yaa be...
998,negatif,ayhasbi0,"@musniumar Rektor kok goblok ,, emang nya anis..."


In [7]:
# Keep only the 'label' and 'tweet' columns. The explicit .copy() makes the result an
# independent DataFrame, so the column assignments that follow write to a real frame
# rather than to a slice of the loaded data (avoids SettingWithCopyWarning).
df = df[['label', 'tweet']].copy()

# Encode the labels numerically: 'positif' -> 1 and 'negatif' -> 0.
df['label'] = df['label'].replace({'positif': 1, 'negatif': 0})

# Display the resulting DataFrame.
df

Unnamed: 0,label,tweet
0,1,siapa calon presiden dan wakil presiden 2024
1,1,Sepertinya sinyal dukungan untuk Pak Prabowo s...
2,1,"Keren banget, Pak Prabowo didampingi juga sama..."
3,1,"@cnbcindonesia Yg penting , ekonomi nasional a..."
4,1,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...
...,...,...
995,0,"Mau sistem terbuka at tertutup sama saja, wong..."
996,0,Jelas sekali si Kodok Bau udah bersiap curang ...
997,0,@conan_idn @bawaslu_RI Kakitangan rezim yaa be...
998,0,"@musniumar Rektor kok goblok ,, emang nya anis..."


### **Preprocessing Data**

Cleaning

In [8]:
def clean_text(text):
    """Normalise a raw tweet into lower-case, space-separated ASCII words.

    Steps, in order: lower-case the text; drop @mentions; drop URLs, hashtags
    and HTML-like tags; drop stand-alone single letters; turn every whitespace
    character (including Unicode ones such as the non-breaking space) into a
    plain space; drop every character that is not a-z or a space; collapse
    repeated spaces.

    NOTE(review): HTML entities are not decoded, so '&amp;' leaves the token
    'amp' behind.

    Args:
        text: The raw tweet. Non-string values (for example NaN coming from an
            empty CSV cell) are treated as an empty tweet.

    Returns:
        str: The cleaned text; may be an empty string.
    """
    # A missing tweet would otherwise crash on .lower().
    if not isinstance(text, str):
        return ''

    # Lower-case first so the later filters only need to consider a-z.
    text = text.lower()
    # Remove mentions.
    text = re.sub(r"@\S+", "", text)
    # Remove URLs, hashtags and HTML-like tags (mentions are already gone).
    text = re.sub(r'(http\S+|#\w+|<.*?>)', '', text)
    # Remove stand-alone single letters.
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    # Map every whitespace character to a plain space BEFORE filtering, so a
    # Unicode space keeps acting as a word separator instead of being deleted
    # and gluing the neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    # Remove digits, punctuation, symbols, emoji and any other non-letter.
    text = re.sub(r'[^a-z ]', '', text)
    return ' '.join(text.split())

# Clean every tweet in the DataFrame and store the result in a new 'cleaned' column
# (this runs on all rows; no train/test split has been made in the cells above).
df['cleaned'] = df['tweet'].apply(clean_text)

# Show the first five rows.
df.head()

Unnamed: 0,label,tweet,cleaned
0,1,siapa calon presiden dan wakil presiden 2024,siapa calon presiden dan wakil presiden
1,1,Sepertinya sinyal dukungan untuk Pak Prabowo s...,sepertinya sinyal dukungan untuk pak prabowo s...
2,1,"Keren banget, Pak Prabowo didampingi juga sama...",keren banget pak prabowo didampingi juga sama ...
3,1,"@cnbcindonesia Yg penting , ekonomi nasional a...",yg penting ekonomi nasional aman amp lancar bu...
4,1,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...,sri mulyani pemilu mendorong aktivitas perekon...


Tokenizing

In [9]:
def tokenize_text(text):
    """Split a text into a list of tokens with a regular-expression tokenizer.

    The pattern tries, in order: a run of word characters, a dollar sign
    followed by digits, or any other run of non-whitespace characters.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``RegexpTokenizer.tokenize``.
    """
    token_pattern = r'\w+|\$[0-9]+|\S+'
    return RegexpTokenizer(token_pattern).tokenize(text)

# Tokenize the cleaned text and store the token lists in a new 'tokenized' column.
df['tokenized'] = df['cleaned'].apply(tokenize_text)

# Show the first five rows.
df.head()

Unnamed: 0,label,tweet,cleaned,tokenized
0,1,siapa calon presiden dan wakil presiden 2024,siapa calon presiden dan wakil presiden,"[siapa, calon, presiden, dan, wakil, presiden]"
1,1,Sepertinya sinyal dukungan untuk Pak Prabowo s...,sepertinya sinyal dukungan untuk pak prabowo s...,"[sepertinya, sinyal, dukungan, untuk, pak, pra..."
2,1,"Keren banget, Pak Prabowo didampingi juga sama...",keren banget pak prabowo didampingi juga sama ...,"[keren, banget, pak, prabowo, didampingi, juga..."
3,1,"@cnbcindonesia Yg penting , ekonomi nasional a...",yg penting ekonomi nasional aman amp lancar bu...,"[yg, penting, ekonomi, nasional, aman, amp, la..."
4,1,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...,sri mulyani pemilu mendorong aktivitas perekon...,"[sri, mulyani, pemilu, mendorong, aktivitas, p..."


Normalization

In [14]:
# Read the normalisation table from CSV (decoded as Latin-1).
# The first column holds the term to look up, the second column its replacement.
normalized_word = pd.read_csv('./machine/preprocessing/normalisasi.csv', encoding='latin1')

# Build the lookup dictionary from the two columns.
# Columns are addressed by position with .iloc because integer keys on a
# label-indexed row (row[0], row[1]) are deprecated in pandas.
# setdefault keeps the first mapping when a term appears more than once.
normalized_word_dict = {}
for raw_term, replacement in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(raw_term, replacement)
# Normalisation function
def normalized_term(document):
    """Replace each token that has an entry in ``normalized_word_dict``.

    Args:
        document: An iterable of tokens.

    Returns:
        list: The tokens in their original order, where every token found in
        ``normalized_word_dict`` is swapped for its mapped value and every
        other token is kept unchanged.
    """
    normalized_tokens = []
    for term in document:
        # Fall back to the token itself when no mapping exists.
        normalized_tokens.append(normalized_word_dict.get(term, term))
    return normalized_tokens

# Apply the normalisation function to the 'tokenized' column and store the
# result in a new 'normalized' column.
df['normalized'] = df['tokenized'].apply(normalized_term)

# Show the first five rows.
df.head()

Unnamed: 0,label,tweet,cleaned,tokenized,normalized,stopwords
0,1,siapa calon presiden dan wakil presiden 2024,siapa calon presiden dan wakil presiden,"[siapa, calon, presiden, dan, wakil, presiden]","[siapa, calon, presiden, dan, wakil, presiden]","[siapa, calon, presiden, wakil, presiden]"
1,1,Sepertinya sinyal dukungan untuk Pak Prabowo s...,sepertinya sinyal dukungan untuk pak prabowo s...,"[sepertinya, sinyal, dukungan, untuk, pak, pra...","[sepertinya, sinyal, dukungan, untuk, bapak, p...","[sepertinya, sinyal, dukungan, bapak, prabowo,..."
2,1,"Keren banget, Pak Prabowo didampingi juga sama...",keren banget pak prabowo didampingi juga sama ...,"[keren, banget, pak, prabowo, didampingi, juga...","[keren, banget, bapak, prabowo, didampingi, ju...","[keren, banget, bapak, prabowo, didampingi, sa..."
3,1,"@cnbcindonesia Yg penting , ekonomi nasional a...",yg penting ekonomi nasional aman amp lancar bu...,"[yg, penting, ekonomi, nasional, aman, amp, la...","[yang, penting, ekonomi, nasional, aman, amp, ...","[penting, ekonomi, nasional, aman, amp, lancar..."
4,1,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...,sri mulyani pemilu mendorong aktivitas perekon...,"[sri, mulyani, pemilu, mendorong, aktivitas, p...","[sri, mulyani, pemilihan, mendorong, aktivitas...","[sri, mulyani, pemilihan, mendorong, aktivitas..."


Stopword Removal

In [15]:
# Read the CSV file that contains the additional stop words.
# Only the first row of the 'stopwords' column is used and it is split on single spaces
# (presumably the file stores all words on one space-separated line — TODO confirm).
txt_stopword = pd.read_csv('./machine/preprocessing/stopword.csv', header=None, names=["stopwords"])
additional_stopwords = list(txt_stopword["stopwords"][0].split(' '))

# Get the stop-word list that ships with Sastrawi.
factory = StopWordRemoverFactory()
stopword_sastrawi = factory.get_stop_words()

# Combine the Sastrawi list, the words from the CSV file and a hand-picked list of extra
# noise tokens into one stop-word list. Some entries (e.g. "lah", "apa") appear twice in
# the literal below; the list is only used for membership tests in filter_stopwords.
stopword = stopword_sastrawi + additional_stopwords + ["petrus", "sech", "bulakparen", "dcs", "mug","apa", "dkk", "kek","bla","nihhh","nyinyir",
            "background2","nya", "klik", "nih", "wah", "bd","cie", "wahh", "gtgt", "wkwkw", "grgr", "thun", "dong", "mkmk","gp","brengkelan","woi",
            "twit", "iii", "08alian", "wkwkwkwk", "wkwk","wkwkwk", "ah", "ampnbsp", "bawaslu", "hihihi", "hihi", "eh", "ng","dl","do","kwkwkwkk",
            "ltpgtampnbspltpgt", "dancukkk", "yach", "kepl", "wow","kretek", "woww", "smpn", "hmmmm", "hehe", "oooiii","onana","kjaernett",
            "hahaha", "ppp", "nek", "rang", "tuh", "pls", "otw", "pas","haha", "ha", "hahahahaha", "hahahasenget","wakakakakak","wkwkwkw",
            "xixixixi", "hehehehee", "nder", "aduuuhhh", "lah","lah", "deh", "si", "kan", "njirrrr", "huehehee","yoongi","sulli","bjir",
            "hehehe", "yahh", "yah", "loh", "elo", "gw", "didkgkl","sih", "lu", "yeyeye", "dlllllllllll", "se","yoon","de","ruu","apeeeeee",
            "pisss", "yo", "kok", "nge", "wkwkkw", "dah", "wahhh", "apa", "btw", "kwkwkwkwk", "nahh", "nah", "iya"]

def filter_stopwords(text):
    """Drop every token that appears in the module-level ``stopword`` list.

    Args:
        text: An iterable of tokens.

    Returns:
        list: The tokens that are not stop words, in their original order.
    """
    kept_words = []
    for word in text:
        if word in stopword:
            continue
        kept_words.append(word)
    return kept_words

# Remove stop words from the normalised tokens and store the result in a new
# 'stopwords' column (the column holds the tokens that remain, not the removed words).
df['stopwords'] = df['normalized'].apply(filter_stopwords)

# Show the first five rows.
df.head()

Unnamed: 0,label,tweet,cleaned,tokenized,normalized,stopwords
0,1,siapa calon presiden dan wakil presiden 2024,siapa calon presiden dan wakil presiden,"[siapa, calon, presiden, dan, wakil, presiden]","[siapa, calon, presiden, dan, wakil, presiden]","[siapa, calon, presiden, wakil, presiden]"
1,1,Sepertinya sinyal dukungan untuk Pak Prabowo s...,sepertinya sinyal dukungan untuk pak prabowo s...,"[sepertinya, sinyal, dukungan, untuk, pak, pra...","[sepertinya, sinyal, dukungan, untuk, bapak, p...","[sepertinya, sinyal, dukungan, bapak, prabowo,..."
2,1,"Keren banget, Pak Prabowo didampingi juga sama...",keren banget pak prabowo didampingi juga sama ...,"[keren, banget, pak, prabowo, didampingi, juga...","[keren, banget, bapak, prabowo, didampingi, ju...","[keren, banget, bapak, prabowo, didampingi, sa..."
3,1,"@cnbcindonesia Yg penting , ekonomi nasional a...",yg penting ekonomi nasional aman amp lancar bu...,"[yg, penting, ekonomi, nasional, aman, amp, la...","[yang, penting, ekonomi, nasional, aman, amp, ...","[penting, ekonomi, nasional, aman, amp, lancar..."
4,1,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...,sri mulyani pemilu mendorong aktivitas perekon...,"[sri, mulyani, pemilu, mendorong, aktivitas, p...","[sri, mulyani, pemilihan, mendorong, aktivitas...","[sri, mulyani, pemilihan, mendorong, aktivitas..."


Stemming

In [16]:
# Create the Sastrawi stemmer object.
# Note: this rebinds `factory`, which previously referred to the StopWordRemoverFactory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem a single word
def stemmed_wrapper(term):
    """Return the stem of one word using the module-level ``stemmer``.

    Args:
        term: The word to stem.

    Returns:
        Whatever ``stemmer.stem`` returns for ``term``.
    """
    stem = stemmer.stem(term)
    return stem

# Stem every token of a document
def stemming_text(text):
    """Apply ``stemmed_wrapper`` to each token of a document.

    Args:
        text: An iterable of tokens.

    Returns:
        list: One stemmed token per input token, in the original order.
    """
    return list(map(stemmed_wrapper, text))

# Apply stemming_text to the 'stopwords' column and store the result in a new
# 'stemmed' column.
df['stemmed'] = df['stopwords'].apply(stemming_text)

# Show the first five rows.
df.head()

Unnamed: 0,label,tweet,cleaned,tokenized,normalized,stopwords,stemmed
0,1,siapa calon presiden dan wakil presiden 2024,siapa calon presiden dan wakil presiden,"[siapa, calon, presiden, dan, wakil, presiden]","[siapa, calon, presiden, dan, wakil, presiden]","[siapa, calon, presiden, wakil, presiden]","[siapa, calon, presiden, wakil, presiden]"
1,1,Sepertinya sinyal dukungan untuk Pak Prabowo s...,sepertinya sinyal dukungan untuk pak prabowo s...,"[sepertinya, sinyal, dukungan, untuk, pak, pra...","[sepertinya, sinyal, dukungan, untuk, bapak, p...","[sepertinya, sinyal, dukungan, bapak, prabowo,...","[seperti, sinyal, dukung, bapak, prabowo, maki..."
2,1,"Keren banget, Pak Prabowo didampingi juga sama...",keren banget pak prabowo didampingi juga sama ...,"[keren, banget, pak, prabowo, didampingi, juga...","[keren, banget, bapak, prabowo, didampingi, ju...","[keren, banget, bapak, prabowo, didampingi, sa...","[keren, banget, bapak, prabowo, damping, sama,..."
3,1,"@cnbcindonesia Yg penting , ekonomi nasional a...",yg penting ekonomi nasional aman amp lancar bu...,"[yg, penting, ekonomi, nasional, aman, amp, la...","[yang, penting, ekonomi, nasional, aman, amp, ...","[penting, ekonomi, nasional, aman, amp, lancar...","[penting, ekonomi, nasional, aman, amp, lancar..."
4,1,Sri Mulyani: Pemilu 2024 Mendorong Aktivitas P...,sri mulyani pemilu mendorong aktivitas perekon...,"[sri, mulyani, pemilu, mendorong, aktivitas, p...","[sri, mulyani, pemilihan, mendorong, aktivitas...","[sri, mulyani, pemilihan, mendorong, aktivitas...","[sri, mulyani, pilih, dorong, aktivitas, ekono..."


In [17]:
# Join each row's stemmed tokens back into one space-separated string and
# store it in the 'tweet_clean' column.
df['tweet_clean'] = df['stemmed'].map(' '.join)

# Keep only the label and the cleaned text.
output_columns = ['label', 'tweet_clean']
df = df[output_columns]

# Display the result.
df

Unnamed: 0,label,tweet_clean
0,1,siapa calon presiden wakil presiden
1,1,seperti sinyal dukung bapak prabowo makin kuat...
2,1,keren banget bapak prabowo damping sama pangli...
3,1,penting ekonomi nasional aman amp lancar bu ap...
4,1,sri mulyani pilih dorong aktivitas ekonomi baik
...,...,...
995,0,mau sistem buka at tutup sama wong jurdil alia...
996,0,jelas sekali kodok bau siap curang main kasar ...
997,0,kakitangan rezim yaa begini laku percaya pilih...
998,0,rektor bodoh emang anis truk kuat nanjak ganja...


In [18]:
# Save the preprocessed data to CSV without writing the index column.
# NOTE(review): a tweet whose text became an empty string is written as an empty field;
# verify how the code that reads this file back handles such rows.
df.to_csv('../database/dataClean.csv', index=False)