# Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

# Load Dataset

In [3]:
data = pd.read_csv('dataset_sirekap.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,content,sentiment
0,772,Tolong untuk fotonya di tingkatkan lagi kualit...,0
1,939,Aplikasi tidak bisa berjalan di HP ram 6/128 d...,-1
2,760,"Tolong untuk admin di perbaiki app nya, karena...",-1
3,962,Tolong perbaiki aplikasinya.. Mau log in aja s...,-1
4,438,"Aplikasinya ribet susah , udah diganti sandi p...",-1


# Text Preprocessing

## Case Folding

In [9]:
import re

# membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower()                             # ubah jadi huruf kecil
    text = re.sub(r'[-+]?[0-9]+', '', text)          # hapus angka
    text = re.sub(r'[^\w\s]', '', text)             # hapus simbol, tanda baca
    text = text.strip()
    return text

In [10]:
# membandingkan before dan after
raw_sample = data['content'].iloc[2]
case_fold = casefolding(raw_sample)

print('Raw Data\t : ', raw_sample)
print('Case Folding\t : ', case_fold)

Raw Data	 :  Tolong untuk admin di perbaiki app nya, karena di hp Tecno pova4 tidak bisa masuk. Keterangan nya gagal mendapatkan kunci digital. Tolong agar bisa segera di perbaiki karena ini menyangkut kerja kami PPS untuk membintek KPPS.
Case Folding	 :  tolong untuk admin di perbaiki app nya karena di hp tecno pova tidak bisa masuk keterangan nya gagal mendapatkan kunci digital tolong agar bisa segera di perbaiki karena ini menyangkut kerja kami pps untuk membintek kpps


## Filtering / Stopwords Removal

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('Indonesian')

In [12]:
# melihat seberapa banyak stopwords bahasa Indonesia
len(stopwords_ind)

758

In [13]:
# membuat fungsi stopwords removal
# menambahkan kata dalam stopwords
more_stopwords = ['sirekap', 'pova', 'tecno']
stopwords_ind = stopwords_ind + more_stopwords

def remove_stop_word(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)

In [14]:
# mencoba fungsi
raw_sample = data['content'].iloc[400]
case_fold = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_fold)

print('Raw Data\t : ', raw_sample)
print('Case Folding\t : ', case_fold)
print('Stopword Removal\t : ', stopword_removal)

Raw Data	 :  Lucu aplikasinya. Udah berhasil login . Pas digunakan malah keluar sendiri, terus pas mau login username dan password salah mulu, padahal udah sesuai dengan yang sebelumnya 🤣
Case Folding	 :  lucu aplikasinya udah berhasil login  pas digunakan malah keluar sendiri terus pas mau login username dan password salah mulu padahal udah sesuai dengan yang sebelumnya
Stopword Removal	 :  lucu aplikasinya udah berhasil login pas pas login username password salah mulu udah sesuai


## Stemming

In [15]:
!pip -q install sastrawi

In [18]:
# merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()


# membuat fungsi untuk stemming bahasa Indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text

In [19]:
# mencoba fungsi
raw_sample = data['content'].iloc[400]
case_fold = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_fold)
stemming_text = stemming(stopword_removal)

print('Raw Data\t : ', raw_sample)
print('Case Folding\t : ', case_fold)
print('Stopword Removal\t : ', stopword_removal)
print('Stemming Text\t : ', stemming_text)

Raw Data	 :  Lucu aplikasinya. Udah berhasil login . Pas digunakan malah keluar sendiri, terus pas mau login username dan password salah mulu, padahal udah sesuai dengan yang sebelumnya 🤣
Case Folding	 :  lucu aplikasinya udah berhasil login  pas digunakan malah keluar sendiri terus pas mau login username dan password salah mulu padahal udah sesuai dengan yang sebelumnya
Stopword Removal	 :  lucu aplikasinya udah berhasil login pas pas login username password salah mulu udah sesuai
Stemming Text	 :  lucu aplikasi udah hasil login pas pas login username password salah mulu udah sesuai


## Text Preprocessing Pipeline

In [21]:
# membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
    text = casefolding(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text

In [22]:
%%time
data['clean_content']=data['content'].apply(text_preprocessing_process)

CPU times: total: 6min 8s
Wall time: 6min 42s


In [23]:
data

Unnamed: 0.1,Unnamed: 0,content,sentiment,clean_content
0,772,Tolong untuk fotonya di tingkatkan lagi kualit...,0,tolong foto tingkat kualitas nya iya hasil fot...
1,939,Aplikasi tidak bisa berjalan di HP ram 6/128 d...,-1,aplikasi jalan hp ram dgn sinyal yg bagus yg s...
2,760,"Tolong untuk admin di perbaiki app nya, karena...",-1,tolong admin baik app nya hp masuk terang nya ...
3,962,Tolong perbaiki aplikasinya.. Mau log in aja s...,-1,tolong baik aplikasi log in aja sulit sinyal k...
4,438,"Aplikasinya ribet susah , udah diganti sandi p...",-1,aplikasi ribet susah udah ganti sandi pake sid...
...,...,...,...,...
995,0,"Aplikasi butut, herannya kpu dikasih waktu 5 t...",-1,aplikasi butut heran kpu kasih ni aplikasi has...
996,296,Made in pemerintah dari dulu gini mulu ya. Min...,-1,made in perintah gin mulu ya minus ga auto foc...
997,163,"Hmmm ocr nya menggelitik, datanya gak pas, di ...",-1,hmmm ocr nya gelitik data gak pas edit manual ...
998,507,Aplikasi jelek. Sekiranya mau bikin aplikasi y...,-1,aplikasi jelek bikin aplikasi bagus dana aplik...


In [24]:
# simpan data yang sudah dipreprocessing ke dalam file csv
data.to_csv('clean_data_sirekap.csv')

# Feature Engineering

In [25]:
# memisahkan kolom clean_content dengan sentiment
x = data['clean_content']
y = data['sentiment']

## Feature Extraction (TF - IDF)

In [28]:
# save model
import pickle

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform(x)

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))

In [29]:
# menampilkan vocabulary dari TF-IDF
vec_TF_IDF.vocabulary_

{'tolong': 2838,
 'foto': 811,
 'tingkat': 2819,
 'kualitas': 1377,
 'nya': 1896,
 'iya': 1122,
 'hasil': 983,
 'kamera': 1217,
 'burik': 416,
 'bgt': 317,
 'kaya': 1235,
 'downgrade': 686,
 'aplikasi': 125,
 'deh': 549,
 'beda': 266,
 'bawa': 253,
 'jalan': 1131,
 'hp': 1015,
 'ram': 2226,
 'dgn': 585,
 'sinyal': 2534,
 'yg': 3061,
 'bagus': 207,
 'salah': 2322,
 'codingan': 495,
 'putar': 2215,
 'pd': 2030,
 'admin': 19,
 'baik': 213,
 'app': 144,
 'masuk': 1604,
 'terang': 2742,
 'gagal': 831,
 'kunci': 1383,
 'digital': 595,
 'kerja': 1285,
 'pps': 2157,
 'membintek': 1638,
 'kpps': 1368,
 'log': 1501,
 'in': 1046,
 'aja': 29,
 'sulit': 2640,
 'kuat': 1378,
 'apakabar': 115,
 'sodara': 2580,
 'pelosok': 2046,
 'pakai': 1979,
 'kendala': 1275,
 'ribet': 2290,
 'susah': 2655,
 'udah': 2906,
 'ganti': 857,
 'sandi': 2341,
 'pake': 1981,
 'sidik': 2509,
 'jari': 1142,
 'muka': 1760,
 'tetep': 2788,
 'gak': 840,
 'diuninstal': 656,
 'ttp': 2878,
 'mohon': 1740,
 'mudah': 1755,
 'bantu':

In [31]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))

3071


In [32]:
# melihat feature apa saja yang ada dalam corpus
print(vec_TF_IDF.get_feature_names_out())

['aaja' 'aatu' 'abal' ... 'zaman' 'zonk' 'zoom']


In [33]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1, columns=vec_TF_IDF.get_feature_names_out())

data_tabular_tf_idf

Unnamed: 0,aaja,aatu,abal,abalabal,abis,about,acara,acc,acces,access,...,yg,yng,yo,yok,youtube,yt,yth,zaman,zonk,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.272526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

In [35]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [36]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# reduce features
print('Original Features Number\t : ', x_train.shape[1])
print('Reduced Features Number \t : ', x_kbest_features.shape[1])

Original Features Number	 :  3071
Reduced Features Number 	 :  3000


In [37]:
Data = pd.DataFrame(chi2_features.scores_, columns=['nilai'])
Data

Unnamed: 0,nilai
0,4.333876
1,8.881601
2,0.111369
3,0.051889
4,0.041695
...,...
3066,0.039104
3067,3.919449
3068,0.023294
3069,0.028259


In [38]:
# menampilkan data features beserta nilainya
feature = vec_TF_IDF.get_feature_names_out()
feature

Data['Fitur'] = feature
Data

Unnamed: 0,nilai,Fitur
0,4.333876,aaja
1,8.881601,aatu
2,0.111369,abal
3,0.051889,abalabal
4,0.041695,abis
...,...,...
3066,0.039104,yt
3067,3.919449,yth
3068,0.023294,zaman
3069,0.028259,zonk


In [39]:
# mengurutkan nilai feature terbaik
Data.sort_values(by='nilai', ascending=False)

Unnamed: 0,nilai,Fitur
2941,23.177307,unt
2430,15.054708,semangat
2239,14.426651,rating
1152,13.661296,jawab
2003,13.661296,pasdword
...,...,...
1504,0.004811,login
1143,0.003637,jaring
716,0.003182,edit
326,0.002483,bimtek


In [40]:
mask = chi2_features.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [41]:
# menampilkan feature yang terpilih berdasarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi_square
new_feature = []
for bool, f in zip(mask, feature):
    if bool:
        new_feature.append(f)
    selected_feature = new_feature
selected_feature

['aaja',
 'aatu',
 'abal',
 'abalabal',
 'abis',
 'acara',
 'acc',
 'acces',
 'access',
 'ad',
 'ada',
 'adain',
 'adakan',
 'adaptasi',
 'adauang',
 'adhoc',
 'adi',
 'adil',
 'admin',
 'aduh',
 'agenda',
 'agregator',
 'ah',
 'ahir',
 'ahli',
 'ai',
 'aing',
 'aj',
 'aja',
 'ajaa',
 'ajaaplikasinya',
 'ajah',
 'ajakualitas',
 'ajanyesel',
 'ajar',
 'akal',
 'akan',
 'akhlak',
 'akibat',
 'akretasi',
 'akses',
 'aksesterus',
 'aktif',
 'aktifasi',
 'aktifin',
 'aktifitas',
 'aktivasi',
 'akunudah',
 'akurat',
 'al',
 'ala',
 'alah',
 'alam',
 'alama',
 'alamat',
 'alami',
 'alangkah',
 'alas',
 'alat',
 'alesan',
 'alesannya',
 'algoritma',
 'algoritmanya',
 'alhamdulillah',
 'alhasil',
 'alias',
 'alih',
 'all',
 'allah',
 'alternatif',
 'ama',
 'aman',
 'amanah',
 'amat',
 'amatir',
 'ambigu',
 'ambil',
 'amburadul',
 'ampas',
 'ampe',
 'ampun',
 'ampunapa',
 'ampunpas',
 'an',
 'ana',
 'anak',
 'analis',
 'analisis',
 'ancur',
 'and',
 'anda',
 'andai',
 'andal',
 'andorid',
 'ando

In [42]:
# membuat vocabulary baru berdasarkan feature yang terseleksi
new_selected_feature = {}

for (k, v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k] = v
new_selected_feature

{'tolong': 2838,
 'foto': 811,
 'tingkat': 2819,
 'kualitas': 1377,
 'nya': 1896,
 'iya': 1122,
 'hasil': 983,
 'kamera': 1217,
 'burik': 416,
 'bgt': 317,
 'kaya': 1235,
 'downgrade': 686,
 'aplikasi': 125,
 'deh': 549,
 'beda': 266,
 'bawa': 253,
 'jalan': 1131,
 'hp': 1015,
 'dgn': 585,
 'sinyal': 2534,
 'yg': 3061,
 'salah': 2322,
 'codingan': 495,
 'putar': 2215,
 'pd': 2030,
 'admin': 19,
 'baik': 213,
 'app': 144,
 'masuk': 1604,
 'terang': 2742,
 'gagal': 831,
 'kunci': 1383,
 'digital': 595,
 'kerja': 1285,
 'pps': 2157,
 'membintek': 1638,
 'kpps': 1368,
 'log': 1501,
 'in': 1046,
 'aja': 29,
 'sulit': 2640,
 'kuat': 1378,
 'apakabar': 115,
 'sodara': 2580,
 'pelosok': 2046,
 'pakai': 1979,
 'kendala': 1275,
 'ribet': 2290,
 'susah': 2655,
 'udah': 2906,
 'ganti': 857,
 'sandi': 2341,
 'pake': 1981,
 'sidik': 2509,
 'jari': 1142,
 'muka': 1760,
 'tetep': 2788,
 'gak': 840,
 'diuninstal': 656,
 'ttp': 2878,
 'mohon': 1740,
 'mudah': 1755,
 'bantu': 230,
 'sertifikat': 2470,
 '

In [43]:
len(new_selected_feature)

3000

In [44]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav","wb"))

In [45]:
# menampilkan feature yang sudah diseleksi
data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aaja,aatu,abal,abalabal,abis,acara,acc,acces,access,ad,...,yg,yng,yo,yok,youtube,yt,yth,zaman,zonk,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.272526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelling

In [46]:
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
# import library
import random
from sklearn.model_selection import train_test_split

# import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB

# import algortima random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# import algoritma svm
from sklearn.svm import SVC

In [49]:
X = selected_x
y = data.sentiment

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [50]:
# menampilkan jumlah data_train dan data_test
print("Banyak x_train\t : ", len(x_train))
print("Banyak x_test\t : ", len(x_test))
print("Banyak y_train\t : ", len(y_train))
print("Banyak y_test\t : ", len(y_test))

Banyak x_train	 :  800
Banyak x_test	 :  200
Banyak y_train	 :  800
Banyak y_test	 :  200


In [53]:
# proses train dengan naive bayes
text_alg_nb = MultinomialNB()

In [54]:
model_nb = text_alg_nb.fit(x_train, y_train)

In [60]:
# membuat model prediksi
data_input = ("tolong foto tingkat kualitas nya iya hasil foto kamera burik bgt kaya downgrade aplikasi deh beda hasil kamera bawa")
data_input = text_preprocessing_process(data_input)

# load
tfIdf = TfidfVectorizer()
loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))
hasil = model_nb.predict(loaded_vec.fit_transform([data_input]))

# kondisi sederhana
if (hasil == 0):
    s = "Sentimen Netral"
elif (hasil == 1):
    s = "Sentimen Positif"
else:
    s = "Sentimen Negatif"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 Sentimen Negatif


# Evaluasi Model

In [62]:
# library
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model_nb.predict(x_test)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96       183
           0       0.00      0.00      0.00        13
           1       0.00      0.00      0.00         4

    accuracy                           0.92       200
   macro avg       0.30      0.33      0.32       200
weighted avg       0.84      0.92      0.87       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [63]:
# menyimpan model
pickle.dump(model_nb, open("model_sentimen.sav", "wb"))