In [None]:
import pandas as pd
# Load the labelled comment dataset into a DataFrame.
DATA_PATH = 'Data Komentar DL 900.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 10 rows to check which columns were loaded.
data.head(10)

Unnamed: 0,KOMENTAR,SENTIMEN
0,Bagus. Salah satu pendekatan antropologi diken...,1
1,"Terimakasih, saya dapat informasi yg sangat be...",1
2,Terima kasih mahasiswa perwakilan dari Indones...,1
3,"bagus mas menteri, saya bangga",1
4,ide sangat cemerlang n patut didukung oleh sem...,1
5,bagus sih mari dicoba\\t,1
6,ayo sukseskan bersama kebijakan kampus merdeka...,1
7,luar biasa cara respon rencana solusi pelaksan...,1
8,"Suka bgt sama mba no 3, mewakilkan para anak P...",1
9,"ITUlah CARA yg paling baik n BERKWALITAS, MAJ...",1


**TEXT PREPROCESSING**

This process consists of:
1. Case Folding
2. Tokenization
3. Perbaikan Kata Tidak Baku (normalization of non-standard words)
4. Stopwords Removal
5. Stemming


In [None]:
#-- CASE FOLDING --
import string
import re #regex library

# Lowercase every comment up front; the cleaning function casef() does not change case itself.
# NOTE: this overwrites the original-case text stored in the KOMENTAR column.
data['KOMENTAR'] = data['KOMENTAR'].str.lower()

# Patterns are compiled once instead of on every call. Raw strings avoid the
# invalid-escape-sequence warnings that "\w" / "\s" raise in plain literals.
_MENTION_TAG_URL_RE = re.compile(r"[@#][A-Za-z0-9]+|\w+://\S+")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def casef(text):
    """Clean one raw comment string for tokenization.

    Steps, in order: drop escaped tab/newline/unicode markers and stray
    backslashes, replace non-ASCII characters, remove mentions, hashtags and
    URLs, remove digits, remove punctuation, and normalize whitespace.
    Case is not changed here.

    Parameters
    ----------
    text : str
        The comment to clean. Any non-string value (e.g. the NaN pandas uses
        for an empty CSV cell, or None) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned text with single spaces between words and no leading or
        trailing whitespace; "" for non-string input.
    """
    # Missing values would otherwise crash on .replace() below.
    if not isinstance(text, str):
        return ""
    # remove the two-character sequences \t, \n, \u and any remaining backslash
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # non-ASCII characters (emoticons, CJK text, etc.) become "?", which the
    # punctuation step below then deletes
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mentions, hashtags and complete URLs; split/join also collapses whitespace
    text = ' '.join(_MENTION_TAG_URL_RE.sub(" ", text).split())
    # remove URL schemes that were left without anything after them
    text = text.replace("http://", " ").replace("https://", " ")
    # remove numbers
    text = _DIGITS_RE.sub("", text)
    # remove punctuation; NOTE: it is deleted, not replaced by a space, so
    # "bagus,sekali" becomes "bagussekali"
    text = text.translate(_PUNCTUATION_TABLE)
    # remove leading and trailing whitespace
    text = text.strip()
    # collapse runs of whitespace into a single space
    text = _WHITESPACE_RE.sub(' ', text)
    return text

# Run the cleaning function over every lower-cased comment and store the result in a new column.
data['casefold'] = data['KOMENTAR'].apply(casef)
# Keep a handle on the cleaned column and display it as the cell output.
hasil_casefolding= data['casefold']
hasil_casefolding

0      bagus salah satu pendekatan antropologi dikena...
1      terimakasih saya dapat informasi yg sangat ber...
2      terima kasih mahasiswa perwakilan dari indones...
3                          bagus mas menteri saya bangga
4      ide sangat cemerlang n patut didukung oleh sem...
                             ...                        
995    klo negara mau harusnya pendidikan ditegakkan ...
996                   sayang sekali belum ada pemerataan
997    kampusnya banyak lapangan kerjanya dikit meman...
998    saya pesimis dengan mental birokrasi skrg untu...
999    risetnya ga bisa asal harus kualitas terbaik h...
Name: casefold, Length: 1000, dtype: object

In [None]:
# ------ Tokenizing ---------
import nltk
# 'punkt' serves older NLTK releases; recent ones (NLTK 3.9 and later) make
# word_tokenize load its models from the separate 'punkt_tab' resource.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 

# NLTK word tokenize 
def word_tokenize_wrapper(text):
  """Split one cleaned comment string into tokens using NLTK's word_tokenize."""
  tokens = word_tokenize(text)
  return tokens

# Tokenize every cleaned comment; each row of 'komen_tokens' then holds that comment's tokens.
data['komen_tokens'] = data['casefold'].apply(word_tokenize_wrapper)

# Show the first five tokenized comments as a sanity check.
print('Tokenizing Result : \n') 
print(data['komen_tokens'].head())
print('\n\n\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenizing Result : 

0    [bagus, salah, satu, pendekatan, antropologi, ...
1    [terimakasih, saya, dapat, informasi, yg, sang...
2    [terima, kasih, mahasiswa, perwakilan, dari, i...
3                  [bagus, mas, menteri, saya, bangga]
4    [ide, sangat, cemerlang, n, patut, didukung, o...
Name: komen_tokens, dtype: object






In [None]:
#-- PERBAIKAN KATA TIDAK BAKU --

# Lookup table for non-standard spellings: the first column holds the word as
# written in the comments, the second column its replacement.
normalized_word = pd.read_excel("kamus perbaikan kata.xlsx")

normalized_word_dict = {}

# Read the two columns by position with .iloc. Integer indexing on a
# label-indexed row (row[0], row[1]) is deprecated in pandas 2.x.
# setdefault keeps the first mapping when a word appears more than once.
for informal, standard in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(informal, standard)

def normalized_term(document):
    """Return the tokens of ``document`` with dictionary words swapped for their replacements.

    Tokens that have no entry in ``normalized_word_dict`` pass through unchanged.
    """
    repaired = []
    for term in document:
        repaired.append(normalized_word_dict.get(term, term))
    return repaired

# Replace non-standard words in every token list using the lookup dictionary.
data['komen_perbaikan'] = data['komen_tokens'].apply(normalized_term)

# Display the last 10 rows to inspect the result.
data['komen_perbaikan'].tail(10)

990    [magang, kelamaan, membuat, stres, juga, perus...
991    [saya, pesimis, dengan, mental, birokrasi, sek...
992    [kalau, cuma, pak, nadiem, saja, yang, revolus...
993    [harus, bagaimana, dan, tempat, magangnya, jug...
994    [kalau, untuk, yang, kuliah, sambil, kerja, ba...
995    [kalau, negara, mau, harusnya, pendidikan, dit...
996             [sayang, sekali, belum, ada, pemerataan]
997    [kampusnya, banyak, lapangan, kerjanya, sediki...
998    [saya, pesimis, dengan, mental, birokrasi, sek...
999    [risetnya, tidak, bisa, asal, harus, kualitas,...
Name: komen_perbaikan, dtype: object

In [None]:
#-- STOPWORDS REMOVAL --
# Custom stopword collection. Despite the "list_" prefix this is a set, so the
# membership tests in stopwords_removal() are constant-time.
list_stopwords = {
    "adalah", "akan", "akhir", "aku", "saya", "antara", "antaranya", "apabila",
    "atau", "bahwa", "bahwasannya", "berikut", "berkata", "berupa", "dan", "dalam",
    "dapat", "dari", "demikian", "dengan", "di", "dia", "beliau", "mas",
    "pak", "diri", "dirinya", "guna", "hal", "hingga", "ia", "ialah",
    "ibarat", "ibaratnya", "ibu", "ingin", "inginkan", "ini", "itu", "jadi",
    "kami", "kalian", "kamu", "kan", "karena", "kini", "lalu", "kita",
    "maka", "mereka", "merupakan", "misal", "misalkan", "misalnya", "pertama", "orang",
    "pada", "nya", "saat", "sendiri", "sini", "yaitu", "yang", "kalau",
    "jika", "untuk", "secara", "sedangkan", "luar", "alangkah", "wkkk",
    "wkwkw", "wkwkwkw", "wk", "wkkw",
}

# drop stopwords from a token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in their original order."""
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept

# Filter the stopwords out of every normalized token list.
data['komen_filtered'] = data['komen_perbaikan'].apply(stopwords_removal) 

# Print the first five filtered token lists as a sanity check.
print(data['komen_filtered'].head())

0    [bagus, salah, satu, pendekatan, antropologi, ...
1    [terimakasih, informasi, sangat, berguna, memo...
2    [terima, kasih, mahasiswa, perwakilan, indones...
3                             [bagus, menteri, bangga]
4    [ide, sangat, cemerlang, patut, didukung, oleh...
Name: komen_filtered, dtype: object


In [None]:
pip install Sastrawi



In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Create the Sastrawi stemmer once; stemmed_wrapper() uses this shared instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    """Return the stem of a single term, as produced by the shared ``stemmer``."""
    stem = stemmer.stem(term)
    return stem

# Register every distinct term once, in first-seen order, so that each one is
# stemmed a single time no matter how often it occurs in the comments.
term_dict = dict.fromkeys(
    term for document in data['komen_filtered'] for term in document
)

# Replace each placeholder value with the stem of its key.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)

# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Map each term of ``document`` to the stem stored for it in ``term_dict``."""
    stems = []
    for term in document:
        stems.append(term_dict[term])
    return stems

# Swap every filtered token for its stem using the precomputed term_dict.
data['komen_stemmed'] = data['komen_filtered'].apply(get_stemmed_term)
# Print the last 10 stemmed token lists to inspect the result.
print(data['komen_stemmed'].tail(10))

990    [magang, lama, buat, stres, juga, usaha, mana,...
991        [pesimis, mental, birokrasi, sekarang, wujud]
992    [cuma, nadiem, saja, revolusioner, tapi, bawah...
993      [harus, bagaimana, tempat, magang, juga, susah]
994    [kuliah, sambil, kerja, bagaimana, tinggal, do...
995    [negara, mau, harus, didik, tegak, wilayah, lu...
996                 [sayang, sekali, belum, ada, perata]
997    [kampus, banyak, lapang, kerja, sedikit, meman...
998        [pesimis, mental, birokrasi, sekarang, wujud]
999    [riset, tidak, bisa, asal, harus, kualitas, ba...
Name: komen_stemmed, dtype: object


In [None]:
#install scikit-learn library
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.24.1)


In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer 

**USING ORDINARY TERM WEIGHTING (TF-only)**

In this step, term weighting is binary: each feature only records whether a term is present in the document (1) or absent (0).

In [None]:
max_features = 1000
# Each row of 'komen_stemmed' is a list of tokens; join it back into one
# space-separated string for the vectorizer. (astype(str) passed the list's
# repr, e.g. "['bagus', 'menteri']", and relied on the tokenizer to drop the
# brackets and quotes.)
databaru = data['komen_stemmed'].apply(' '.join)

# ngram_range=(1,3) extracts unigrams, bigrams and trigrams. binary=True stores
# only presence/absence (1/0) of each n-gram, not its count.
cvect = CountVectorizer(max_features=max_features, ngram_range=(1,3), binary=True)
TF_vector = cvect.fit_transform(databaru)

# Densify the matrix fitted above instead of fitting the vectorizer a second time.
# NOTE: despite its name, tfidf_matt holds binary indicators, not TF-IDF weights.
# NOTE(review): the vocabulary is learned from every row, including rows that
# later end up in the test split; fitting on the training rows only would avoid
# that leakage.
tfidf_matt = TF_vector.toarray()

In [None]:
# Feature matrix from the vectorizer and the sentiment label of each comment.
X= tfidf_matt
Y= data['SENTIMEN']

**SPLITTING DATA**

The dataset contains 1000 labelled comments, positive and negative in a 50:50 proportion. It is split into 900 training samples and 100 test samples.

In [None]:
#splitting data
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=0,
)

**CLASSIFICATION PROCESS**
In this project, classification is performed with a Naive Bayes classifier (BernoulliNB).

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Train a Bernoulli Naive Bayes model on the training split.
clasfc= BernoulliNB()
cl= clasfc.fit(X_train,Y_train)
# Predict a label for every held-out test row and display the predictions.
Y_pred= cl.predict(X_test)
Y_pred

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0])

**EVALUATION**

The evaluation method is a confusion matrix.

In [None]:
from sklearn.metrics import confusion_matrix
# Tabulate the true test labels (Y_test) against the model's predictions (Y_pred).
confusion_matrix(Y_test,Y_pred)

array([[45,  3],
       [ 6, 46]])

In [24]:
# Derive the four counts from the predictions instead of typing them in by hand,
# so the metrics below cannot go stale when the data, split or model changes.
# labels=[0, 1] forces a 2x2 matrix; ravel() flattens it in the order
# TN, FP, FN, TP, and tolist() yields plain Python ints.
TN, FP, FN, TP = confusion_matrix(Y_test, Y_pred, labels=[0, 1]).ravel().tolist()

In [25]:
# accuracy = correctly classified samples / all samples
accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy

0.91

In [26]:
# precision = true positives / everything predicted positive
predicted_positive = TP + FP
precision = TP / predicted_positive
precision

0.9387755102040817

In [27]:
# recall = true positives / everything that is actually positive
actual_positive = TP + FN
recall = TP / actual_positive
recall

0.8846153846153846

In [28]:
# F1 = harmonic mean of precision and recall
f1_numerator = 2 * precision * recall
f1_denominator = precision + recall
f1meas = f1_numerator / f1_denominator
f1meas

0.9108910891089108