In [1]:
import pandas as pd
import numpy as np
import csv
import re

In [2]:
# Load the tab-separated training file. Explicit column names are passed,
# so the first row of the file is read as data rather than as a header.
df_tp = pd.read_table(
    'train_preprocess.tsv.txt',
    names=['txt', 'label'],
)
df_tp

Unnamed: 0,txt,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10995,tidak kecewa,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10997,hormati partai-partai yang telah berkoalisi,neutral
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


In [3]:
# Number of rows per sentiment label
df_tp['label'].value_counts()

label
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

In [4]:
# Summary statistics per column (count, unique values, most frequent value and its frequency)
df_tp.describe()

Unnamed: 0,txt,label
count,11000,11000
unique,10933,3
top,kesal,positive
freq,4,6416


In [5]:
def cleansing(sent):
    """Normalize a raw sentence before feature extraction.

    The text is lower-cased and every character that is not an ASCII letter
    or digit is replaced by one space. Replacement is one-for-one, so a run
    of punctuation becomes a run of spaces; whitespace is not collapsed.

    Args:
        sent: Raw input string.

    Returns:
        The lower-cased string containing only ``a-z``, ``0-9`` and spaces.
    """
    string = sent.lower()
    # After this substitution only letters, digits and spaces remain, so a
    # separate punctuation-stripping pass would never match anything.
    string = re.sub(r'[^a-zA-Z0-9]', ' ', string)
    return string
# Add a column holding the normalized version of every sentence
cleaned_txt = df_tp['txt'].apply(cleansing)
df_tp['text_clean'] = cleaned_txt
df_tp.head()

Unnamed: 0,txt,label,text_clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus dan k212 mmbri hujjah partai...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis di jalan sumatera bandung t...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya diri ini saat unboxing pake...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh jadi mahasiswa jangan sombong dong kas...


In [6]:
# Collect the cleaned sentences into a plain Python list
data_preprocessed = df_tp['text_clean'].tolist()
type(data_preprocessed)

list

In [7]:
# Number of cleaned sentences in the list
len(data_preprocessed)

11000

In [8]:
# NOTE(review): this rebuilds the same list that an earlier cell already
# assigned to data_preprocessed; the cell is redundant on a top-to-bottom run.
data_preprocessed = df_tp['text_clean'].tolist()
len(data_preprocessed)

11000

In [10]:
# Separate the cleaned sentences and their labels into one list per sentiment class
is_neg = df_tp['label'] == 'negative'
is_neu = df_tp['label'] == 'neutral'
is_pos = df_tp['label'] == 'positive'

neg = df_tp.loc[is_neg, 'text_clean'].tolist()
neu = df_tp.loc[is_neu, 'text_clean'].tolist()
pos = df_tp.loc[is_pos, 'text_clean'].tolist()

neg_label = df_tp.loc[is_neg, 'label'].tolist()
neu_label = df_tp.loc[is_neu, 'label'].tolist()
pos_label = df_tp.loc[is_pos, 'label'].tolist()

# Concatenate in positive, neutral, negative order so texts and labels stay aligned
total_data = pos + neu + neg
labels = pos_label + neu_label + neg_label

print("Pos: %s, Neu: %s, Neg: %s" % (len(pos), len(neu), len(neg)))
print("Total data: %s" % len(total_data))

Pos: 6416, Neu: 1148, Neg: 3436
Total data: 11000


# Feature Extraction dengan Neural Network

In [11]:
import sklearn as sk
import math


In [12]:
# Feature extraction: bag-of-words counts over the cleaned sentences
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

# Learn the vocabulary and build the document-term matrix in one pass.
# NOTE(review): the vocabulary is learned from every row, including the rows
# that are later held out for testing — confirm this is acceptable.
X = count_vect.fit_transform(data_preprocessed)
print("Feature Extraction selesai")

Feature Extraction selesai


In [13]:
# Persist the fitted vectorizer so the same vocabulary can be reloaded later
import pickle

# The context manager guarantees the file is flushed and closed after writing
with open("feature.p", "wb") as feature_file:
    pickle.dump(count_vect, feature_file)

<function _pickle.dump(obj, file, protocol=None, *, fix_imports=True, buffer_callback=None)>

In [14]:
# Target labels for the upcoming train/test split
from sklearn.model_selection import train_test_split

classes = df_tp['label']
classes

0        positive
1         neutral
2        positive
3        positive
4        negative
           ...   
10995    positive
10996    positive
10997     neutral
10998    negative
10999    positive
Name: label, Length: 11000, dtype: object

In [15]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, x_test, y_train, y_test = train_test_split(
    X,
    classes,
    test_size=0.2,
    random_state=1,
)
X_train, x_test, y_train, y_test

(<8800x17240 sparse matrix of type '<class 'numpy.int64'>'
 	with 215508 stored elements in Compressed Sparse Row format>,
 <2200x17240 sparse matrix of type '<class 'numpy.int64'>'
 	with 53571 stored elements in Compressed Sparse Row format>,
 8854     negative
 887      positive
 2477     positive
 89       negative
 3956     positive
            ...   
 7813     positive
 10955    negative
 905      positive
 5192     positive
 235      negative
 Name: label, Length: 8800, dtype: object,
 7030     positive
 2095     negative
 7168     positive
 7969     positive
 6320     negative
            ...   
 8986     positive
 10540    positive
 9009      neutral
 5212     negative
 7388     positive
 Name: label, Length: 2200, dtype: object)

In [16]:
# Display the training feature matrix produced by the split
X_train

<8800x17240 sparse matrix of type '<class 'numpy.int64'>'
	with 215508 stored elements in Compressed Sparse Row format>

In [17]:
# Train a neural-network classifier (scikit-learn MLPClassifier) on the training features

from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Two hidden layers of 512 and 256 units. random_state pins the weight
# initialisation and mini-batch sampling so that re-running the notebook
# reproduces the same trained model.
model = MLPClassifier(hidden_layer_sizes=(512, 256), random_state=42)
model.fit(X_train, y_train)

print("Training selesai")

Training selesai


In [18]:
# Persist the trained model to disk; the context manager closes the file after writing
with open("model.p", "wb") as model_file:
    pickle.dump(model, model_file)

In [19]:
# Evaluate the trained model on the held-out rows with scikit-learn's classification_report

from sklearn.metrics import classification_report

test = model.predict(x_test)
print("Testing selesai")

# zero_division=0 reports 0 for a metric whose denominator is zero
report = classification_report(y_test, test, zero_division=0)
print(report)

Testing selesai
              precision    recall  f1-score   support

    negative       0.80      0.82      0.81       672
     neutral       0.81      0.68      0.74       239
    positive       0.90      0.91      0.91      1289

    accuracy                           0.86      2200
   macro avg       0.84      0.80      0.82      2200
weighted avg       0.86      0.86      0.86      2200



In [20]:
# Evaluate the classifier with shuffled 5-fold cross-validation

import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, random_state=42, shuffle=True)

accuracies = []

y = classes

for iteration, (train_idx, test_idx) in enumerate(kf.split(X), start=1):

    # KFold yields positional row numbers, so the label Series is indexed
    # with .iloc. Plain y[...] is label-based and only gives the same rows
    # while the index is the default 0..n-1 range.
    data_train = X[train_idx]
    target_train = y.iloc[train_idx]

    data_test = X[test_idx]
    target_test = y.iloc[test_idx]

    # NOTE(review): this is MLPClassifier's default architecture, not the
    # hidden_layer_sizes=(512, 256) network that is trained and pickled
    # earlier — confirm which model these scores are meant to describe.
    # random_state makes each fold's training repeatable across runs.
    clf = MLPClassifier(random_state=42)
    clf.fit(data_train, target_train)

    preds = clf.predict(data_test)

    accuracy = accuracy_score(target_test, preds)

    print("Training ke-", iteration)
    print(classification_report(target_test, preds, zero_division=0))
    print("========================================================")

    accuracies.append(accuracy)

# Mean accuracy over the five folds
average_accuracy = np.mean(accuracies)

print()
print()
print()
print("Rata-rata Accuracy: ", average_accuracy)

Training ke- 1
              precision    recall  f1-score   support

    negative       0.77      0.78      0.77       680
     neutral       0.76      0.64      0.69       239
    positive       0.87      0.90      0.89      1281

    accuracy                           0.83      2200
   macro avg       0.80      0.77      0.78      2200
weighted avg       0.83      0.83      0.83      2200

Training ke- 2
              precision    recall  f1-score   support

    negative       0.79      0.76      0.78       706
     neutral       0.73      0.70      0.71       220
    positive       0.88      0.90      0.89      1274

    accuracy                           0.83      2200
   macro avg       0.80      0.79      0.79      2200
weighted avg       0.83      0.83      0.83      2200

Training ke- 3
              precision    recall  f1-score   support

    negative       0.80      0.80      0.80       682
     neutral       0.85      0.71      0.77       215
    positive       0.89      0

In [22]:
# Predict the sentiment of one example review with the trained model
original_text = '''
kafe ini menyajikan sensasi makan dalam gelap . cukup unik sih , tetapi lebih dari sekali , rasanya orang jarang datang lagi . ha*!^@%R!&^%*&!#*&!#^*&#^(!#*(!#*(nya menarik untuk sekadar mencoba
'''

# Apply the same cleaning and vectorizer that were used for the training data
cleaned = cleansing(original_text)
text = count_vect.transform([cleaned])

# predict returns one label per input row; take the label of the single row
predictions = model.predict(text)
result = predictions[0]

print("Sentiment:")
print()
print(result)

Sentiment:

negative
