In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three CSV splits stored under data/processed (paths are relative
# to the working directory the notebook is launched from).
train, test, validation = (
    pd.read_csv("../data/processed/train_1.csv"),
    pd.read_csv("../data/processed/test_1.csv"),
    pd.read_csv("../data/processed/validation_1.csv"),
)

In [3]:
from sklearn.model_selection import train_test_split
# Model input is the 'review' column; the target is the 'sentiment' column.
X, y = train['review'], train['sentiment']

# Hold out 20% of the training file as an internal test split. The fixed
# random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
print("Creating the bag of words...")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = None, 
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
%time train_data_features = vectorizer.fit_transform(X_train)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...
CPU times: user 146 ms, sys: 210 µs, total: 146 ms
Wall time: 146 ms


In [5]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
%time forest = forest.fit(train_data_features, y_train)

Training the random forest...
CPU times: user 1min 5s, sys: 125 ms, total: 1min 5s
Wall time: 1min 5s


In [6]:
# Encode the held-out reviews with the vocabulary already learned on the
# training split (transform only, no refit), densify, and classify them
# with the fitted forest.
test_data_features = vectorizer.transform(X_test).toarray()
pred = forest.predict(test_data_features)

In [7]:
from sklearn.metrics import accuracy_score, log_loss,confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
# Score the predictions currently held in `pred` against the held-out labels.
acc, cm = accuracy_score(y_test, pred), confusion_matrix(y_test, pred)
print("Accuracy Score: ", acc, sep="")
print("Confusion Matrix: ", cm, sep="")

Accuracy Score: 0.7669597989949749
Confusion Matrix: [[ 592  484]
 [ 258 1850]]


In [8]:
# 1. import
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
%time nb.fit(train_data_features, y_train)
pred = nb.predict(test_data_features)

CPU times: user 239 ms, sys: 152 ms, total: 391 ms
Wall time: 390 ms


In [9]:
# Metrics for whatever predictions `pred` currently holds (the Naive Bayes
# output when the notebook is executed top to bottom).
acc, cm = accuracy_score(y_test, pred), confusion_matrix(y_test, pred)
print("Accuracy Score: ", acc, sep="")
print("Confusion Matrix: ", cm, sep="")
print("Precision Score: ", precision_score(y_test, pred), sep="")
print("Recall Score: ", recall_score(y_test, pred), sep="")
print("F1 Score: ", f1_score(y_test, pred), sep="")

Accuracy Score: 0.7647613065326633
Confusion Matrix: [[ 603  473]
 [ 276 1832]]
Precision Score: 0.7947939262472885
Recall Score: 0.8690702087286527
F1 Score: 0.8302741898934964


In [10]:
# Null accuracy: the score obtained by ignoring the review text and always
# predicting one and the same class. It is the floor every model above has
# to beat.
#
# The constant class is the most frequent label in the training split
# rather than a hard-coded 1, and the prediction vector is built in a single
# vectorised call instead of appending inside a Python loop.
majority_class = y_train.mode().iloc[0]
null_ = np.full(len(y_test), majority_class)
null_accuracy = accuracy_score(y_test, null_)
print('Null accuracy:', null_accuracy)

Null accuracy: 0.6620603015075377


In [11]:
# 1. import
import lightgbm as lgb

In [12]:
# 2. instantiate a Multinomial Naive Bayes model
lgbm = lgb.LGBMClassifier()
%time lgbm.fit(train_data_features, y_train)
pred = lgbm.predict(test_data_features)
acc = accuracy_score(y_test, pred)
print("Accuracy Score: " + str(acc))

CPU times: user 3.32 s, sys: 112 ms, total: 3.44 s
Wall time: 1.19 s
Accuracy Score: 0.7766959798994975


  if diff:


In [13]:
# Line up each held-out review with its true label and the label stored in
# `pred` (the most recently assigned predictions), then keep only the rows
# where the two disagree.
output = pd.DataFrame({"review": X_test, "actual": y_test, "pred": pred})
wrong = output.loc[output['actual'].ne(output['pred'])]

In [14]:
import csv

# Save the misclassified rows for manual inspection.
# csv.QUOTE_NONE is the named constant for the literal 3 used before: fields
# are written without any quoting. No escapechar is set, so per the csv
# module's documentation the write raises an error if a field contains a
# character that would need escaping (e.g. the delimiter).
wrong.to_csv("wrong_predictions.csv", index=False, quoting=csv.QUOTE_NONE)

In [15]:
import xgboost as xgb

In [16]:
xgbo = xgb.XGBClassifier()
%time xgbo.fit(train_data_features, y_train)
predic = xgbo.predict(test_data_features)
acc = accuracy_score(y_test, predic)
print("Accuracy Score: " + str(acc))

CPU times: user 1min 29s, sys: 352 ms, total: 1min 30s
Wall time: 1min 30s
Accuracy Score: 0.753140703517588


  if diff:


In [17]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
%time logistic.fit(train_data_features, y_train)
predic = logistic.predict(test_data_features)
acc = accuracy_score(y_test, predic)
cm = confusion_matrix(y_test,predic)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: "+ str(cm))
print("Precision Score: "+ str(precision_score(y_test,predic)))
print("Recall Score: "+ str(recall_score(y_test,predic)))
print("F1 Score: "+ str(f1_score(y_test,predic)))

CPU times: user 383 ms, sys: 132 ms, total: 515 ms
Wall time: 515 ms
Accuracy Score: 0.7647613065326633
Confusion Matrix: [[ 569  507]
 [ 242 1866]]
Precision Score: 0.786346396965866
Recall Score: 0.8851992409867173
F1 Score: 0.8328498103101986


In [18]:
# Switch evaluation over to the separate validation file.
# NOTE: X_test, y_test and test_data_features are rebound here, so from this
# cell onward those names refer to the validation data rather than the 20%
# hold-out created by train_test_split.
X_test, y_test = validation['review'], validation['sentiment']
test_data_features = vectorizer.transform(X_test).toarray()

In [19]:
# Naive Bayes on the features currently bound to test_data_features
# (the validation set when the notebook is executed in order).
pred = nb.predict(test_data_features)
acc, cm = accuracy_score(y_test, pred), confusion_matrix(y_test, pred)
print("Accuracy Score: ", acc, sep="")
print("Confusion Matrix: ", cm, sep="")
print("Precision Score: ", precision_score(y_test, pred), sep="")
print("Recall Score: ", recall_score(y_test, pred), sep="")
print("F1 Score: ", f1_score(y_test, pred), sep="")

Accuracy Score: 0.7753768844221105
Confusion Matrix: [[ 706  593]
 [ 301 2380]]
Precision Score: 0.8005381769256643
Recall Score: 0.8877284595300261
F1 Score: 0.8418818535550053


In [20]:
# Logistic regression on the features currently bound to test_data_features
# (the validation set when the notebook is executed in order).
predic = logistic.predict(test_data_features)
acc, cm = accuracy_score(y_test, predic), confusion_matrix(y_test, predic)
print("Accuracy Score: ", acc, sep="")
print("Confusion Matrix: ", cm, sep="")
print("Precision Score: ", precision_score(y_test, predic), sep="")
print("Recall Score: ", recall_score(y_test, predic), sep="")
print("F1 Score: ", f1_score(y_test, predic), sep="")

Accuracy Score: 0.7793969849246232
Confusion Matrix: [[ 677  622]
 [ 256 2425]]
Precision Score: 0.7958647850344601
Recall Score: 0.9045132413278627
F1 Score: 0.846717877094972


In [26]:
# Spot-check both linear-time models on a handful of hand-written reviews.
# The sentences are encoded with the already-fitted vectorizer, so only
# words in its learned vocabulary contribute to the features.
tes = [
    'barang bagus sekali saya suka saya suka',
    'penipu saya beli telepon genggam yang sampai di rumah saya malah sabun batang awas ya',
    'apa apaan ini sudah sampai lama barang rusak lagi',
    'awal saya khawatir karena penjual belum punya reputasi yang bagus, tapi ternyata barang cepat sekali sampai packing tebal dan rapi barang sampai dengan selamat.',
]
tes_features = vectorizer.transform(tes).toarray()

# First line: Naive Bayes labels; second line: logistic-regression labels.
print(nb.predict(tes_features), logistic.predict(tes_features), sep="\n")

[0 0 0 1]
[0 1 0 1]
