In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three pre-processed CSV files, in order, into DataFrames.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_1.csv",
        "../data/processed/test_1.csv",
        "../data/processed/validation_1.csv",
    )
)

In [3]:
from sklearn.model_selection import train_test_split
# The 'review' column is the model input; 'sentiment' is the target.
X, y = train['review'], train['sentiment']

# Hold out 20% of the training file; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
print("Creating tfidf..")
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = TfidfVectorizer(analyzer = "word",
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = None, 
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
%time train_data_features = vectorizer.fit_transform(X_train)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating tfidf..
CPU times: user 151 ms, sys: 7.73 ms, total: 158 ms
Wall time: 158 ms


In [5]:
# Vectorize the held-out split with the already-fitted vectorizer
# (transform only, no refitting), then convert it to a dense array.
test_data_features = vectorizer.transform(X_test).toarray()

In [6]:
from sklearn.metrics import accuracy_score, log_loss,confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [7]:
# 1. import
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
%time nb.fit(train_data_features, y_train)
pred = nb.predict(test_data_features)

CPU times: user 140 ms, sys: 47.9 ms, total: 187 ms
Wall time: 187 ms


In [8]:
# Score the Naive Bayes predictions against the held-out labels.
acc = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: " + str(cm))

# The remaining metrics share one call shape, so compute and print each in turn.
for metric_label, metric_fn in (
    ("Precision Score: ", precision_score),
    ("Recall Score: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label + str(metric_fn(y_test, pred)))

Accuracy Score: 0.7584798994974874
Confusion Matrix: [[ 464  612]
 [ 157 1951]]
Precision Score: 0.7612173234490831
Recall Score: 0.9255218216318786
F1 Score: 0.8353671590665811


In [9]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
%time logistic.fit(train_data_features, y_train)
predic = logistic.predict(test_data_features)
acc = accuracy_score(y_test, predic)
cm = confusion_matrix(y_test,predic)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: "+ str(cm))
print("Precision Score: "+ str(precision_score(y_test,predic)))
print("Recall Score: "+ str(recall_score(y_test,predic)))
print("F1 Score: "+ str(f1_score(y_test,predic)))

CPU times: user 146 ms, sys: 4.63 ms, total: 150 ms
Wall time: 150 ms
Accuracy Score: 0.7713567839195979
Confusion Matrix: [[ 580  496]
 [ 232 1876]]
Precision Score: 0.790893760539629
Recall Score: 0.889943074003795
F1 Score: 0.8375


In [10]:
# Null-accuracy baseline: the score obtained by ignoring the review text and
# always predicting one class. That class is the most frequent label in the
# training split rather than a hard-coded 1, so the baseline stays meaningful
# if the class balance or the label encoding changes.
majority_class = y_train.value_counts().idxmax()

# One constant prediction per evaluated sample.
null_ = [majority_class] * len(y_test)
null_accuracy = accuracy_score(y_test, null_)
print('Null accuracy:', null_accuracy)

Null accuracy: 0.6620603015075377


In [11]:
# From here on, evaluate on the separate validation file.
# NOTE(review): this rebinds X_test, y_test and test_data_features, so any
# earlier cell that reads them gives different numbers if re-run after this
# one. Distinct names would avoid that, but need matching changes in the
# cells that follow.
X_test, y_test = validation['review'], validation['sentiment']

# Transform only: the vectorizer keeps what it learned from the training split.
test_data_features = vectorizer.transform(X_test).toarray()

In [12]:
# Logistic regression scored on the features and labels currently bound to
# test_data_features / y_test.
predic = logistic.predict(X=test_data_features)
acc = accuracy_score(y_test, predic)
cm = confusion_matrix(y_test, predic)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: " + str(cm))
for metric_label, metric_fn in (
    ("Precision Score: ", precision_score),
    ("Recall Score: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label + str(metric_fn(y_test, predic)))

Accuracy Score: 0.7874371859296483
Confusion Matrix: [[ 704  595]
 [ 251 2430]]
Precision Score: 0.8033057851239669
Recall Score: 0.9063782170831779
F1 Score: 0.8517350157728706


In [13]:
# Naive Bayes scored on the features and labels currently bound to
# test_data_features / y_test. Note that this reuses the name `predic`.
predic = nb.predict(X=test_data_features)
acc = accuracy_score(y_test, predic)
cm = confusion_matrix(y_test, predic)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: " + str(cm))
for metric_label, metric_fn in (
    ("Precision Score: ", precision_score),
    ("Recall Score: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label + str(metric_fn(y_test, predic)))

Accuracy Score: 0.7597989949748744
Confusion Matrix: [[ 515  784]
 [ 172 2509]]
Precision Score: 0.7619192225933799
Recall Score: 0.9358448340171578
F1 Score: 0.8399732172748576


In [15]:
# Hand-written sample reviews for a quick side-by-side check of both models.
tes = [
    'barang bagus sekali saya suka saya suka',
    'penipu saya beli telepon genggam yang sampai di rumah saya malah sabun batang awas ya',
    'apa apaan ini sudah sampai lama barang rusak lagi',
    'awal saya khawatir karena penjual belum punya reputasi yang bagus, tapi ternyata barang cepat sekali sampai packing tebal dan rapi barang sampai dengan selamat.',
]

# Vectorize with the fitted vectorizer, then print each model's predictions
# (Naive Bayes first, logistic regression second).
tes_features = vectorizer.transform(tes).toarray()
for model in (nb, logistic):
    print(model.predict(tes_features))

[1 1 1 1]
[0 1 0 1]
