In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats

In [2]:
# input data
feedback_raw = pd.read_csv('yelp_labelled.txt', delimiter= '\t', header=None)

# name columns
feedback_raw.columns=['text', 'target']
feedback_raw.shape

(1000, 2)

In [3]:
# tokenizer
from nltk.tokenize import word_tokenize


# split the data into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feedback_raw['text'].values, 
                 feedback_raw['target'].values,        
                 test_size=0.2)

# vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(tokenizer=word_tokenize)
tf_train = vect.fit_transform(X_train)
tf_test = vect.transform(X_test)

print(vect.get_feature_names())
# import sys
# np.set_printoptions(threshold=sys.maxsize)
# tf_train.toarray()



In [4]:
# logistic regression
# use training dataset to test the test dataset
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model.fit(tf_train, y_train)
preds = model.predict(tf_test)
accuracy = (preds == y_test).mean()
print('the accuracy of logistic regression is :{}'.format(accuracy))

the accuracy of logistic regression is :0.85


In [5]:
from sklearn.metrics import confusion_matrix

#confusion_matrix(tf_test, preds)
confusion_matrix(y_test, preds)



array([[84, 12],
       [18, 86]], dtype=int64)

In [6]:
cm= pd.crosstab(y_test, preds, rownames=['True'], colnames=['Predicted'], margins=True)
display(cm)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,84,12,96
1,18,86,104
All,102,98,200


In [7]:
print('the accuracy is {}'.format((cm.iloc[0,0]+cm.iloc[1,1])/cm.iloc[2,2]))
print('type I error is {}'.format(cm.iloc[0,1]/cm.iloc[2,2]))
print('type II error is {}'.format(cm.iloc[1,0]/cm.iloc[2,2]))

the accuracy is 0.85
type I error is 0.06
type II error is 0.09


In [8]:
# multinomial naive bayes model
from sklearn.naive_bayes import MultinomialNB
modelMNB = MultinomialNB()
modelMNB.fit(tf_train, y_train)
preds = modelMNB.predict(tf_test)
accuracy = (preds == y_test).mean()
print('the accuracy of multinomial naive bayes is :{}'.format(accuracy))

the accuracy of multinomial naive bayes is :0.815


# Do any of your classifiers seem to overfit?

In [9]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, tf_train, y_train, cv=10)

array([0.81481481, 0.83950617, 0.75308642, 0.91358025, 0.8375    ,
       0.8375    , 0.82278481, 0.73417722, 0.79746835, 0.79746835])

In [10]:
from sklearn.model_selection import cross_val_score
cross_val_score(modelMNB, tf_train, y_train, cv=10)

array([0.80246914, 0.85185185, 0.79012346, 0.7654321 , 0.8125    ,
       0.7125    , 0.82278481, 0.74683544, 0.82278481, 0.81012658])

Both multinomial Naive Bayes and multinomial logistic regression works well. Although we see the accuracy score varies from 0.7 to 0.86, no overfitting is observed.  

# Which seem to perform the best? Why?

Logistic regression and MNB model both produce very descent result with accuracy in the test dataset 0.85 and 0.82 respectively.  Logistic regression seems to perform a bit better.  

# Which features seemed to be most impactful to performance?

The word tokenizer is a very powerful tool and by tuning it, we can get a better result.  For example, we may include the stopwords in the tokenizer.  The performance do not vary much on different models once the bag of words have been collected.  