In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas / scikit-learn
# calls further down — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score,precision_score

## Create a DataFrame to collect results

In [5]:
# Empty results table: one column per recorded setting plus the accuracy score.
col_names = [
    'model',
    'alpha',
    'CountVectorizer',
    'TfidfVectorizer',
    'text',
    'score',
]
results = pd.DataFrame(columns=col_names)
results

Unnamed: 0,model,alpha,CountVectorizer,TfidfVectorizer,text,score


# Load data

In [6]:
# Read the review CSV, keeping only the rating and the raw / cleaned text columns.
path = 'yelp_data/health_text_sentiment.csv'
wanted_cols = ['stars', 'text', 'clean_text']

# `usecols` discards every other column at parse time (e.g. a saved index such as
# 'Unnamed: 0'), so no in-place drop is needed and a CSV without that column no
# longer raises KeyError. The trailing selection fixes the column order, because
# `usecols` keeps the order found in the file.
df = pd.read_csv(path, usecols=wanted_cols)[wanted_cols]

## select 1 & 5 stars

In [7]:
# Keep only the extreme ratings (1 and 5 stars), giving a two-class problem.
# `.ix` was removed in pandas 1.0; a boolean mask with `.loc` selects the same rows
# and keeps their original index labels. `.copy()` makes `data` independent of `df`.
is_extreme = (df.stars == 1) | (df.stars == 5)
data = df.loc[is_extreme].copy()
# Labels stay as the raw star values (1 / 5); they are not remapped to 0 / 1.

data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,1,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,5,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


## split


In [8]:
# Feature arrays: the raw review text and its pre-cleaned counterpart, with every
# entry forced to str (presumably so missing values cannot break the vectorizers — confirm).
X = data.text.values.astype(str)
Xc = data.clean_text.values.astype(str)

# Target array: the star rating of each selected review.
y = data.stars.values

## train_test_split

In [9]:
# Hold out 33% of the *cleaned* texts for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Xc,
    y,
    test_size=0.33,
    random_state=42,
)

## CountVectorizer

In [10]:
# Bag-of-words counts. The vocabulary is learned from the training texts only and
# then reused, unchanged, to encode the test texts.
count_vectorizer = CountVectorizer()
count_train, count_test = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

## TfidfVectorizer

In [11]:
# TF-IDF weights. Vocabulary and IDF statistics come from the training texts only
# and are then applied as-is to the test texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# Multinomial NB: CountVectorizer

In [12]:
# Baseline: MultinomialNB with default settings on the raw term counts.
mn_count_clf = MultinomialNB().fit(count_train, y_train)

# Accuracy on the held-out split.
pred = mn_count_clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.957


In [None]:
# Display the smoothing parameter the fitted count-based model is using.
mn_count_clf.alpha

In [13]:
# Record this run (count features, cleaned text) as the next row of the results table.
my_dic = {
    'model': 'MultinomialNB',
    'alpha': mn_count_clf.alpha,
    'CountVectorizer': 1,
    'TfidfVectorizer': 0,
    'text': 'clean',
    'score': score,
}
next_row = len(results)
results.loc[next_row] = my_dic
results

Unnamed: 0,model,alpha,CountVectorizer,TfidfVectorizer,text,score
0,MultinomialNB,1.0,1,0,clean,0.95673


In [None]:
# Vocabulary terms of the count vectorizer.
# The original line assigned from an undefined name (`vect`) and raised NameError.
# NOTE(review): presumably the count vectorizer's vocabulary was intended — confirm.
# get_feature_names() is absent from newer scikit-learn releases, which provide
# get_feature_names_out() instead; use whichever the installed version has.
_get_names = (getattr(count_vectorizer, 'get_feature_names_out', None)
              or count_vectorizer.get_feature_names)
feature_names = _get_names()

In [None]:
# Accuracy per smoothing value; filled in by the sweep over `alpha_values`.
alpha_results = {}

# Candidate alphas 0.0, 0.1, ..., 1.0. Rounding strips the float noise left by arange.
alpha_values = np.around(np.arange(0, 1.1, .1), decimals=2)
alpha_values

In [None]:
last_score = 0
# Refit the count-based model once per candidate alpha and keep its test accuracy.
# Note: `mn_count_clf`, `pred` and `score` are overwritten on every iteration.
for alpha in alpha_values:
    mn_count_clf = MultinomialNB(alpha=alpha).fit(count_train, y_train)
    pred = mn_count_clf.predict(count_test)
    score = metrics.accuracy_score(y_test, pred)

    alpha_results[alpha] = score
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

In [None]:
# Plot accuracy against alpha and annotate the best-scoring point.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `x` / `y` here would overwrite the
# label array `y` created in the split cell.
alphas = list(alpha_results.keys())
scores = list(alpha_results.values())
ax.plot(alphas, scores)

# First occurrence of the maximum, i.e. the smallest alpha that reaches the best score.
best_score = max(scores)
best_alpha = alphas[scores.index(best_score)]

# Label with the actual alpha value. Formatting the list index as 'alpha 0.<idx>'
# mislabelled alpha=1.0 (index 10) as 'alpha 0.10'.
value = 'alpha {:.2f}\nval:{:3f}'.format(best_alpha, best_score)
ax.annotate(value, xy=(best_alpha, best_score),
            arrowprops=dict(facecolor='black'),
            xytext=(best_alpha, best_score - .025))

ax.set_title('Alpha values')
ax.set_xlabel('alpha values')
ax.set_ylabel('score')
plt.show()

In [14]:
# Refit the count-based model with alpha=0.3 (presumably the value picked from the
# alpha sweep — confirm) and measure accuracy on the held-out split.
mn_count_clf = MultinomialNB(alpha=0.3).fit(count_train, y_train)

pred = mn_count_clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.957


In [15]:
# Record the re-fitted count-based run as the next row of the results table.
my_dic = {
    'model': 'MultinomialNB',
    'alpha': mn_count_clf.alpha,
    'CountVectorizer': 1,
    'TfidfVectorizer': 0,
    'text': 'clean',
    'score': score,
}
next_row = len(results)
results.loc[next_row] = my_dic
results

Unnamed: 0,model,alpha,CountVectorizer,TfidfVectorizer,text,score
0,MultinomialNB,1.0,1,0,clean,0.95673
1,MultinomialNB,0.3,1,0,clean,0.956674


In [None]:
# Per-class precision / recall / F1 for the most recent predictions in `pred`.
print(classification_report(y_test, pred))

In [None]:
# Confusion matrix of the count-based model's test predictions.
# The class order is pinned explicitly and reused for the tick labels, so the axes
# show the actual star ratings instead of positional indices 0 / 1.
class_labels = mn_count_clf.classes_
cm = confusion_matrix(y_test, pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(cm, annot=True, fmt="d", cbar=False,
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion matrix: CountVectorizer')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

# Multinomial NB: TfidfVectorizer

In [16]:
# Baseline: MultinomialNB with default settings on the TF-IDF features.
mn_tfidf_clf = MultinomialNB().fit(tfidf_train, y_train)

# Accuracy on the held-out split.
pred = mn_tfidf_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.955


In [17]:
# Record this run (TF-IDF features, cleaned text) as the next row of the results table.
my_dic = {
    'model': 'MultinomialNB',
    'alpha': mn_tfidf_clf.alpha,
    'CountVectorizer': 0,
    'TfidfVectorizer': 1,
    'text': 'clean',
    'score': score,
}
next_row = len(results)
results.loc[next_row] = my_dic
results

Unnamed: 0,model,alpha,CountVectorizer,TfidfVectorizer,text,score
0,MultinomialNB,1.0,1,0,clean,0.95673
1,MultinomialNB,0.3,1,0,clean,0.956674
2,MultinomialNB,1.0,0,1,clean,0.955433


In [None]:
# Print, side by side, the n terms with the lowest and the highest log-probability
# under the second class (classes_[1]) of the TF-IDF model.
# These are per-class feature log-probabilities, not discriminative weights: the
# right-hand column is simply the most probable terms in that class.

# get_feature_names() is absent from newer scikit-learn releases, which provide
# get_feature_names_out() instead; use whichever the installed version has.
_get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
              or tfidf_vectorizer.get_feature_names)
feature_names = _get_names()

# For a two-class MultinomialNB the legacy `coef_` / `intercept_` attributes mirrored
# feature_log_prob_[1:] / class_log_prior_[1:]. They are no longer available in
# current scikit-learn, while the attributes used here exist in old and new releases.
coefs = mn_tfidf_clf.feature_log_prob_[1:]
intercept = mn_tfidf_clf.class_log_prior_[1:]

# Ascending by log-probability (ties broken by term).
coefs_w_fns = sorted(zip(coefs[0], feature_names))
n = 10
# Pair the n smallest entries with the n largest (largest first).
top_n_coefs = zip(coefs_w_fns[:n], coefs_w_fns[:-(n+1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1,fn_1,coef_2, fn_2))

## alpha values

In [None]:
# Start a fresh accuracy-per-alpha mapping for the TF-IDF sweep.
alpha_results = {}
# Candidate alphas 0.0, 0.1, ..., 1.0. Rounding strips the float noise left by arange.
alpha_values = np.around(np.arange(0, 1.1, .1), decimals=2)
alpha_values

In [None]:
# Refit the TF-IDF model once per candidate alpha and keep its test accuracy.
# Note: `mn_tfidf_clf`, `pred` and `score` are overwritten on every iteration.
for alpha in alpha_values:
    mn_tfidf_clf = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    pred = mn_tfidf_clf.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)

    alpha_results[alpha] = score
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

## Plot best alpha value

In [None]:
# Plot accuracy against alpha for the TF-IDF sweep and annotate the best-scoring point.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `x` / `y` here would overwrite the
# label array `y` created in the split cell.
alphas = list(alpha_results.keys())
scores = list(alpha_results.values())
ax.plot(alphas, scores)

# First occurrence of the maximum, i.e. the smallest alpha that reaches the best score.
best_score = max(scores)
best_alpha = alphas[scores.index(best_score)]

# Label with the actual alpha value. Formatting the list index as 'alpha 0.<idx>'
# mislabelled alpha=1.0 (index 10) as 'alpha 0.10'.
value = 'alpha {:.2f}\nval:{:3f}'.format(best_alpha, best_score)
ax.annotate(value, xy=(best_alpha, best_score),
            arrowprops=dict(facecolor='black'),
            xytext=(best_alpha, best_score - .025))

ax.set_title('Alpha values')
ax.set_xlabel('alpha values')
ax.set_ylabel('score')
plt.show()

In [18]:
# Refit the TF-IDF model with alpha=0.6 (presumably the value picked from the
# alpha sweep — confirm) and measure accuracy on the held-out split.
mn_tfidf_clf = MultinomialNB(alpha=0.6).fit(tfidf_train, y_train)

pred = mn_tfidf_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.956


In [19]:
# Record the re-fitted TF-IDF run as the next row of the results table.
my_dic = {
    'model': 'MultinomialNB',
    'alpha': mn_tfidf_clf.alpha,
    'CountVectorizer': 0,
    'TfidfVectorizer': 1,
    'text': 'clean',
    'score': score,
}
next_row = len(results)
results.loc[next_row] = my_dic
results

Unnamed: 0,model,alpha,CountVectorizer,TfidfVectorizer,text,score
0,MultinomialNB,1.0,1,0,clean,0.95673
1,MultinomialNB,0.3,1,0,clean,0.956674
2,MultinomialNB,1.0,0,1,clean,0.955433
3,MultinomialNB,0.6,0,1,clean,0.95611


In [None]:
# Leftover directory listing; it only works through IPython automagic (a bare `ls`
# is an undefined name in plain Python).
# NOTE(review): safe to delete from the finished notebook.
ls

In [20]:
# Persist the comparison table next to the notebook.
# `index` is left at its default, so the row index is written as an unnamed first
# column of the CSV.
results.to_csv('MNB_results_clean_text.csv')

In [None]:
# Per-class precision / recall / F1 for the most recent predictions in `pred`.
print(classification_report(y_test, pred))

In [None]:
# Confusion matrix of the TF-IDF model's test predictions.
# The class order is pinned explicitly and reused for the tick labels, so the axes
# show the actual star ratings instead of positional indices 0 / 1.
class_labels = mn_tfidf_clf.classes_
cm = confusion_matrix(y_test, pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(cm, annot=True, fmt="d", cbar=False,
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion matrix: TfidfVectorizer')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [21]:
# Print, side by side, the n terms with the lowest and the highest log-probability
# under the second class (classes_[1]) of the TF-IDF model.
# These are per-class feature log-probabilities, not discriminative weights: the
# right-hand column is simply the most probable terms in that class.

# get_feature_names() is absent from newer scikit-learn releases, which provide
# get_feature_names_out() instead; use whichever the installed version has.
_get_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
              or tfidf_vectorizer.get_feature_names)
feature_names = _get_names()

# For a two-class MultinomialNB the legacy `coef_` / `intercept_` attributes mirrored
# feature_log_prob_[1:] / class_log_prior_[1:]. They are no longer available in
# current scikit-learn, while the attributes used here exist in old and new releases.
coefs = mn_tfidf_clf.feature_log_prob_[1:]
intercept = mn_tfidf_clf.class_log_prior_[1:]

# Ascending by log-probability (ties broken by term).
coefs_w_fns = sorted(zip(coefs[0], feature_names))
n = 10
# Pair the n smallest entries with the n largest (largest first).
top_n_coefs = zip(coefs_w_fns[:n], coefs_w_fns[:-(n+1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1,fn_1,coef_2, fn_2))

	-12.2929	____           		-4.7233	dr             
	-12.2929	_o             		-5.1072	staff          
	-12.2929	aaaaaaahhh     		-5.1553	great          
	-12.2929	aaaah          		-5.2483	not            
	-12.2929	aaaai          		-5.2633	good           
	-12.2929	aaahh          		-5.2685	doctor         
	-12.2929	aair           		-5.2944	care           
	-12.2929	aal            		-5.3803	time           
	-12.2929	aara           		-5.3970	recommend      
	-12.2929	aasm           		-5.4059	go             
