In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score,precision_score

## Create df for results
## columns
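- model
- C_value (the `C` passed to LogisticRegression)
- CountVectorizer / TfidfVectorizer (1/0 flags for the feature extraction used)
- ngram_range
- max_df
- text (original, clean)
- score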

In [5]:
# Column layout of the experiment log: one row is appended per model run.
col_names = [
    'model',
    'C_value',
    'CountVectorizer',
    'TfidfVectorizer',
    'ngram_range',
    'max_df',
    'text',
    'score',
]
# Start from an empty frame; later cells add rows with results.loc[len(results)].
results = pd.DataFrame(columns=col_names)
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score


# Load data

In [6]:
# Load the review CSV, discard the 'Unnamed: 0' column (presumably an index
# saved with the file) and keep the rating plus the raw and cleaned text.
path = 'yelp_data/health_text_sentiment.csv'
df = pd.read_csv(path)
df = df.drop('Unnamed: 0', axis=1)

df = df[['stars', 'text', 'clean_text']]

## select 1 & 5 stars

In [7]:
# Keep only the 1-star and 5-star reviews for a binary sentiment task.
# A boolean mask with .loc replaces the `.ix` indexer, which no longer exists
# in pandas >= 1.0; the selected rows and their original index labels are unchanged.
# The star values (1 and 5) are left as-is and used directly as class labels.
data = df.loc[(df.stars == 1) | (df.stars == 5)]

data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,1,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,5,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


## split


In [8]:
# Targets: the star rating of each selected review.
y = data['stars'].values

# Two candidate inputs, both cast to str (presumably to guard against
# non-string entries): Xc is the cleaned text, X the original review text.
Xc = data['clean_text'].values.astype(str)
X = data['text'].values.astype(str)

## train_test_split

In [9]:
# Hold out 33% of the clean-text reviews; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xc,
    y,
    test_size=0.33,
    random_state=42,
)

## CountVectorizer

In [10]:
# Bag-of-words counts with default settings.
# The vocabulary is learned on the training split only ...
count_vectorizer = CountVectorizer()
count_train = count_vectorizer.fit_transform(X_train)
# ... and then applied unchanged to the held-out split.
count_test = count_vectorizer.transform(X_test)

## TfidfVectorizer

In [11]:
# TF-IDF features with default settings.
# Fitted on the training split only ...
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# ... and then applied unchanged to the held-out split.
tfidf_test = tfidf_vectorizer.transform(X_test)

# LogisticRegression: CountVectorizer

In [12]:
# Baseline: default LogisticRegression on the count features,
# evaluated by accuracy on the held-out split.
lr_count_clf = LogisticRegression().fit(count_train, y_train)
score = lr_count_clf.score(count_test, y_test)
print("accuracy:   %0.3f" % score)

accuracy:   0.967


In [None]:
lr_count_clf

In [None]:
lr_count_clf.C

In [None]:
count_vectorizer.ngram_range

In [None]:
count_vectorizer.max_df

In [None]:
#['model', 'C_value','CountVectorizer','TfidfVectorizer','ngram_range','max_df','text','score']


In [13]:
# Log the run above (count features, clean text) as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_count_clf.C,
    CountVectorizer=1,
    TfidfVectorizer=0,
    ngram_range=count_vectorizer.ngram_range,
    max_df=count_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223


In [None]:
results

## C value

In [None]:
# Candidate C values for the count-vectorizer model (0.1 up to, not including, 1)
# and an empty mapping that the sweep fills with C -> accuracy.
c_values = np.arange(0.1, 1, .1)
c_results = {}
c_values

In [None]:
# Sweep C for the count-vectorizer model and record held-out accuracy per value.
# NOTE(review): accuracy is measured on the test split, so a C picked from this
# sweep is tuned on test data and the reported score will be optimistic.
for c in c_values:
    lr_count_clf = LogisticRegression(C=c).fit(count_train, y_train)
    score = lr_count_clf.score(count_test, y_test)
    c_results[c] = score
    print("C value: {:.2f} Score: {:.5f}".format(c, score))

In [None]:
# Plot accuracy against C and annotate the best-scoring value.
# The plotted series get their own names: assigning them to `x`/`y` would
# overwrite the label array `y` that train_test_split relies on.
c_axis = list(c_results.keys())
c_scores = list(c_results.values())

fig, ax = plt.subplots()
ax.plot(c_axis, c_scores)

# Highest accuracy and the first C value that reaches it.
ymax = max(c_scores)
xmax = c_axis[c_scores.index(ymax)]
value = 'C value :{}\nval:{:3f}'.format(xmax,ymax)
ax.annotate(value, xy=(xmax, ymax),
            arrowprops=dict(facecolor='black'),
            xytext=(xmax+.01,ymax)
            )
plt.title('C values')
plt.xlabel('C values')
plt.ylabel('score')
plt.show()

In [14]:
# Refit the count-vectorizer model with C=0.2 and keep the predictions in
# `pred`, which the report / confusion-matrix cells read.
lr_count_clf = LogisticRegression(C=0.2).fit(count_train, y_train)
pred = lr_count_clf.predict(count_test)

# Same accuracy function as metrics.accuracy_score, via the direct import.
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.968


In [15]:
# Log the C=0.2 count-vectorizer run as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_count_clf.C,
    CountVectorizer=1,
    TfidfVectorizer=0,
    ngram_range=count_vectorizer.ngram_range,
    max_df=count_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic

In [16]:
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223
1,LogisticRegression,0.2,1,0,"(1, 1)",1.0,clean,0.967675


In [None]:
print(classification_report(y_test, pred))

In [None]:
# Confusion matrix of `pred` against the held-out labels.
# The class labels are passed explicitly to both the matrix and the tick labels:
# by default the heatmap axes show positional indices 0/1, which is misleading
# when the classes are the star ratings 1 and 5.
labels = np.unique(np.concatenate([y_test, pred]))
cm = confusion_matrix(y_test, pred, labels=labels)
plt.figure(figsize = (8,4))
sns.heatmap(cm, annot=True, fmt="d", cbar=False,
            xticklabels=labels, yticklabels=labels)
plt.title('Confusion matrix: CountVectorizer')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# LogisticRegression: TfidfVectorizer

In [17]:
# Baseline: default LogisticRegression on the TF-IDF features,
# evaluated by accuracy on the held-out split.
lr_tfidf_clf = LogisticRegression().fit(tfidf_train, y_train)
score = lr_tfidf_clf.score(tfidf_test, y_test)
print("accuracy:   %0.3f" % score)

accuracy:   0.967


In [18]:
# Log the TF-IDF run above as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic

In [19]:
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223
1,LogisticRegression,0.2,1,0,"(1, 1)",1.0,clean,0.967675
2,LogisticRegression,1.0,0,1,"(1, 1)",1.0,clean,0.96711


In [None]:
# Print the n most negative and n most positive coefficients side by side,
# each with the vocabulary term it belongs to.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back for older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
coefs = lr_tfidf_clf.coef_
# coef_[0] is the single coefficient row used here; sort (coef, term) pairs ascending.
coefs_w_fns = sorted(zip(coefs[0], feature_names))
n=10
# Pair the n smallest coefficients with the n largest (largest first).
top_n_coefs = zip(coefs_w_fns[:n], coefs_w_fns[:-(n+1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1,fn_1,coef_2, fn_2))

In [None]:
# Candidate C values for the TF-IDF model (integers 1..10) and an empty
# mapping that the sweep fills with C -> accuracy.
c_values = np.arange(1, 11)
c_results = {}
c_values

In [None]:
# Sweep C for the TF-IDF model and record held-out accuracy per value.
# NOTE(review): accuracy is measured on the test split, so a C picked from this
# sweep is tuned on test data and the reported score will be optimistic.
for c in c_values:
    lr_tfidf_clf = LogisticRegression(C=c).fit(tfidf_train, y_train)
    score = lr_tfidf_clf.score(tfidf_test, y_test)
    c_results[c] = score
    print("C value: {:.2f} Score: {:.5f}".format(c, score))

In [None]:
# Plot accuracy against C and annotate the best-scoring value.
# The plotted series get their own names: assigning them to `x`/`y` would
# overwrite the label array `y` that train_test_split relies on.
c_axis = list(c_results.keys())
c_scores = list(c_results.values())

fig, ax = plt.subplots()
ax.plot(c_axis, c_scores)

# Highest accuracy and the first C value that reaches it.
ymax = max(c_scores)
xmax = c_axis[c_scores.index(ymax)]
value = 'C value :{}\nval:{:3f}'.format(xmax,ymax)
ax.annotate(value, xy=(xmax, ymax),
            arrowprops=dict(facecolor='black'),
            xytext=(xmax+.01,ymax)
            )
plt.title('C values')
plt.xlabel('C values')
plt.ylabel('score')
plt.show()

In [20]:
# Refit the TF-IDF model with C=8 and report held-out accuracy.
lr_tfidf_clf = LogisticRegression(C=8).fit(tfidf_train, y_train)
score = lr_tfidf_clf.score(tfidf_test, y_test)
print("accuracy:   %0.3f" % score)

accuracy:   0.970


In [21]:
# Log the C=8 TF-IDF run as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223
1,LogisticRegression,0.2,1,0,"(1, 1)",1.0,clean,0.967675
2,LogisticRegression,1.0,0,1,"(1, 1)",1.0,clean,0.96711
3,LogisticRegression,8.0,0,1,"(1, 1)",1.0,clean,0.970044


In [None]:
results

## max_df values

In [None]:
# Candidate max_df values in steps of 0.1, rounded to 2 decimals to strip
# floating-point noise from arange, plus an empty max_df -> accuracy mapping.
dfrq_range = np.around(np.arange(0.1, 1.1, 0.1), decimals=2)
dfrq_results = {}
dfrq_range

In [None]:
# Sweep TfidfVectorizer max_df with a C=8 LogisticRegression and record
# held-out accuracy per value. The vectorizer must be refit for every max_df.
# NOTE(review): accuracy is measured on the test split, so a max_df picked from
# this sweep is tuned on test data.
for dfq in dfrq_range:
    tfidf_vectorizer = TfidfVectorizer(max_df=dfq)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    lr_tfidf_clf = LogisticRegression(C=8).fit(tfidf_train, y_train)
    score = lr_tfidf_clf.score(tfidf_test, y_test)
    dfrq_results[dfq] = score
    print("max_df: {:.2f} Score: {:.5f}".format(dfq, score))

In [None]:
dfrq_results

In [None]:
# Plot accuracy against max_df and annotate the best-scoring value.
# The plotted series get their own names: assigning them to `x`/`y` would
# overwrite the label array `y` that train_test_split relies on.
dfrq_axis = list(dfrq_results.keys())
dfrq_scores = list(dfrq_results.values())

fig, ax = plt.subplots()
ax.plot(dfrq_axis, dfrq_scores)

# Highest accuracy and the first max_df value that reaches it.
ymax = max(dfrq_scores)
xmax = dfrq_axis[dfrq_scores.index(ymax)]
value = 'max_df :{}\nval:{:3f}'.format(xmax,ymax)
# No xytext is given, so the label is placed at the annotated point itself.
ax.annotate(value, xy=(xmax, ymax),
            arrowprops=dict(facecolor='black'),
            )
plt.title('max_df')
plt.xlabel('max_df values')
plt.ylabel('score')
plt.show()

##  ngrams

In [None]:
# (min_n, max_n) pairs to try as TfidfVectorizer ngram_range, and an empty
# mapping that the sweep fills with ngram_range -> accuracy.
ngram_ranges = (
    (1, 2),
    (1, 3),
    (1, 4),
    (2, 3),
    (2, 4),
)
ngram_results = {}
ngram_ranges

In [None]:

# Sweep TF-IDF n-gram ranges with a default LogisticRegression and record
# held-out accuracy per range.
# The sweep reuses the existing clean-text X_train/X_test split. Re-splitting
# the raw `X` inside the loop would silently replace the split for every later
# cell (all of which log text='clean') and repeats identical work per iteration.
for ngram_range in ngram_ranges:
    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    lr_tfidf_clf = LogisticRegression()
    lr_tfidf_clf.fit(tfidf_train, y_train)
    score = lr_tfidf_clf.score(tfidf_test, y_test)
    print("ngram_range: {} Score: {:.5f}".format(ngram_range, score))
    ngram_results[ngram_range] = score

In [22]:
# Rebuild the TF-IDF features with unigrams + bigrams ...
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# ... and score a default LogisticRegression on the held-out split.
lr_tfidf_clf = LogisticRegression().fit(tfidf_train, y_train)
score = lr_tfidf_clf.score(tfidf_test, y_test)
print("accuracy:   %0.3f" % score)

accuracy:   0.966


In [23]:
# Log the unigram+bigram TF-IDF run as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223
1,LogisticRegression,0.2,1,0,"(1, 1)",1.0,clean,0.967675
2,LogisticRegression,1.0,0,1,"(1, 1)",1.0,clean,0.96711
3,LogisticRegression,8.0,0,1,"(1, 1)",1.0,clean,0.970044
4,LogisticRegression,1.0,0,1,"(1, 2)",1.0,clean,0.966151


In [None]:
# Reset the max_df sweep: candidate values in steps of 0.1 (rounded to 2
# decimals to strip arange's floating-point noise) and an empty result mapping.
dfrq_range = np.around(np.arange(0.1, 1.1, 0.1), decimals=2)
dfrq_results = {}
dfrq_range

In [None]:
# Sweep max_df for the unigram+bigram TF-IDF features with a default
# LogisticRegression and record held-out accuracy per value.
# NOTE(review): accuracy is measured on the test split, so a max_df picked from
# this sweep is tuned on test data.
for dfq in dfrq_range:
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=dfq)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    lr_tfidf_clf = LogisticRegression().fit(tfidf_train, y_train)
    score = lr_tfidf_clf.score(tfidf_test, y_test)
    dfrq_results[dfq] = score
    print("max_df: {:.2f} Score: {:.5f}".format(dfq, score))

In [None]:
# Plot accuracy against max_df and annotate the best-scoring value.
# The plotted series get their own names: assigning them to `x`/`y` would
# overwrite the label array `y` that train_test_split relies on.
dfrq_axis = list(dfrq_results.keys())
dfrq_scores = list(dfrq_results.values())

fig, ax = plt.subplots()
ax.plot(dfrq_axis, dfrq_scores)

# Highest accuracy and the first max_df value that reaches it.
ymax = max(dfrq_scores)
xmax = dfrq_axis[dfrq_scores.index(ymax)]
value = 'max_df :{}\nval:{:3f}'.format(xmax,ymax)
# No xytext is given, so the label is placed at the annotated point itself.
ax.annotate(value, xy=(xmax, ymax),
            arrowprops=dict(facecolor='black'),
            )
plt.title('max_df')
plt.xlabel('max_df values')
plt.ylabel('score')
plt.show()

In [24]:
# Rebuild the TF-IDF features with unigrams + bigrams and max_df=0.1 ...
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.1)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# ... and score a default LogisticRegression on the held-out split.
lr_tfidf_clf = LogisticRegression().fit(tfidf_train, y_train)
score = lr_tfidf_clf.score(tfidf_test, y_test)
print("accuracy:   %0.3f" % score)

accuracy:   0.967


In [25]:
# Log the unigram+bigram, max_df=0.1 TF-IDF run as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223
1,LogisticRegression,0.2,1,0,"(1, 1)",1.0,clean,0.967675
2,LogisticRegression,1.0,0,1,"(1, 1)",1.0,clean,0.96711
3,LogisticRegression,8.0,0,1,"(1, 1)",1.0,clean,0.970044
4,LogisticRegression,1.0,0,1,"(1, 2)",1.0,clean,0.966151
5,LogisticRegression,1.0,0,1,"(1, 2)",0.1,clean,0.967223


In [None]:
# Reset the C sweep: integer candidates 1..10 and an empty C -> accuracy mapping.
c_values = np.arange(1, 11)
c_results = {}
c_values

In [None]:
# Sweep C for the unigram+bigram, max_df=0.1 TF-IDF model and record held-out
# accuracy per value.
# The features do not depend on C, so the vectorizer is fitted once up front
# instead of being refitted with identical settings on every iteration.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),max_df=0.1)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# NOTE(review): accuracy is measured on the test split, so a C picked from this
# sweep is tuned on test data and the reported score will be optimistic.
for c in c_values:
    lr_tfidf_clf = LogisticRegression(C=c)
    lr_tfidf_clf.fit(tfidf_train, y_train)
    score = lr_tfidf_clf.score(tfidf_test, y_test)
    print("C value: {:.2f} Score: {:.5f}".format(c, score))
    c_results[c] = score

In [None]:
# Plot accuracy against C and annotate the best-scoring value.
# The plotted series get their own names: assigning them to `x`/`y` would
# overwrite the label array `y` that train_test_split relies on.
c_axis = list(c_results.keys())
c_scores = list(c_results.values())

fig, ax = plt.subplots()
ax.plot(c_axis, c_scores)

# Highest accuracy and the first C value that reaches it.
ymax = max(c_scores)
xmax = c_axis[c_scores.index(ymax)]
value = 'C value :{}\nval:{:3f}'.format(xmax,ymax)
ax.annotate(value, xy=(xmax, ymax),
            arrowprops=dict(facecolor='black'),
            xytext=(xmax+.01,ymax)
            )
plt.title('C values')
plt.xlabel('C values')
plt.ylabel('score')
plt.show()

## Best Results

In [26]:
# Final configuration: unigram+bigram TF-IDF with max_df=0.1 ...
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.1)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# ... with LogisticRegression at C=10, scored on the held-out split.
lr_tfidf_clf = LogisticRegression(C=10).fit(tfidf_train, y_train)
score = lr_tfidf_clf.score(tfidf_test, y_test)
print("accuracy:   %0.5f" % score)

accuracy:   0.97484


In [27]:
# Log the final (C=10, bigrams, max_df=0.1) run as the next row of `results`.
my_dic = dict(
    model='LogisticRegression',
    C_value=lr_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='clean',
    score=score,
)
# len(results) is the next free integer label, so this appends a row.
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LogisticRegression,1.0,1,0,"(1, 1)",1.0,clean,0.967223
1,LogisticRegression,0.2,1,0,"(1, 1)",1.0,clean,0.967675
2,LogisticRegression,1.0,0,1,"(1, 1)",1.0,clean,0.96711
3,LogisticRegression,8.0,0,1,"(1, 1)",1.0,clean,0.970044
4,LogisticRegression,1.0,0,1,"(1, 2)",1.0,clean,0.966151
5,LogisticRegression,1.0,0,1,"(1, 2)",0.1,clean,0.967223
6,LogisticRegression,10.0,0,1,"(1, 2)",0.1,clean,0.974839


In [28]:
results.to_csv('logistic_results_clean_text.csv')

In [29]:
# Print the n most negative and n most positive coefficients side by side,
# each with the vocabulary term it belongs to.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back for older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
coefs = lr_tfidf_clf.coef_
# coef_[0] is the single coefficient row used here; sort (coef, term) pairs ascending.
coefs_w_fns = sorted(zip(coefs[0], feature_names))
n=10
# Pair the n smallest coefficients with the n largest (largest first).
top_n_coefs = zip(coefs_w_fns[:n], coefs_w_fns[:-(n+1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1,fn_1,coef_2, fn_2))

	-20.4802	rude           		16.5593	highly recommend
	-13.8742	horrible       		14.1822	love           
	-12.8411	unprofessional 		13.6592	awesome        
	-11.6954	not recommend  		13.2474	wonderful      
	-11.2939	money          		12.2251	happy          
	-10.9641	terrible       		11.9029	thorough       
	-10.8932	bill           		11.8047	excellent      
	-10.7367	not even       		11.4981	take time      
	-10.5480	wait hour      		11.0497	helpful        
	-10.3586	waste          		10.8811	knowledgeable  


In [None]:
ls