In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score,precision_score

## Create df for results

In [5]:
# Empty accumulator table: one row is appended per evaluated configuration.
col_names = [
    'model',
    'C_value',
    'CountVectorizer',
    'TfidfVectorizer',
    'ngram_range',
    'max_df',
    'text',
    'score',
]
results = pd.DataFrame(columns=col_names)
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score


# Load data

In [7]:
# Read the review CSV, discard the serialized index column, and keep only
# the star rating plus the raw and cleaned text columns.
path = 'yelp_data/health_text_sentiment.csv'
df = pd.read_csv(path).drop(['Unnamed: 0'], axis=1)
df = df[['stars', 'text', 'clean_text']]

## Select 1-star and 5-star reviews

In [8]:
# Keep only the 1-star and 5-star reviews.
# `DataFrame.ix` was removed in pandas 1.0; a boolean mask through `.loc`
# selects the same rows and works on every pandas version.
data = df.loc[(df.stars == 1) | (df.stars == 5)]
# data.stars.replace(1,0,inplace=True)
# data.stars.replace(5,1,inplace=True)

data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,1,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,5,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


## Split features and target


In [9]:
# split
# Feature arrays (raw text and cleaned text, both coerced to str) and the
# star-rating target.
X, Xc = (data[column].values.astype(str) for column in ('text', 'clean_text'))
y = data['stars'].values

## train_test_split

In [10]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## CountVectorizer

In [11]:
# Token-count features: the vocabulary is learned on the training split and
# then applied unchanged to the test split.
count_vectorizer = CountVectorizer()
count_train = count_vectorizer.fit_transform(raw_documents=X_train)
count_test = count_vectorizer.transform(raw_documents=X_test)

## TfidfVectorizer

In [12]:
# TF-IDF features: fitted on the training split and then applied unchanged
# to the test split.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=X_train)
tfidf_test = tfidf_vectorizer.transform(raw_documents=X_test)

# LinearSVC: CountVectorizer

In [13]:
# Default LinearSVC on the count features, scored by accuracy on the test split.
svc_count_clf = LinearSVC().fit(count_train, y_train)
pred = svc_count_clf.predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.966


In [14]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_count_clf.C,
    CountVectorizer=1,
    TfidfVectorizer=0,
    ngram_range=count_vectorizer.ngram_range,
    max_df=count_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757


## C value

In [None]:
# Fresh score map and the grid of C values to sweep (step 0.1, stop value 1 excluded).
c_results = {}
c_values = np.arange(0.1, 1, 0.1)
c_values

In [None]:
# Sweep C for LinearSVC on the count features, storing test accuracy per value.
for c in c_values:
    # C must be passed explicitly: a bare LinearSVC() refits the default model
    # on every iteration, so all recorded scores would be identical.
    svc_count_clf = LinearSVC(C=c)
    svc_count_clf.fit(count_train, y_train)
    pred = svc_count_clf.predict(count_test)
    score = metrics.accuracy_score(y_test, pred)
    print("C value: {:.2f} Score: {:.5f}".format(c, score))
    c_results[c] = score

In [None]:
# Plot accuracy against C and annotate the best-scoring value.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `y` here would overwrite the label
# array built in the split cell and break any later re-run of train_test_split.
swept_c = list(c_results.keys())
swept_scores = list(c_results.values())
ax.plot(swept_c, swept_scores)

best_score = max(swept_scores)
best_c = swept_c[swept_scores.index(best_score)]  # first C reaching the maximum
# '{:.3f}' (three decimals) replaces the original '{:3f}', which only set a
# minimum field width.
value = 'C value :{}\nval:{:.3f}'.format(best_c, best_score)
ax.annotate(value, xy=(best_c, best_score),
            arrowprops=dict(facecolor='black'),
            xytext=(best_c + .01, best_score)
            )
plt.title('C values')
plt.xlabel('C values')
plt.ylabel('score')
plt.show()

In [15]:
# LinearSVC with C=0.7 on the count features, scored by accuracy on the test split.
svc_count_clf = LinearSVC(C=0.7).fit(count_train, y_train)
pred = svc_count_clf.predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.967


In [16]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_count_clf.C,
    CountVectorizer=1,
    TfidfVectorizer=0,
    ngram_range=count_vectorizer.ngram_range,
    max_df=count_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757
1,LinearSVC,0.7,1,0,"(1, 1)",1.0,original,0.966659


In [None]:
# Per-class precision / recall / F1 for the most recent predictions.
report = classification_report(y_test, pred)
print(report)

In [None]:
# Confusion matrix for the count-feature model.
# Fix the class order and pass it as tick labels so the axes show the real
# star classes (1 and 5) instead of positional 0/1 indices.
labels = [1, 5]
cm = confusion_matrix(y_test, pred, labels=labels)
plt.figure(figsize = (8,4))
sns.heatmap(cm, annot=True, fmt="d", cbar=False,
            xticklabels=labels, yticklabels=labels)
plt.title('Confusion matrix: CountVectorizer')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# LinearSVC: TfidfVectorizer

In [17]:
# Default LinearSVC on the TF-IDF features, scored by accuracy on the test split.
svc_tfidf_clf = LinearSVC().fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.975


In [18]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757
1,LinearSVC,0.7,1,0,"(1, 1)",1.0,original,0.966659
2,LinearSVC,1.0,0,1,"(1, 1)",1.0,original,0.974501


In [None]:
# Fresh score map and the grid of C values to sweep (step 0.1, stop value 1.1 excluded).
c_results = {}
c_values = np.arange(0.1, 1.1, 0.1)
c_values

In [None]:
# Disabled alternative grid with larger C values; nothing in this cell executes.
# c_results = dict()

# c_values = np.arange(1,11,1)
# # c_values = np.around(c_values, decimals=2)
# c_values = [1,2,3,4,5,10,15,20]
# c_values

In [None]:
# Sweep C for LinearSVC on the current TF-IDF features, storing test accuracy per value.
for c in c_values:
    svc_tfidf_clf = LinearSVC(C=c).fit(tfidf_train, y_train)
    pred = svc_tfidf_clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    c_results[c] = score
    print("C value: {:.2f} Score: {:.5f}".format(c, score))

In [None]:
# Plot accuracy against C and annotate the best-scoring value.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `y` here would overwrite the label
# array built in the split cell and break any later re-run of train_test_split.
swept_c = list(c_results.keys())
swept_scores = list(c_results.values())
ax.plot(swept_c, swept_scores)

best_score = max(swept_scores)
best_c = swept_c[swept_scores.index(best_score)]  # first C reaching the maximum
# '{:.3f}' (three decimals) replaces the original '{:3f}', which only set a
# minimum field width.
value = 'C value :{}\nval:{:.3f}'.format(best_c, best_score)
# Offset the text slightly from the annotated point (as the other C plots do);
# with xytext == xy the arrow has zero length and the label covers the peak.
ax.annotate(value, xy=(best_c, best_score),
            arrowprops=dict(facecolor='black'),
            xytext=(best_c + .01, best_score)
            )
plt.title('C values')
plt.xlabel('C values')
plt.ylabel('score')
plt.show()

In [19]:
# LinearSVC with C=0.3 on the TF-IDF features, scored by accuracy on the test split.
svc_tfidf_clf = LinearSVC(C=0.3).fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.976


In [20]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757
1,LinearSVC,0.7,1,0,"(1, 1)",1.0,original,0.966659
2,LinearSVC,1.0,0,1,"(1, 1)",1.0,original,0.974501
3,LinearSVC,0.3,0,1,"(1, 1)",1.0,original,0.975798


## max_df values

In [None]:
# Fresh score map and the max_df grid, rounded to two decimals to strip
# floating-point noise from the arange steps.
dfrq_results = {}
dfrq_range = np.around(np.arange(0.1, 1.1, 0.1), decimals=2)
dfrq_range

In [None]:
# Sweep max_df for the TF-IDF vectorizer with the classifier fixed at C=0.3.
# Each iteration rebinds the shared tfidf_* globals, so after the loop they
# hold the features built with the last max_df in the grid.
for dfq in dfrq_range:
    tfidf_vectorizer = TfidfVectorizer(max_df=dfq)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    svc_tfidf_clf = LinearSVC(C=0.3).fit(tfidf_train, y_train)
    pred = svc_tfidf_clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    dfrq_results[dfq] = score
    print("max_df: {:.2f} Score: {:.5f}".format(dfq, score))

In [None]:
# Plot accuracy against max_df and annotate the best-scoring value.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `y` here would overwrite the label
# array built in the split cell and break any later re-run of train_test_split.
swept_max_df = list(dfrq_results.keys())
swept_scores = list(dfrq_results.values())
ax.plot(swept_max_df, swept_scores)

best_score = max(swept_scores)
best_max_df = swept_max_df[swept_scores.index(best_score)]  # first max_df reaching the maximum
# '{:.3f}' (three decimals) replaces the original '{:3f}', which only set a
# minimum field width.
value = 'max_df :{}\nval:{:.3f}'.format(best_max_df, best_score)
ax.annotate(value, xy=(best_max_df, best_score),
            arrowprops=dict(facecolor='black'),
           # xytext=(xmax+.01,ymax)
            )
plt.title('max_df')
plt.xlabel('max_df values')
plt.ylabel('score')
plt.show()

## N-grams

In [None]:
# Fresh score map and the candidate (min_n, max_n) n-gram ranges.
ngram_results = {}
ngram_ranges = (
    (1, 2),
    (1, 3),
    (1, 4),
    (2, 3),
    (2, 4),
)
ngram_ranges

In [None]:
# Unigram baseline for the n-gram comparison.
# Rebuild the default TF-IDF features here instead of relying on whatever
# tfidf_train / tfidf_test the preceding max_df sweep happened to leave behind.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

svc_tfidf_clf = LinearSVC()
svc_tfidf_clf.fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

In [None]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

In [None]:

# Disabled sweep (nothing in this cell executes): for each entry of
# ngram_ranges it would rebuild the TF-IDF features, fit a default LinearSVC,
# and store the test accuracy in ngram_results.
# for ngram_range in ngram_ranges:
#     tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)
#     tfidf_train = tfidf_vectorizer.fit_transform(X_train)
#     tfidf_test = tfidf_vectorizer.transform(X_test)
    
#     svc_tfidf_clf = LinearSVC()
#     svc_tfidf_clf.fit(tfidf_train, y_train)
#     pred = svc_tfidf_clf.predict(tfidf_test)
#     score = metrics.accuracy_score(y_test, pred)

#     print("ngram_range: {} Score: {:.5f}".format(ngram_range, score))
#     ngram_results[ngram_range] = score

In [21]:
# TF-IDF over unigrams and bigrams, then a default LinearSVC scored on the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

svc_tfidf_clf = LinearSVC().fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.981


In [22]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757
1,LinearSVC,0.7,1,0,"(1, 1)",1.0,original,0.966659
2,LinearSVC,1.0,0,1,"(1, 1)",1.0,original,0.974501
3,LinearSVC,0.3,0,1,"(1, 1)",1.0,original,0.975798
4,LinearSVC,1.0,0,1,"(1, 2)",1.0,original,0.981158


In [None]:
# Fresh score map and the max_df grid, rounded to two decimals to strip
# floating-point noise from the arange steps.
dfrq_results = {}
dfrq_range = np.around(np.arange(0.1, 1.1, 0.1), decimals=2)
dfrq_range

In [None]:
# Sweep max_df for the unigram+bigram TF-IDF vectorizer with a default LinearSVC.
# Each iteration rebinds the shared tfidf_* globals, so after the loop they
# hold the features built with the last max_df in the grid.
for dfq in dfrq_range:
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=dfq)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    svc_tfidf_clf = LinearSVC().fit(tfidf_train, y_train)
    pred = svc_tfidf_clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    dfrq_results[dfq] = score
    print("max_df: {:.2f} Score: {:.5f}".format(dfq, score))

In [None]:
# Plot accuracy against max_df and annotate the best-scoring value.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `y` here would overwrite the label
# array built in the split cell and break any later re-run of train_test_split.
swept_max_df = list(dfrq_results.keys())
swept_scores = list(dfrq_results.values())
ax.plot(swept_max_df, swept_scores)

best_score = max(swept_scores)
best_max_df = swept_max_df[swept_scores.index(best_score)]  # first max_df reaching the maximum
# '{:.3f}' (three decimals) replaces the original '{:3f}', which only set a
# minimum field width.
value = 'max_df :{}\nval:{:.3f}'.format(best_max_df, best_score)
ax.annotate(value, xy=(best_max_df, best_score),
            arrowprops=dict(facecolor='black'),
           # xytext=(xmax+.01,ymax)
            )
plt.title('max_df')
plt.xlabel('max_df values')
plt.ylabel('score')
plt.show()

In [23]:
# TF-IDF over unigrams and bigrams with max_df=0.1, then a default LinearSVC
# scored on the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.1)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

svc_tfidf_clf = LinearSVC().fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.983


In [24]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757
1,LinearSVC,0.7,1,0,"(1, 1)",1.0,original,0.966659
2,LinearSVC,1.0,0,1,"(1, 1)",1.0,original,0.974501
3,LinearSVC,0.3,0,1,"(1, 1)",1.0,original,0.975798
4,LinearSVC,1.0,0,1,"(1, 2)",1.0,original,0.981158
5,LinearSVC,1.0,0,1,"(1, 2)",0.1,original,0.983358


In [None]:
# Fresh score map and the grid of C values to sweep (step 0.1, stop value 1.1 excluded).
c_results = {}
c_values = np.arange(0.1, 1.1, 0.1)
c_values

In [None]:
# Coarser grid of larger C values. This rebinds c_values only; c_results is
# not reset in this cell.
c_values = [*range(1, 6), 10, 15, 20]
c_values

In [None]:
# Sweep C for LinearSVC on the unigram+bigram TF-IDF features (max_df=0.1).
# The features do not depend on C, so vectorize once up front instead of
# refitting the vectorizer on every iteration.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),max_df=0.1)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

for c in c_values:
    svc_tfidf_clf = LinearSVC(C=c)
    svc_tfidf_clf.fit(tfidf_train, y_train)
    pred = svc_tfidf_clf.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    print("C value: {:.2f} Score: {:.5f}".format(c, score))
    c_results[c] = score

In [None]:
# Plot accuracy against C and annotate the best-scoring value.
fig, ax = plt.subplots()

# Dedicated names on purpose: assigning to `y` here would overwrite the label
# array built in the split cell and break any later re-run of train_test_split.
swept_c = list(c_results.keys())
swept_scores = list(c_results.values())
ax.plot(swept_c, swept_scores)

best_score = max(swept_scores)
best_c = swept_c[swept_scores.index(best_score)]  # first C reaching the maximum
# '{:.3f}' (three decimals) replaces the original '{:3f}', which only set a
# minimum field width.
value = 'C value :{}\nval:{:.3f}'.format(best_c, best_score)
ax.annotate(value, xy=(best_c, best_score),
            arrowprops=dict(facecolor='black'),
            xytext=(best_c + .01, best_score)
            )
plt.title('C values')
plt.xlabel('C values')
plt.ylabel('score')
plt.show()

## Best Results

In [25]:
# Final configuration: unigram+bigram TF-IDF with max_df=0.1 and a default
# LinearSVC, scored on the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.1)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

svc_tfidf_clf = LinearSVC().fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.983


In [26]:
# Append the configuration and accuracy of the run above to the results table.
my_dic = dict(
    model='LinearSVC',
    C_value=svc_tfidf_clf.C,
    CountVectorizer=0,
    TfidfVectorizer=1,
    ngram_range=tfidf_vectorizer.ngram_range,
    max_df=tfidf_vectorizer.max_df,
    text='original',
    score=score,
)
results.loc[len(results)] = my_dic
results

Unnamed: 0,model,C_value,CountVectorizer,TfidfVectorizer,ngram_range,max_df,text,score
0,LinearSVC,1.0,1,0,"(1, 1)",1.0,original,0.965757
1,LinearSVC,0.7,1,0,"(1, 1)",1.0,original,0.966659
2,LinearSVC,1.0,0,1,"(1, 1)",1.0,original,0.974501
3,LinearSVC,0.3,0,1,"(1, 1)",1.0,original,0.975798
4,LinearSVC,1.0,0,1,"(1, 2)",1.0,original,0.981158
5,LinearSVC,1.0,0,1,"(1, 2)",0.1,original,0.983358
6,LinearSVC,1.0,0,1,"(1, 2)",0.1,original,0.983358


In [27]:
# Persist the comparison table. The row index is written too, so it comes
# back as an 'Unnamed: 0' column when the file is read with pd.read_csv.
results.to_csv(path_or_buf='svc_results_original_text.csv')

In [28]:
# IPython automagic for the shell `ls` command: lists the working directory
# (used here to check that the CSV above was written).
ls

[0m[01;32mcompare_models-clean_text.ipynb[0m*           [01;32mMultinomialNB_BEST_clean_text.ipynb[0m*
[01;32mcompare_models.ipynb[0m*                      [01;32mMultinomialNB_BEST.ipynb[0m*
[01;32mLogisticRegression_BEST-clean_text.ipynb[0m*  [01;32mSVC_BEST_clean.ipynb[0m*
[01;32mLogisticRegression_BEST.ipynb[0m*             [01;32mSVC_BEST.ipynb[0m*
[01;32mlogistic_results.csv[0m*                      [01;32mSVC_BEST-text_clean.ipynb[0m*
[01;32mlogistic_results_original_text.csv[0m*        [01;32msvc_results_original_text.csv[0m*
[01;32mMNB_results_original_text.csv[0m*             [01;36;40myelp_data[0m/


In [None]:
# Per-class precision / recall / F1 for the most recent predictions.
report = classification_report(y_test, pred)
print(report)

In [None]:
# Confusion matrix for the latest predictions, with rows and columns in the
# fixed class order 1-star, 5-star.
labels = [1, 5]
cm = confusion_matrix(y_test, pred, labels=labels)
plt.figure(figsize=(10, 7))
g = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cbar=False,
    cmap="Greens",
    xticklabels=True,
    yticklabels=True,
)
# Replace the positional tick text with the class labels, kept horizontal.
g.set_xticklabels(labels, rotation=0)
g.set_yticklabels(labels, rotation=0)
plt.title('Confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## Display coefficients


In [None]:
# Vocabulary terms in feature-column order, plus the column indices sorted by
# ascending LinearSVC coefficient.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = np.array(tfidf_vectorizer.get_feature_names())
sorted_coef_index = svc_tfidf_clf.coef_[0].argsort()

In [None]:
# The ten terms with the largest coefficients, printed in ascending coefficient order.
print('Highest 10  Coef:')
print('----------------')
top_terms = feature_names[sorted_coef_index[-10:]]
for i in top_terms:
    print('- {}'.format(i))

In [None]:
# The ten terms with the smallest coefficients, printed in ascending coefficient order.
print('Lowest 10  Coef:')
print('----------------')
bottom_terms = feature_names[sorted_coef_index[:10]]
for i in bottom_terms:
    print('- {}'.format(i))

In [None]:
#https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d
import matplotlib.pyplot as plt
def plot_coefficients(classifier, feature_names, top_features=20):
    """Bar-plot the most negative and most positive coefficients of a linear model.

    Parameters
    ----------
    classifier : fitted estimator exposing ``coef_``; the array is flattened
        with ``ravel()`` before ranking.
    feature_names : sequence of feature labels, indexable by coefficient position.
    top_features : int, default 20
        Number of coefficients shown from each end of the ranking. Values
        larger than the number of coefficients are clamped to it.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    coef = classifier.coef_.ravel()
    feature_names = np.asarray(feature_names)

    # Clamp so the number of bars always matches the number of selected indices.
    top_features = max(0, min(int(top_features), coef.size))
    order = np.argsort(coef)
    top_negative_coefficients = order[:top_features]
    # Slice from an explicit start index: order[-0:] would return every element.
    top_positive_coefficients = order[order.size - top_features:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

    # create plot
    positions = np.arange(top_coefficients.size)
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    # Ticks share the bar positions; the previous 1-based ticks labelled each
    # bar with the name of its left-hand neighbour.
    plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
    #plt.title('asdf')
    plt.show()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    plot_coefficients(svc_tfidf_clf, tfidf_vectorizer.get_feature_names_out())
else:
    plot_coefficients(svc_tfidf_clf, tfidf_vectorizer.get_feature_names())