In [1]:
import warnings
# Suppresses every warning category for the rest of the session; this also hides
# deprecation notices raised by the libraries used further down.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
# Relative location of the review CSV that the next cell reads.
path = 'yelp_data/health_text_sentiment.csv'

In [4]:
# Read the reviews, discard the saved index column, then keep only the star
# rating and the review body.
df = pd.read_csv(path).drop(['Unnamed: 0'], axis=1)
df = df[['stars', 'text']]

In [5]:
# Preview the first rows of the two retained columns.
df.head()

Unnamed: 0,stars,text
0,1,Please stay away from this place if you can! I...
1,5,My husband has been a patient of Dr. Byrne for...
2,4,Dr. Byrne is a great doctor! She has great bed...
3,3,I'm raising my review as Dr Bryne's has been m...
4,1,I wish I could give 0 stars. Worst office I've...


## Split: keep only 1- and 5-star reviews

In [7]:
# Keep only the 1-star and 5-star reviews for the two-class experiment.
# `.ix` was removed in pandas 1.0; a boolean mask with `.loc` selects the same
# rows and keeps their original index labels. `.copy()` detaches the subset
# from `df` so later edits to `data` never write through to (or warn about)
# the parent frame.
data = df.loc[df['stars'].isin([1, 5])].copy()
# data.stars.replace(1,0,inplace=True)
# data.stars.replace(5,1,inplace=True)

data.head()

Unnamed: 0,stars,text
0,1,Please stay away from this place if you can! I...
1,5,My husband has been a patient of Dr. Byrne for...
4,1,I wish I could give 0 stars. Worst office I've...
5,1,I went to the emergency room because i was hav...
6,5,Dr. Byrne is an excellent doctor with all the ...


# linear SVC

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score,precision_score

In [9]:
# split
# Pull the review texts and their star labels out of the filtered frame as NumPy arrays.
# astype(str) coerces every entry to a string — presumably so non-string cells
# cannot break the vectorizer; TODO confirm.
review = data['text'].values.astype(str)
sentiments = data['stars'].values

In [23]:
## function
def nlp_linearSVC_tf(X,y):
    """Train a LinearSVC on TF-IDF features of ``X`` and score a held-out split.

    Parameters
    ----------
    X : iterable of str
        Raw documents to vectorize.
    y : array-like
        Labels aligned with ``X``.

    Returns
    -------
    classifier
        The fitted ``LinearSVC``.
    y_test
        True labels of the 33% held-out split.
    preds
        Predictions for the held-out split.
    feature_names : numpy.ndarray
        Vocabulary terms (unigrams and bigrams), indexed like the model's columns.
    sorted_coef_index : numpy.ndarray
        Feature indices ordered by ascending ``classifier.coef_[0]``.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,2),max_df=0.5,stop_words='english')
    # Vectorize the documents that were passed in, not the notebook-level
    # `review` variable, so the function honours its own argument.
    # NOTE(review): the vectorizer is fitted before the split, so vocabulary and
    # IDF weights are learned from the held-out documents as well.
    vectors = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(vectors, y,
                                                        test_size=0.33,
                                                        random_state=42)
    # Seed the solver so repeated runs give the same coefficients and scores.
    classifier = LinearSVC(random_state=42)
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())
    # Row 0 is the whole model for two classes; with more classes it only
    # ranks the weights of the first class.
    sorted_coef_index = classifier.coef_[0].argsort()

    return classifier,y_test,preds,feature_names,sorted_coef_index

In [17]:
## function
def nlp_linearSVC_cv(X,y):
    """Train a LinearSVC on raw term counts of ``X`` and score a held-out split.

    Parameters
    ----------
    X : iterable of str
        Raw documents to vectorize.
    y : array-like
        Labels aligned with ``X``.

    Returns
    -------
    classifier
        The fitted ``LinearSVC``.
    y_test
        True labels of the 33% held-out split.
    preds
        Predictions for the held-out split.
    feature_names : numpy.ndarray
        Vocabulary terms (unigrams and bigrams), indexed like the model's columns.
    sorted_coef_index : numpy.ndarray
        Feature indices ordered by ascending ``classifier.coef_[0]``.
    """
    vectorizer = CountVectorizer(ngram_range=(1,2))
    # Vectorize the documents that were passed in, not the notebook-level
    # `review` variable, so the function honours its own argument.
    # NOTE(review): the vocabulary is built before the split, so it also
    # contains terms that occur only in held-out documents.
    vectors = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(vectors, y,
                                                        test_size=0.33,
                                                        random_state=42)
    # Seed the solver so repeated runs give the same coefficients and scores.
    classifier = LinearSVC(random_state=42)
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())
    # Row 0 is the whole model for two classes; with more classes it only
    # ranks the weights of the first class.
    sorted_coef_index = classifier.coef_[0].argsort()

    return classifier,y_test,preds,feature_names,sorted_coef_index

In [24]:
%%time
# Fit and score the CountVectorizer variant, then print the held-out accuracy
# followed by the per-class precision/recall report.
clf,y_test,preds,feature_names,sorted_coef = nlp_linearSVC_cv(review,sentiments)
print('CountVectorizer Accuracy:{}'.format(accuracy_score(y_test, preds)))
print('------------------')
print(classification_report(y_test, preds))
print('------------------')

CountVectorizer Accuracy:0.9757982624393546
------------------
              precision    recall  f1-score   support

           1       0.97      0.97      0.97      7241
           5       0.98      0.98      0.98     10485

   micro avg       0.98      0.98      0.98     17726
   macro avg       0.97      0.97      0.97     17726
weighted avg       0.98      0.98      0.98     17726

------------------
CPU times: user 41.2 s, sys: 3.09 s, total: 44.3 s
Wall time: 42 s


## tfidf

In [25]:
%%time
# Fit and score the TF-IDF variant; this overwrites clf/y_test/preds/feature_names/
# sorted_coef from the CountVectorizer run above.
clf,y_test,preds,feature_names,sorted_coef = nlp_linearSVC_tf(review,sentiments)
print('TfidfVectorizer Accuracy: {}'.format(accuracy_score(y_test, preds)))
print('------------------')
print(classification_report(y_test, preds))
print('------------------')

TfidfVectorizer Accuracy: 0.9738801760126368
------------------
              precision    recall  f1-score   support

           1       0.97      0.97      0.97      7241
           5       0.98      0.98      0.98     10485

   micro avg       0.97      0.97      0.97     17726
   macro avg       0.97      0.97      0.97     17726
weighted avg       0.97      0.97      0.97     17726

------------------
CPU times: user 30.5 s, sys: 2.97 s, total: 33.5 s
Wall time: 30.1 s


In [19]:
%%time
# Same call as the TF-IDF cell directly above (only the print label spacing differs).
# NOTE(review): the lower execution count suggests this is an earlier run kept for comparison — confirm.
clf,y_test,preds,feature_names,sorted_coef = nlp_linearSVC_tf(review,sentiments)
print('TfidfVectorizer Accuracy:{}'.format(accuracy_score(y_test, preds)))
print('------------------')
print(classification_report(y_test, preds))
print('------------------')

TfidfVectorizer Accuracy:0.981552521719508
------------------
              precision    recall  f1-score   support

           1       0.97      0.98      0.98      7241
           5       0.99      0.98      0.98     10485

   micro avg       0.98      0.98      0.98     17726
   macro avg       0.98      0.98      0.98     17726
weighted avg       0.98      0.98      0.98     17726

------------------
CPU times: user 36.5 s, sys: 3.69 s, total: 40.2 s
Wall time: 36.5 s


## evaluate

In [12]:
# Fraction of held-out reviews predicted correctly, for whichever run last set y_test/preds.
print(accuracy_score(y_test, preds))

0.9794087780661176


In [13]:
# Per-class precision, recall, F1 and support for the same predictions.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           1       0.97      0.98      0.97      7241
           5       0.98      0.98      0.98     10485

   micro avg       0.98      0.98      0.98     17726
   macro avg       0.98      0.98      0.98     17726
weighted avg       0.98      0.98      0.98     17726



In [None]:
# Confusion matrix for the two star classes, drawn as an annotated heatmap.
labels = [1, 5]
cm = confusion_matrix(y_test, preds, labels=labels)

fig, ax = plt.subplots(figsize=(10, 7))
g = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cbar=False,
    cmap="Greens",
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
# Replace the positional tick text with the star values, kept horizontal.
g.set_xticklabels(labels, rotation=0)
g.set_yticklabels(labels, rotation=0)
ax.set_title('Confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Held-out accuracy for the current y_test/preds pair.
print(accuracy_score(y_test, preds))

In [None]:
# Per-class precision, recall, F1 and support for the current y_test/preds pair.
print(classification_report(y_test, preds))

## display coefficients

In [None]:
# Order the vocabulary by coefficient once, then show both ends of the ranking.
ranked_features = feature_names[sorted_coef]
print('Lowest 10  Coef: \n{}\n'.format(ranked_features[:10]))
print('Highest 10 Coef: \n{}\n'.format(ranked_features[-10:]))

## plot

In [None]:
#https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d
import matplotlib.pyplot as plt
def plot_coefficients(classifier, feature_names, top_features=20,title='title'):
    """Bar-plot the most negative and most positive coefficients of a linear model.

    Parameters
    ----------
    classifier
        Fitted estimator exposing ``coef_``. The array is flattened, so the
        feature labels only line up when ``coef_`` holds a single row
        (two-class model).
    feature_names : sequence of str
        Feature names indexed like the columns of ``coef_``.
    top_features : int, default 20
        How many coefficients to show from each end of the ranking.
    title : str, default 'title'
        Figure title.
    """
    coef = classifier.coef_.ravel()
    # Sort once (ascending) and slice both ends instead of sorting twice.
    order = np.argsort(coef)
    top_negative_coefficients = order[:top_features]
    top_positive_coefficients = order[-top_features:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # One x slot per plotted coefficient; the bars and their tick labels share
    # these positions so every label sits under its own bar.
    positions = np.arange(len(top_coefficients))
    # create plot
    plt.figure(figsize=(10, 4))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
    plt.title(title)
    plt.show()

In [None]:
# Plot the strongest coefficients of the most recently fitted model (`clf`).
plot_coefficients(clf,feature_names,title='coeff')

## All stars

In [None]:
# Back to the unfiltered frame, which still holds every star rating.
df.head()

In [None]:
# Rebind the notebook-level `review` / `sentiments` names to the full frame,
# so the following cells work on every star rating rather than only 1 and 5.
review = df['text'].values.astype(str)
sentiments = df['stars'].values

In [None]:
%%time
clf,y_test,preds,feature_names,sorted_coef = nlp_linearSVC(review,sentiments)

## Evaluate

In [None]:
# Held-out accuracy for the all-star run.
print(accuracy_score(y_test, preds))

In [None]:
# Per-class precision, recall, F1 and support for the all-star run.
print(classification_report(y_test, preds))

In [None]:
# Confusion matrix across all five star classes, drawn as an annotated heatmap.
labels = [1, 2, 3, 4, 5]
cm = confusion_matrix(y_test, preds, labels=labels)

fig, ax = plt.subplots(figsize=(10, 7))
g = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cbar=False,
    cmap="Greens",
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
# Replace the positional tick text with the star values, kept horizontal.
g.set_xticklabels(labels, rotation=0)
g.set_yticklabels(labels, rotation=0)
ax.set_title('Confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

## Vectorize reviews


In [None]:
# split
# Rebind `review` / `sentiments` to the 1-and-5-star subset held in `data`.
review = data['text'].values.astype(str)
sentiments = data['stars'].values

In [None]:
# Echo the label array as a quick sanity check.
sentiments

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a unigram+bigram TF-IDF vocabulary from `review` and vectorize it in one pass.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectors = vectorizer.fit_transform(review)

In [None]:
from sklearn.model_selection import train_test_split
# attention: we now use vectorize reviews, not the reviews column!!
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(vectors, sentiments,
 test_size=0.33, random_state=42)

In [None]:
from sklearn.svm import LinearSVC

# Build a default linear SVM and fit it on the training split in one step;
# fit() returns the estimator itself, and the bare name below keeps the
# cell echoing the fitted model.
classifier = LinearSVC().fit(X_train, y_train)
classifier

In [None]:
# Predict a star label for every held-out review.
preds = classifier.predict(X_test)

## Evaluate



In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out reviews whose predicted label equals the true one.
print(accuracy_score(y_test, preds))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test, preds))

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Resolve the class labels explicitly so the matrix order is known and the
# axes show the star values instead of 0..n-1 positions (as the earlier
# confusion-matrix cells do).
labels = np.unique(np.concatenate([y_test, preds])).tolist()
cm = confusion_matrix(y_test, preds, labels=labels)
plt.figure(figsize = (10,7))
g = sns.heatmap(cm, annot=True, fmt="d", cbar=False, cmap="Greens",xticklabels=True,yticklabels=True)
g.set_yticklabels(labels, rotation=0)
g.set_xticklabels(labels, rotation=0)
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

## Display Coefficients

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = np.array(_get_names())
# Feature indices from the most negative to the most positive weight.
sorted_coef_index = classifier.coef_[0].argsort()

print('Lowest 10  Coef: \n{}\n'.format(feature_names[sorted_coef_index][:10]))
print('Highest 10 Coef: \n{}\n'.format(feature_names[sorted_coef_index][-10:]))

# plot

In [None]:
#https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d
import matplotlib.pyplot as plt
def plot_coefficients(classifier, feature_names, top_features=20, title=None):
    """Bar-plot the most negative and most positive coefficients of a linear model.

    This redefinition replaces the earlier ``plot_coefficients`` in the kernel,
    so it accepts the same optional ``title`` keyword to keep earlier calls
    working after it has run.

    Parameters
    ----------
    classifier
        Fitted estimator exposing ``coef_``. The array is flattened, so the
        feature labels only line up when ``coef_`` holds a single row
        (two-class model).
    feature_names : sequence of str
        Feature names indexed like the columns of ``coef_``.
    top_features : int, default 20
        How many coefficients to show from each end of the ranking.
    title : str or None, default None
        Figure title; omitted when None.
    """
    coef = classifier.coef_.ravel()
    # Sort once (ascending) and slice both ends instead of sorting twice.
    order = np.argsort(coef)
    top_negative_coefficients = order[:top_features]
    top_positive_coefficients = order[-top_features:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # One x slot per plotted coefficient; the bars and their tick labels share
    # these positions so every label sits under its own bar.
    positions = np.arange(len(top_coefficients))
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
    if title is not None:
        plt.title(title)
    plt.show()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
plot_coefficients(classifier, _get_names())

## Try all star ratings (1–5)

In [None]:
# Independent copy of the full (all-star) frame for this section.
data2 = df.copy()

In [None]:
%%time
# Rebind `review` / `sentiments` to every star rating, taken from the full-frame copy.
review = data2['text'].values.astype(str)
sentiments = data2['stars'].values

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a unigram+bigram TF-IDF vocabulary from all reviews and vectorize them in one pass.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectors = vectorizer.fit_transform(review)

In [None]:
from sklearn.model_selection import train_test_split
# attention: we now use vectorize reviews, not the reviews column!!
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(vectors, sentiments,
 test_size=0.33, random_state=42)

In [None]:
from sklearn.svm import LinearSVC

# Build a default linear SVM and fit it on the training split in one step;
# fit() returns the estimator itself, and the bare name below keeps the
# cell echoing the fitted model.
classifier = LinearSVC().fit(X_train, y_train)
classifier

In [None]:
# Predict a star label for every held-out review.
preds = classifier.predict(X_test)

# Evaluate

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out reviews whose predicted label equals the true one.
print(accuracy_score(y_test, preds))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test, preds))

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Resolve the class labels explicitly so the matrix order is known and the
# axes show the star values instead of 0..n-1 positions (as the earlier
# confusion-matrix cells do).
labels = np.unique(np.concatenate([y_test, preds])).tolist()
cm = confusion_matrix(y_test, preds, labels=labels)
plt.figure(figsize = (10,7))
g = sns.heatmap(cm, annot=True, fmt="d", cbar=False, cmap="Greens",xticklabels=True,yticklabels=True)
g.set_yticklabels(labels, rotation=0)
g.set_xticklabels(labels, rotation=0)
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

## Results: original review text with LinearSVC

### 1 & 5 stars
- accuracy score: 0.9813268644928353


### 1 - 5 stars
- accuracy score: 0.8323548906353565