In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#! pip install nltk
import nltk
nltk.download('stopwords')

In [2]:
import pandas as pd
import numpy as np

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

In [3]:
ls yelp_data/

[0m[01;32mhealth_business00.csv[0m*  [01;32mhealth_text_sentiment.csv[0m*
[01;32mhealth_business01.csv[0m*  [01;32myelp_academic_dataset_business.json[0m*
[01;32mhealth_raw00.csv[0m*       [01;32myelp_academic_dataset_review.json[0m*
[01;32mhealth_text.csv[0m*


In [4]:
path = 'yelp_data/health_text_sentiment.csv'

In [5]:
df = pd.read_csv(path)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 7 columns):
stars               64006 non-null int64
text                64006 non-null object
sent_value          64006 non-null float64
sent_score          64006 non-null object
clean_text          64006 non-null object
sent_value_clean    64006 non-null float64
sent_score_clean    64006 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 3.4+ MB


In [8]:
df.head()

Unnamed: 0,stars,text,sent_value,sent_score,clean_text,sent_value_clean,sent_score_clean
0,1,Please stay away from this place if you can! I...,-0.050645,negative,please stay away place bad care imaginable sta...,-0.036719,negative
1,5,My husband has been a patient of Dr. Byrne for...,0.024962,positive,husband patient dr byrne last year half last m...,0.069479,positive
2,4,Dr. Byrne is a great doctor! She has great bed...,0.513333,positive,dr byrne great doctor great bed side manner ex...,0.317778,positive
3,3,I'm raising my review as Dr Bryne's has been m...,-0.035714,negative,raise review dr bryne receptive daughter go an...,0.002806,positive
4,1,I wish I could give 0 stars. Worst office I've...,-0.048246,negative,wish could give star bad office ever horrible ...,-0.084259,negative


## split review stars and text

In [9]:
data = df[['stars','text','clean_text']]

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 3 columns):
stars         64006 non-null int64
text          64006 non-null object
clean_text    64006 non-null object
dtypes: int64(1), object(2)
memory usage: 1.5+ MB


In [11]:
data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
2,4,Dr. Byrne is a great doctor! She has great bed...,dr byrne great doctor great bed side manner ex...
3,3,I'm raising my review as Dr Bryne's has been m...,raise review dr bryne receptive daughter go an...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 3 columns):
stars         64006 non-null int64
text          64006 non-null object
clean_text    64006 non-null object
dtypes: int64(1), object(2)
memory usage: 1.5+ MB


## Select only 1 & 5 stars

In [13]:
data = data.ix[np.where((data.stars==1)|(data.stars==5))]
data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,1,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,5,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


In [14]:
data.stars.replace(1,0,inplace=True)
data.stars.replace(5,1,inplace=True)

In [15]:
data.head()

Unnamed: 0,stars,text,clean_text
0,0,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,1,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,0,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,0,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,1,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


In [None]:
### stars

In [None]:
data.stars.value_counts()

### Split positive and negtative
- total positive 31,186
- total negative : 22451
- total meh : 10,853

drop 2-4

In [None]:
data.info()

In [None]:
data.stars.value_counts()

In [None]:
data.head()

## Demo test

In [None]:
review = data['text'].values.astype(str)
sentiments = data['stars'].values

In [None]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [None]:
%%time
pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
                     ('chi',  SelectKBest(chi2, k=10000)),
                     ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))])

In [None]:
%%time
model = pipeline.fit(X_train, y_train)

In [None]:
%%time
vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

In [None]:
%%time
feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

In [None]:
target_names =['']

In [None]:
print("top 10 keywords per class:")

for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (label, " ".join(feature_names[top10])))

In [None]:
%%time
print("accuracy score: " + str(model.score(X_test, y_test)))

## Multinomial NB
-

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [20]:
train_test_split?

In [18]:
review = data['text'].values.astype(str)
sentiments = data['stars'].values

In [21]:
X_train, X_test, y_train, y_test = train_test_split(review, sentiments, test_size=0.33, random_state=42)

In [22]:
vectorizer = CountVectorizer(lowercase=True)

vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [23]:
# Transform the text list to a matrix form
X_train_vectorized = vectorizer.transform(X_train)

In [24]:
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(X_test)

# Check our classifier performance
score = classifier.score(X_test_vectorized, y_test)

print("Accuracy=", score)

Accuracy= 0.9568430553988492


In [27]:
from sklearn.metrics import classification_report
print(classification_report(X_test_vectorized, y_test))

TypeError: len() of unsized object

# Results: out of box

- all stars: `Accuracy= 0.8001240502403474`
- 1 & 5 starts: `Accuracy= 0.9561894108873975`

In [None]:
X_train[0]

In [None]:
vectorizer.get_feature_names()

## Diffrent classifier: Logistic

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
len(review)

In [None]:
len(sentiments)

In [None]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [None]:
vectorizer = CountVectorizer(lowercase=True)
# Compute the vocabulary only on the training data
vectorizer.fit(X_train)

# Transform the text list to a matrix form
X_train_vectorized = vectorizer.transform(X_train)

classifier = LogisticRegression()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(X_test)

# Check our classifier performance
score = classifier.score(X_test_vectorized, y_test)
print("Accuracy=", score)

## Logistic results
`Accuracy= 0.9737136465324385
`

## Use Ngrams Instead of Words


In [None]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [None]:
%%time
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1,3))
# Compute the vocabulary only on the training data
vectorizer.fit(X_train)

# Transform the text list to a matrix form
X_train_vectorized = vectorizer.transform(X_train)

classifier = LogisticRegression()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(X_test)

# Check our classifier performance
score = classifier.score(X_test_vectorized, y_test)
print("Accuracy=", score)

### Results
- `Accuracy= 0.9793064876957495
`

## pipeline
- Result `Accuracy= 0.9777218493661447
`

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [None]:
%%time
# Put everything in a Pipeline
pipeline = Pipeline([
    ('vectorizer', 
     CountVectorizer(lowercase=True,ngram_range=(1, 3)))
    ,('classifier', LogisticRegression())])

pipeline.fit(X_train, y_train)
# Check our classifier performance

score = pipeline.score(X_test, y_test)

print("Accuracy=", score)

## Cross validation score

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

In [None]:
%%time
# Put everything in a Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(
        lowercase=True,
        ngram_range=(1, 3))),
    ('classifier', LogisticRegression())])

In [None]:
review, sentiments = shuffle(review, sentiments)

In [None]:
%%time
print("MeanAccuracy=", cross_val_score(pipeline, review, sentiments, cv=5).mean())

## Result:
```
MeanAccuracy= 0.9773477193058582
CPU times: user 10min 21s, sys: 44.1 s, total: 11min 5s
Wall time: 8min 45s

```

## Grid search

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# shuffle data
review, sentiments = shuffle(review, sentiments)

In [None]:
%%time
# Put everything in a Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(
        lowercase=True,
        ngram_range=(1, 3))),
    ('classifier', LogisticRegression())])


classifier = GridSearchCV(pipeline, {
    # try out different ngram ranges
    'vectorizer__ngram_range': ((1, 2), (2, 3), (1, 3)),
    # check if setting all non zero counts to 1 makes a difference
    'vectorizer__binary': (True, False),},
                          n_jobs=-1, 
                          verbose=True,
                          error_score=0.0, 
                          cv=5)

In [None]:
%%time
# Compute the vocabulary and train the classifier
classifier.fit(review, sentiments)

In [None]:
print("Best Accuracy: ", classifier.best_score_)
print("Best Parameters: ", classifier.best_params_)