In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
# Read the raw Yelp health-review export and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the data).
path = 'storage/yelp_data/health_raw00.csv'
df = pd.read_csv(path).drop('Unnamed: 0', axis=1)

In [5]:
df.head()

Unnamed: 0,business_id,name,categories,cool,date,funny,review_id,stars,text,useful,user_id,len_text
0,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2018-03-03,0,TNNkSmMfshsD3G60jTNjDA,1,Please stay away from this place if you can! I...,2,xv2V2GO5IZYvtw4oW7gQ1w,2002
1,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2015-11-29,0,v-iKdstPdCxJr8zV1ZMdrw,5,My husband has been a patient of Dr. Byrne for...,1,SjvWP7c9toeZoV_q62zhTA,877
2,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2016-06-03,0,BmNDRCV9_NzQ_KCChyfdEw,4,Dr. Byrne is a great doctor! She has great bed...,2,sZVHm1aLtvyH9trAc2_MgA,333
3,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2017-03-17,0,t_TKVMxKFYm9Hl-TIO7UUw,3,I'm raising my review as Dr Bryne's has been m...,3,8Y_irXocZdZxLs_qgzpjBw,649
4,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2016-08-31,0,NWRrpGRgWZBBj3lvCZGVKA,1,I wish I could give 0 stars. Worst office I've...,1,hVKPDGpG12z7vpScXaSakw,1408


In [6]:
# Keep only the star rating and the review text.
# .copy() gives `data` its own storage, so the column assignments made in
# later cells act on an independent frame instead of a selection of `df`
# (the situation pandas reports as SettingWithCopyWarning).
data = df[['stars','text']].copy()

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54756 entries, 0 to 54755
Data columns (total 2 columns):
stars    54756 non-null int64
text     54756 non-null object
dtypes: int64(1), object(1)
memory usage: 855.6+ KB


In [8]:
data.head()

Unnamed: 0,stars,text
0,1,Please stay away from this place if you can! I...
1,5,My husband has been a patient of Dr. Byrne for...
2,4,Dr. Byrne is a great doctor! She has great bed...
3,3,I'm raising my review as Dr Bryne's has been m...
4,1,I wish I could give 0 stars. Worst office I've...


In [9]:
# Keep only the 1-star and 5-star reviews and recode the rating as a binary
# value: 1 star -> 0, 5 stars -> 1.
# `.ix` has been removed from pandas; a boolean mask with `.loc` selects the
# same rows and preserves their original index labels.
# `.copy()` followed by a plain column assignment replaces the chained
# in-place `replace` calls, which operated on a filtered selection.
data = data.loc[data['stars'].isin([1, 5])].copy()
data['stars'] = data['stars'].replace({1: 0, 5: 1})

In [10]:
data.head()

Unnamed: 0,stars,text
0,0,Please stay away from this place if you can! I...
1,1,My husband has been a patient of Dr. Byrne for...
4,0,I wish I could give 0 stars. Worst office I've...
5,0,I went to the emergency room because i was hav...
6,1,Dr. Byrne is an excellent doctor with all the ...


# Build up a pipeline


In [11]:
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn import preprocessing

  from numpy.core.umath_tests import inner1d


# Binarise the category labels


In [12]:
# Label encoder used below to derive the `stars_bin` target column.
lb = preprocessing.LabelBinarizer()

In [13]:
# Fit the binarizer on the recoded ratings and store its encoding of those
# same ratings as a new `stars_bin` column (fit() returns the fitted
# binarizer, so the two steps can be chained).
data['stars_bin'] = lb.fit(data['stars']).transform(data['stars'])

In [14]:
data.head()

Unnamed: 0,stars,text,stars_bin
0,0,Please stay away from this place if you can! I...,0
1,1,My husband has been a patient of Dr. Byrne for...,1
4,0,I wish I could give 0 stars. Worst office I've...,0
5,0,I went to the emergency room because i was hav...,0
6,1,Dr. Byrne is an excellent doctor with all the ...,1


# Test Naive Bayes classifier for our baseline


In [15]:
# The baseline pipeline has three named steps, applied in this order:
#   'vectorise' - CountVectorizer on the raw review text
#   'transform' - TfidfTransformer on the resulting counts
#   'clf'       - MultinomialNB classifier
# The step names are what the grid-search keys refer to (e.g. vectorise__...).
steps = [
    ('vectorise', CountVectorizer()),
    ('transform', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

In [16]:
# Chain the (name, estimator) steps into a single estimator so that
# vectorising, TF-IDF weighting and classification are fitted together.
pipe=Pipeline(steps)

In [17]:
# Hold out 25% of the reviews for evaluation.
# random_state pins the shuffle so the split, and every score computed from
# it, is the same on each Restart & Run All; stratify keeps the 0/1 class
# proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['stars_bin'],
    test_size=0.25, random_state=42, stratify=data['stars_bin'],
)

In [18]:
%%time 
# Fit all three pipeline steps on the training split only.
pipe.fit(X_train,y_train)

CPU times: user 3.45 s, sys: 68 ms, total: 3.52 s
Wall time: 3.51 s


Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
# Predicted 0/1 labels for the held-out reviews.
pred=pipe.predict(X_test)

In [20]:
# The metric computed here is the F1 score, not accuracy, so label it as such.
print('F1 score = %.3f' % f1_score(y_test, pred))

Accuracy = 0.962


In [None]:
ls storage/yelp_data/

# Write out model


In [21]:
import pickle

# Serialise the baseline pipeline to disk. The `with` block closes the file
# handle (flushing the data) even if pickling raises, instead of leaving an
# anonymous open() handle to the garbage collector.
with open('storage/yelp_data/y_model.out', 'wb') as model_file:
    pickle.dump(pipe, model_file)

## Grid search

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [23]:
# Candidate values for CountVectorizer's min_df: ignore terms that appear in
# fewer than 1, 5 or 10 documents.
# NOTE: `param_grid` is rebound by a later cell whose grid has the min_df
# entry commented out, so this grid is not the one that gets searched.
param_grid = dict(vectorise__min_df=[1, 5, 10])

In [24]:
data.stars.value_counts()

1    25502
0    20129
Name: stars, dtype: int64

In [25]:
pipe.named_steps.keys()

dict_keys(['vectorise', 'transform', 'clf'])

In [26]:
%%time
# Hyper-parameters to search; each key is <pipeline step>__<parameter>.
# Disabled options are kept below as comments.
param_grid = {
    'vectorise__stop_words': [None, 'english'],
    'vectorise__binary': [True, False],
    # 'vectorise__min_df': [1, 5, 10],
    # 'clf__class_weight': [None, 'balanced'],
    # 'transform__norm': ['l1', 'l2'],
}

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.78 µs


In [27]:
%%time
# Cross-validated search over `param_grid` for the baseline pipeline,
# scored with F1 and run on two parallel workers.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    n_jobs=2,
)
# Author's timing notes: n_jobs=1 took 10.33 (unit not recorded);
# n_jobs=-1 has not been measured yet.

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 184 µs


In [28]:
%%time 
# Run the search on the full filtered dataset rather than on X_train, so the
# rows that were held out as X_test also take part in this cross-validation.
res=grid_search.fit(data['text'],data['stars_bin'])

CPU times: user 5.36 s, sys: 1.06 s, total: 6.41 s
Wall time: 50.6 s


In [29]:
res.best_params_

{'vectorise__binary': True, 'vectorise__stop_words': None}

In [30]:
res

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'vectorise__stop_words': [None, 'english'], 'vectorise__binary': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

In [31]:
# best_score_ is reported by the search for its best parameter combination;
# the scorer passed to GridSearchCV is F1, so this is an F1 value.
print ('Best score = %.3f' % res.best_score_)

Best score = 0.964


# Compare classifiers


In [32]:
%%time
# Repeat the grid search with three other classifiers in place of Naive Bayes
# and print the best cross-validated F1 and parameters for each.
# SGDClassifier and RandomForestClassifier are randomised, so they are given a
# fixed random_state to make the printed scores repeatable between runs.
# NOTE: the loop rebinds `steps`, `pipe`, `grid_search` and `res`; once it has
# finished they refer to the last (random forest) run, not to the baseline.
for clf in [SGDClassifier(random_state=42),
            LogisticRegression(),
            RandomForestClassifier(random_state=42)]:
    print(clf.__class__)
    steps = [
        ('vectorise', CountVectorizer()),
        ('transform', TfidfTransformer()),
        ('clf', clf),
    ]
    pipe = Pipeline(steps)
    # Tell the vectoriser to ignore decoding errors instead of raising.
    pipe.set_params(vectorise__decode_error='ignore')

    grid_search = GridSearchCV(
        pipe,
        param_grid=param_grid,
        n_jobs=-1,
        scoring=make_scorer(f1_score),
    )

    # As in the baseline search, this is fitted on the full filtered dataset.
    res = grid_search.fit(data['text'], data['stars_bin'])

    print('Best score = %.3f' % res.best_score_)
    print(res.best_params_)
    print('')

<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
Best score = 0.977
{'vectorise__binary': True, 'vectorise__stop_words': None}

<class 'sklearn.linear_model.logistic.LogisticRegression'>
Best score = 0.975
{'vectorise__binary': True, 'vectorise__stop_words': None}

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Best score = 0.917
{'vectorise__binary': False, 'vectorise__stop_words': 'english'}

CPU times: user 25.9 s, sys: 4.26 s, total: 30.2 s
Wall time: 1min 7s
