# Libraries

In [137]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text
import sklearn.cross_validation
import sklearn.naive_bayes
import sklearn.tree
import sklearn.ensemble
import sklearn.metrics
import nltk
import re

# Read data

In [12]:
work_dir='/home/ggomarr/Documents/Education/20170623 Udemy - Machine Learning A-Z: Hands-On Python and R in Data Science/' + \
         'Course data/Part 7 - Natural Language Processing/Section 36 - Natural Language Processing/'
df=pd.read_csv(work_dir+'Restaurant_Reviews.tsv',sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [13]:
df.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


# Clean up data

In [104]:
def process_str(txt,regexp_lst,stemmer=nltk.stem.porter.PorterStemmer()):
    for regexp in regexp_lst:
        txt=regexp[0].sub(regexp[1],txt)
    if stemmer==None:
        return txt.strip().lower().split()
    else:
        return [stemmer.stem(wrd) for wrd in txt.strip().lower().split()]

In [105]:
regexp_lst=[
            (re.compile('[^a-zA-Z]+'),' '),
            (re.compile('(\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b)'),''),
           ]

In [120]:
df['Review'][0],process_str(df['Review'][0],regexp_lst)

('Wow... Loved this place.', ['wow', 'love', 'place'])

In [129]:
cv=sklearn.feature_extraction.text.CountVectorizer(analyzer=lambda txt: process_str(txt,regexp_lst),
                                                   max_features=1500)
X_sparse=cv.fit_transform(df['Review'])
X=X_sparse.toarray()
Y=df['Liked'].values

# Train/test split

In [138]:
X_train,X_test,Y_train,Y_test=sklearn.cross_validation.train_test_split(X,Y,
                                                                        test_size=0.2,random_state=0)
Y_train.size,Y_test.size

(800, 200)

In [145]:
model_classes={ '1 - GaussianNB':{ 'model':sklearn.naive_bayes.GaussianNB,
                               'params':{} },
                '2 - Decision Tree':{ 'model':sklearn.tree.DecisionTreeClassifier,
                                  'params':{'criterion':'entropy',
                                            'random_state':0} },
                '3 - Random forest 1 [10 trees]':{ 'model':sklearn.ensemble.RandomForestClassifier,
                                               'params':{'criterion':'entropy',
                                                         'n_estimators':10,
                                                         'random_state':0} },
                '4 - Random forest 2 [100 trees]':{ 'model':sklearn.ensemble.RandomForestClassifier,
                                                'params':{'criterion':'entropy',
                                                          'n_estimators':100,
                                                          'random_state':0} },
              }

In [146]:
for class_name in sorted(model_classes):
    print('=== {} ==='.format(class_name))
    model_class=model_classes[class_name]['model']
    params=model_classes[class_name]['params']
    model=model_class(**params).fit(X_train,Y_train)
    print('\nScore:')
    print('Train {:0.2f} - Test {:0.2f}'.format(model.score(X_train,Y_train),model.score(X_test,Y_test)))
    print('\nConfusion matrix:')
    print(sklearn.metrics.confusion_matrix(Y_test,model.predict(X_test)))
    print('\nClassification report:')
    print(sklearn.metrics.classification_report(Y_test,model.predict(X_test)))

=== 1 - GaussianNB ===

Score:
Train 0.93 - Test 0.73

Confusion matrix:
[[55 42]
 [11 92]]

Classification report:
             precision    recall  f1-score   support

          0       0.83      0.57      0.67        97
          1       0.69      0.89      0.78       103

avg / total       0.76      0.73      0.73       200

=== 2 - Decision Tree ===

Score:
Train 1.00 - Test 0.69

Confusion matrix:
[[75 22]
 [40 63]]

Classification report:
             precision    recall  f1-score   support

          0       0.65      0.77      0.71        97
          1       0.74      0.61      0.67       103

avg / total       0.70      0.69      0.69       200

=== 3 - Random forest 1 [10 trees] ===

Score:
Train 0.98 - Test 0.73

Confusion matrix:
[[84 13]
 [41 62]]

Classification report:
             precision    recall  f1-score   support

          0       0.67      0.87      0.76        97
          1       0.83      0.60      0.70       103

avg / total       0.75      0.73      0.73