In [1]:
#import dependencies


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
#from sklearn.grid_search import GridSearchCV
%matplotlib inline

In [2]:
#read in the dataset
data = pd.read_csv('../Output_folder/3_Table.csv')

In [3]:
#dataset descriptions

# dataset shape
print("Shape of the dataset:")
print(data.shape)
# column names
print("Column names:")
print(data.columns)
# dtypes for columns
print("Datatype of each column:")
print(data.dtypes)

Shape of the dataset:
(2346, 22)
Column names:
Index(['review_id', 'user_id', 'business_id', 'stars_x', 'useful', 'funny',
       'cool', 'text', 'date', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars_y', 'review_count',
       'is_open', 'attributes', 'categories', 'hours'],
      dtype='object')
Datatype of each column:
review_id        object
user_id          object
business_id      object
stars_x           int64
useful            int64
funny             int64
cool              int64
text             object
date             object
name             object
address          object
city             object
state            object
postal_code       int64
latitude        float64
longitude       float64
stars_y         float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object


In [4]:
# summarization of dataset
data.describe(include='all')

Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,...,state,postal_code,latitude,longitude,stars_y,review_count,is_open,attributes,categories,hours
count,2346,2346,2346,2346.0,2346.0,2346.0,2346.0,2346,2346,2346,...,2346,2346.0,2346.0,2346.0,2346.0,2346.0,2346.0,2333,2346,2237
unique,2346,2224,145,,,,,2344,2346,144,...,1,,,,,,,122,143,111
top,pUycOfUwM8vqX7KjRRhUEA,EzHVkq_aplBXpLre5lqJ8g,SZU9c8V2GuREDN5KgyHFJw,,,,,We were back last night for the second time: a...,2016-07-25 07:31:06,Santa Barbara Shellfish Company,...,CA,,,,,,,"{'OutdoorSeating': 'True', 'RestaurantsAttire'...","Live/Raw Food, Restaurants, Seafood, Beer Bar,...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
freq,1,4,351,,,,,2,1,351,...,2346,,,,,,,351,351,351
mean,,,,4.02046,0.763427,0.246803,0.338022,,,,...,,93101.0,34.41633,-119.695237,3.91965,603.054135,0.815431,,,
std,,,,1.280828,1.876996,1.012523,1.364951,,,,...,,0.0,0.005383,0.007612,0.659716,785.184364,0.388031,,,
min,,,,1.0,0.0,0.0,0.0,,,,...,,93101.0,34.403986,-119.779586,1.0,5.0,0.0,,,
25%,,,,3.0,0.0,0.0,0.0,,,,...,,93101.0,34.410521,-119.6999,4.0,113.0,1.0,,,
50%,,,,5.0,0.0,0.0,0.0,,,,...,,93101.0,34.41654,-119.694732,4.0,290.0,1.0,,,
75%,,,,5.0,1.0,0.0,0.0,,,,...,,93101.0,34.419679,-119.690492,4.5,659.0,1.0,,,


In [5]:
data.isna().sum()

review_id         0
user_id           0
business_id       0
stars_x           0
useful            0
funny             0
cool              0
text              0
date              0
name              0
address          17
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars_y           0
review_count      0
is_open           0
attributes       13
categories        0
hours           109
dtype: int64

In [6]:
df = data.dropna()


In [7]:
df.isna().sum()

review_id       0
user_id         0
business_id     0
stars_x         0
useful          0
funny           0
cool            0
text            0
date            0
name            0
address         0
city            0
state           0
postal_code     0
latitude        0
longitude       0
stars_y         0
review_count    0
is_open         0
attributes      0
categories      0
hours           0
dtype: int64

In [8]:
#classify the dataset and split

data_classes = data[(data['stars_x']==1) | (data['stars_x']==2) | (data['stars_x']==3) |(data['stars_x']==4) | (data['stars_x']==5)]
data_classes.head()
print(data_classes.shape)

# Seperate the dataset into X and Y for prediction
x = data_classes['text']
y = data_classes['stars_x']
print(x.head())
print(y.head())

(2346, 22)
0    Had a party of 6 here for hibachi. Our waitres...
1    What a great addition to the Funk Zone!  Grab ...
2    Farmhouse, rustic, chic.Helpful staff with gre...
3    We were a bit weary about trying the Shellfish...
4    The kayaking tour at the Santa Cruz Island was...
Name: text, dtype: object
0    3
1    5
2    5
3    5
4    5
Name: stars_x, dtype: int64


In [9]:
#cleaning which includes stopword and puncuation removal
def text_process(text):
    stop_words = stopwords.words('english')
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [10]:
#single reveiew vectorization to test for usability

vocab = CountVectorizer(analyzer=text_process).fit(x)
print(len(vocab.vocabulary_))
r0 = x[0]
print(r0)
vocab0 = vocab.transform([r0])
print(vocab0)

print("Getting the words back:")
print(vocab.get_feature_names()[19648])
print(vocab.get_feature_names()[10643])

13481
Had a party of 6 here for hibachi. Our waitress brought our separate sushi orders on one plate so we couldn't really tell who's was who's and forgot several items on an order. I understand making mistakes but the restaraunt was really quiet so we were kind of surprised. Usually hibachi is a fun lively experience and our  cook  said maybe three words, but he cooked very well his name was Francisco. Service was fishy, food was pretty good, and im hoping it was just an off night here. But for the money I wouldn't go back.
  (0, 251)	1
  (0, 1466)	1
  (0, 2921)	1
  (0, 3391)	1
  (0, 4239)	1
  (0, 4731)	1
  (0, 5538)	1
  (0, 5539)	1
  (0, 5602)	1
  (0, 6691)	1
  (0, 6933)	1
  (0, 7032)	1
  (0, 7057)	1
  (0, 7166)	1
  (0, 7305)	1
  (0, 7327)	1
  (0, 7677)	2
  (0, 7770)	1
  (0, 7892)	1
  (0, 8154)	1
  (0, 8288)	1
  (0, 8567)	1
  (0, 8721)	1
  (0, 8815)	1
  (0, 8979)	1
  (0, 9021)	1
  (0, 9138)	1
  (0, 9227)	1
  (0, 9409)	1
  (0, 9466)	1
  (0, 9471)	1
  (0, 9665)	1
  (0, 9936)	1
  (0, 10



IndexError: list index out of range

In [11]:
x = vocab.transform(x)
#Shape of the matrix:
print("Shape of the sparse matrix: ", x.shape)
#Non-zero occurences:
print("Non-Zero occurences: ",x.nnz)

# DENSITY OF THE MATRIX
density = (x.nnz/(x.shape[0]*x.shape[1]))*100
print("Density of the matrix = ",density)

Shape of the sparse matrix:  (2346, 13481)
Non-Zero occurences:  98869
Density of the matrix =  0.31261515291041736


In [12]:
#splitting the dataset, test and train
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5, random_state=42)

In [13]:
#we will now test several models

#the first is random forest classifier
from sklearn.ensemble import RandomForestClassifier
rmfr = RandomForestClassifier()
rmfr.fit(x_train,y_train)
predrmfr = rmfr.predict(x_test)
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test,predrmfr))
print("Score:",round(accuracy_score(y_test,predrmfr)*100,2))
print("Classification Report:\n",classification_report(y_test,predrmfr))

Confusion Matrix for Random Forest Classifier:
[[ 22   0   2  12  68]
 [  3   0   2  18  61]
 [  0   0   4  36  71]
 [  0   0   2  48 219]
 [  0   0   0  42 563]]
Score: 54.31
Classification Report:
               precision    recall  f1-score   support

           1       0.88      0.21      0.34       104
           2       0.00      0.00      0.00        84
           3       0.40      0.04      0.07       111
           4       0.31      0.18      0.23       269
           5       0.57      0.93      0.71       605

    accuracy                           0.54      1173
   macro avg       0.43      0.27      0.27      1173
weighted avg       0.48      0.54      0.45      1173



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
#decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
preddt = dt.predict(x_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:\n",classification_report(y_test,preddt))

Confusion Matrix for Decision Tree:
[[ 26   3  17  20  38]
 [  9   5  18  20  32]
 [  8   3  29  38  33]
 [  6   7  40  82 134]
 [ 19  11  49 103 423]]
Score: 48.17
Classification Report:
               precision    recall  f1-score   support

           1       0.38      0.25      0.30       104
           2       0.17      0.06      0.09        84
           3       0.19      0.26      0.22       111
           4       0.31      0.30      0.31       269
           5       0.64      0.70      0.67       605

    accuracy                           0.48      1173
   macro avg       0.34      0.31      0.32      1173
weighted avg       0.47      0.48      0.47      1173



In [16]:
#support vector machines
from sklearn.svm import SVC
svm = SVC(random_state=101)
svm.fit(x_train,y_train)
predsvm = svm.predict(x_test)
print("Confusion Matrix for Support Vector Machines:")
print(confusion_matrix(y_test,predsvm))
print("Score:",round(accuracy_score(y_test,predsvm)*100,2))
print("Classification Report:\n",classification_report(y_test,predsvm))

Confusion Matrix for Support Vector Machines:
[[  0   0   0   0 104]
 [  0   0   0   5  79]
 [  0   0   0  13  98]
 [  0   0   0  20 249]
 [  0   0   0  11 594]]
Score: 52.34
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       104
           2       0.00      0.00      0.00        84
           3       0.00      0.00      0.00       111
           4       0.41      0.07      0.13       269
           5       0.53      0.98      0.69       605

    accuracy                           0.52      1173
   macro avg       0.19      0.21      0.16      1173
weighted avg       0.37      0.52      0.38      1173



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
