In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# Load the tab-separated reviews file, then show its (rows, columns) shape.
df = pd.read_csv("Restaurant_Reviews.tsv", sep="\t")
df.shape

(1000, 2)

### Cleaning the text

In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Will hold one cleaned, stemmed string per review (filled by the cleaning cell below).
corpus = []

In [4]:
# Clean every review: keep letters only, lowercase, drop English stopwords, stem.
#
# The stopword set and the stemmer do not depend on the review being processed,
# so they are built once here rather than once per word / once per review.
# NOTE(review): the NLTK English stopword list is believed to include negations
# such as "not"; dropping them can flip the sentiment of a review - verify and
# consider keeping negations.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Start from an empty list so re-running this cell does not append duplicates.
corpus = []
# Iterate over the column itself instead of a hard-coded range(0, 1000), so the
# cell keeps working if the dataset has a different number of rows.
for raw_review in df['Review']:
    # Replace every non-letter with a space, then lowercase and tokenise on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Stem the tokens that are not stopwords and glue them back into one string.
    rev = ' '.join(ps.stem(word) for word in review if word not in stop_words)
    corpus.append(rev)

In [5]:
# Number of cleaned reviews collected so far.
len(corpus)

1000

In [6]:
# Build TF-IDF features from the cleaned reviews and extract the labels.

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
tfidf_sparse = cv.fit_transform(corpus)   # learn the vocabulary and weight every review
X = tfidf_sparse.toarray()                # dense array, one row per review
y = df.iloc[:, 1].values                  # second column of df, used as the target

In [7]:
# Display the dense TF-IDF feature matrix.
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1000, 1565))

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed fixes which rows go where.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


from sklearn.tree import DecisionTreeClassifier
# Seed the tree too: without random_state the fitted tree (and therefore the
# confusion matrix and accuracy reported below) can differ from run to run.
dtclf = DecisionTreeClassifier(random_state = 42)
dtclf.fit(X_train, y_train)

In [9]:
# Predict labels for the held-out rows with the fitted decision tree.
y_pred = dtclf.predict(X_test)

# Tabulate the true labels against the predicted ones.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[74 22]
 [35 69]]


In [10]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
print(ac)

0.715


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Candidate classifiers to compare on the same features and the same split.
# The tree-based models are seeded so repeated runs give the same scores.
models = [LogisticRegression(),
          KNeighborsClassifier(),
          RandomForestClassifier(random_state = 0),
          DecisionTreeClassifier(random_state = 0),
          SVC(),
          GaussianNB()]

# NOTE(review): the vectorizer is fit on every review before the split, so the
# vocabulary and IDF weights have seen the test rows. Fitting on the training
# reviews only would remove that leak.
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:,1].values

# Note: this replaces the X_train/X_test/y_train/y_test produced by any earlier split.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

# Maps model class name -> scores for that model.
result = {}
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Use the class name directly; slicing "()" off str(model) breaks as soon as
    # an estimator is created with non-default arguments.
    name = type(model).__name__
    ac_score = accuracy_score(y_test, y_pred)
    # "bias" holds the accuracy on the training rows and "variance" the accuracy
    # on the test rows; the key names are kept as-is for compatibility.
    # For a classifier, score(X_test, y_test) is the same accuracy as ac_score,
    # so it is reused instead of predicting on the test rows a second time.
    result[name] = {"Accuracy" : ac_score,
                    "bias" : model.score(X_train, y_train),
                    "variance" : ac_score}

result

{'LogisticRegression': {'Accuracy': 0.755, 'bias': 0.94625, 'variance': 0.755},
 'KNeighborsClassifier': {'Accuracy': 0.68, 'bias': 0.83875, 'variance': 0.68},
 'RandomForestClassifier': {'Accuracy': 0.72,
  'bias': 0.99625,
  'variance': 0.72},
 'DecisionTreeClassifier': {'Accuracy': 0.685,
  'bias': 0.99625,
  'variance': 0.685},
 'SVC': {'Accuracy': 0.755, 'bias': 0.9925, 'variance': 0.755},
 'GaussianNB': {'Accuracy': 0.72, 'bias': 0.93625, 'variance': 0.72}}

In [12]:
# Scratch check: show the short display name of an estimator.
# The class name is read directly rather than cutting the last two characters
# off the repr, which only works when the repr ends in "()".
model = LogisticRegression()
st = str(model)  # repr string, kept for any later cell that reads `st`
type(model).__name__

'LogisticRegression'

In [13]:
# Display the loaded reviews DataFrame.
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
