# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
dataset= pd.read_csv('Restaurant_Reviews.tsv', delimiter= "\t", quoting= 3)

## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus=[]
for i in range(0,1000):
  review= re.sub('[^a-zA-z]', ' ', dataset['Review'][i])
  review= review.lower()
  review= review.split()
  ps= PorterStemmer()
  all_stopwords= stopwords.words('english')
  all_stopwords.remove('not')
  review= [ps.stem(word) for word in review if word not in all_stopwords]
  review= ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=1500)
X= cv.fit_transform(corpus).toarray()
y= dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0)

## Training the Naive Bayes model on the Training set

In [6]:
from sklearn.naive_bayes import GaussianNB
sc= GaussianNB()
sc.fit(X_train, y_train)

GaussianNB()

In [7]:
from sklearn.svm import SVC
svc= SVC(kernel='linear', random_state=0)
svc.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [8]:
from sklearn.tree import DecisionTreeClassifier
dc= DecisionTreeClassifier(criterion='entropy', random_state=0)
dc.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [10]:
from sklearn.linear_model import LogisticRegression
lr= LogisticRegression(random_state=0)
lr.fit(X_train,y_train)

LogisticRegression(random_state=0)

In [11]:
from sklearn.svm import SVC
sv= SVC(kernel='rbf', random_state=0)
sv.fit(X_train, y_train)

SVC(random_state=0)

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

## Predicting the Test set results

In [13]:
y_pred= sc.predict(X_test)

In [14]:
y_pred2= svc.predict(X_test)

In [15]:
y_pred3= dc.predict(X_test)

In [16]:
y_pred4= knn.predict(X_test)

In [17]:
y_pred5= lr.predict(X_test)

In [18]:
y_pred6= sv.predict(X_test)

In [19]:
y_pred7= sv.predict(X_test)

## Making the Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm= confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[55 42]
 [12 91]]


0.73

In [21]:
cm= confusion_matrix(y_test, y_pred2)
print(cm)
accuracy_score(y_test, y_pred2)

[[79 18]
 [24 79]]


0.79

In [22]:
cm= confusion_matrix(y_test, y_pred3)
print(cm)
accuracy_score(y_test, y_pred3)

[[78 19]
 [31 72]]


0.75

In [23]:
cm= confusion_matrix(y_test, y_pred4)
print(cm)
accuracy_score(y_test, y_pred4)

[[68 29]
 [42 61]]


0.645

In [24]:
cm= confusion_matrix(y_test, y_pred5)
print(cm)
accuracy_score(y_test, y_pred5)

[[80 17]
 [28 75]]


0.775

In [25]:
cm= confusion_matrix(y_test, y_pred6)
print(cm)
accuracy_score(y_test, y_pred6)

[[89  8]
 [36 67]]


0.78

In [26]:
cm= confusion_matrix(y_test, y_pred7)
print(cm)
accuracy_score(y_test, y_pred7)

[[89  8]
 [36 67]]


0.78