# Natural Language Processing

## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text.
# The path is a raw string: "\d" and "\R" are not valid escape sequences, and a
# non-raw literal containing them triggers an invalid-escape warning on recent
# Python versions. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory.
dataset = pd.read_csv(r"C:\datasets\Restaurant_Reviews.tsv", delimiter='\t', quoting=3)

## Cleaning the texts

In [4]:
import re
import nltk
# nltk.download('stopwords') #downloads the stopwords
from nltk.corpus import stopwords #imports the downloaded stopwords
from nltk.stem.porter import PorterStemmer #this class helps us remove stem words
corpus = []  # cleaned reviews: one space-joined string of stemmed words per review

# Loop-invariant helpers are built once, not once per review (or per word).
ps = PorterStemmer()
all_stop_words = stopwords.words('english')
# "not" is taken out of the stop-word list so it survives the filtering below
# (presumably to preserve negation such as "not good").
all_stop_words.remove("not")
stop_word_set = set(all_stop_words)  # constant-time membership test, built a single time

# Iterate over every review instead of a hard-coded range(0, 1000): the corpus
# then always has exactly one entry per dataset row, whatever the file size.
for review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', review)  # replace every non-letter character with a space
    review = review.lower()
    review = review.split()
    # Drop stop words, then reduce each remaining word to its stem.
    review = [ps.stem(word) for word in review if word not in stop_word_set]
    review = ' '.join(review)
    corpus.append(review)

## Creating the Bag of Words model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of words: count term occurrences per cleaned review, keeping at most
# 1500 vocabulary entries, then densify the result into a plain array.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# The target is the last column of the dataset.
label_column = dataset.iloc[:, -1]
y = label_column.values

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [7]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes fitted on the training split. The bare fit(...) call is
# kept as the cell's last expression so the notebook displays the estimator.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB()

## Predicting the Test set results

In [8]:
# Predicted labels for the held-out test features.
y_pred = classifier.predict(X=X_test)

## Performance Metrics for Naive Bayes

In [10]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.metrics import confusion_matrix
# Confusion matrix first, then each scalar metric with its label.
cm = confusion_matrix(y_test, y_pred)
print(cm)
for label, metric in (
    ("accuracy_score", accuracy_score),
    ("Precision score:", precision_score),
    ("recall_score:", recall_score),
):
    print(label, metric(y_test, y_pred))

[[55 42]
 [12 91]]
accuracy_score 0.73
Precision score: 0.6842105263157895
recall_score: 0.883495145631068


## Training the RandomForestClassifier model on the Training set

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees using the entropy split criterion, seeded for
# repeatability. fit() returns the fitted estimator, so it is chained here.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predicted labels for the held-out test features.
y_pred = classifier.predict(X_test)

## Performance metrics for RandomForestClassifier

In [12]:
# Confusion matrix first, then each scalar metric with its label.
cm = confusion_matrix(y_test, y_pred)
print(cm)
for label, metric in (
    ("accuracy_score", accuracy_score),
    ("Precision score:", precision_score),
    ("recall score:", recall_score),
):
    print(label, metric(y_test, y_pred))

[[90  7]
 [39 64]]
accuracy_score 0.77
Precision score: 0.9014084507042254
recall score: 0.6213592233009708


## Training the Support Vector Machine model on the Training set

In [13]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier with regularisation parameter C=1.80.
# fit() returns the fitted estimator, so it is chained onto the constructor.
classifier = SVC(
    kernel='linear',
    random_state=0,
    C=1.80,
).fit(X_train, y_train)

# Predicted labels for the held-out test features.
y_pred = classifier.predict(X_test)

## Performance metrics for SVM

In [14]:
# Confusion matrix first, then each scalar metric with its label.
cm = confusion_matrix(y_test, y_pred)
print(cm)
for label, metric in (
    ("accuracy_score", accuracy_score),
    ("Precision score:", precision_score),
    ("recall_score:", recall_score),
):
    print(label, metric(y_test, y_pred))

[[78 19]
 [19 84]]
accuracy_score 0.81
Precision score: 0.8155339805825242
recall_score: 0.8155339805825242
