In [22]:
#Importing Libraries and Loading Data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are treated as ordinary text, not field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [23]:
#Preprocessing the Reviews
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# The stemmer and the stop-word list do not depend on the individual review,
# so they are built once instead of once per row.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')

# Keep negation/contrast words in the text: they flip or qualify sentiment.
# NOTE(review): the regex below turns apostrophes into spaces, so the token
# "won't" can never reach the stop-word filter; removing it from the list is
# kept for fidelity with the original but has no effect on the output.
for kept_word in ('not', 'no', 'but', "won't"):
    if kept_word in all_stopwords:  # tolerate stop-word lists lacking an entry
        all_stopwords.remove(kept_word)

# Set membership is O(1); the original rebuilt this set for every single word.
stopword_set = set(all_stopwords)

corpus = []

# Iterate over the column itself so the corpus always has exactly one entry
# per dataset row (no hard-coded row count, no label-based index lookups).
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # keep ASCII letters only
    review = review.lower().split()                # lowercase, split on whitespace
    review = [ps.stem(word) for word in review if word not in stopword_set]
    corpus.append(' '.join(review))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# X starts out as the raw review text (it is reassigned to the bag-of-words
# matrix by the vectorizing cell); y holds the 'Liked' labels cast to int.
X, y = dataset['Review'], dataset['Liked'].astype(int)

In [25]:
#Vectorizing the Reviews
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with its vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary from the cleaned corpus and count terms per review,
# then densify the sparse result into a plain array.
# NOTE(review): the vocabulary is learned from every review before the
# train/test split happens, so held-out reviews influence feature selection.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

In [26]:
# Class-balance check: number of reviews for each value of 'Liked'.
liked_counts = dataset['Liked'].value_counts()
print(liked_counts)

Liked
1    500
0    500
Name: count, dtype: int64


In [27]:
#Splitting the Data and training the model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Candidate classifiers as (display name, estimator) pairs.
# The random forest is seeded: it is the only stochastic estimator here, and
# without a seed its reported accuracy (and any model ranking that includes
# it) changes from run to run.
models = [
    ('Logistic Regression', LogisticRegression(C=1.)),
    ('Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC(C=1., kernel='rbf')),
    ('Random Forest', RandomForestClassifier(random_state=0))
]

# Fit each candidate on the training split and report hold-out accuracy.
for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} accuracy: {accuracy}')

Logistic Regression accuracy: 0.78
Naive Bayes accuracy: 0.8
Support Vector Machine accuracy: 0.775
Random Forest accuracy: 0.78


In [32]:
#Cross-Validation and Choosing the Best Model
# Start from the first candidate's name with a zero baseline. A candidate
# replaces the running best only when its mean 10-fold accuracy on the
# training split is strictly higher, so ties keep the earlier entry.
best_model, best_accuracy = models[0][0], 0

for candidate_name, candidate in models:
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=10)
    candidate_mean = np.mean(fold_scores)
    if not candidate_mean > best_accuracy:
        continue
    best_model, best_accuracy = candidate_name, candidate_mean

print(f'Best model: {best_model}')
print(f'Accuracy with k-fold cross-validation: {best_accuracy}')

Best model: Logistic Regression
Accuracy with k-fold cross-validation: 0.80375


In [33]:
# Final model: logistic regression, hard-coded here to match the model the
# cross-validation cell reported as best. It uses the same C=1.0 setting as the
# 'Logistic Regression' entry in `models`.
classifier = LogisticRegression(C=1.0) # C is the inverse regularization strength
# Fit on the training split only; X_test / y_test are not used here.
classifier.fit(X_train, y_train)

In [35]:
#Saving and Loading the Trained Model
import joblib

# Artifact file names, written and read back within this cell.
model_path = 'logistic_regression_NLPreviews.joblib'
vectorizer_path = 'vectorizer_reviews.joblib'

# Persist the fitted classifier together with the fitted vectorizer.
joblib.dump(classifier, model_path)
joblib.dump(cv, vectorizer_path)

# Reload both artifacts from disk for use in the prediction cells.
loaded_model = joblib.load(model_path)
loaded_vectorizer = joblib.load(vectorizer_path)

In [36]:
#Making Predictions on New Data
# The vectorizer's vocabulary was learned from cleaned, stop-word-filtered,
# stemmed text, so a new review must go through the same steps before being
# vectorized; otherwise inflected words silently fail to match any feature.
# Relies on `re`, `ps` and `all_stopwords` from the preprocessing cell.
raw_text = 'The food could have been better.'
tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
cleaned_text = ' '.join(ps.stem(word) for word in tokens if word not in all_stopwords)

new_review = loaded_vectorizer.transform([cleaned_text])
predictions = loaded_model.predict(new_review)

# predict() returns one label per input row; read the single label explicitly
# rather than relying on the truth value of a whole array comparison.
if predictions[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')

negative sentiment


In [37]:
#Making Predictions on New Data
# The vectorizer's vocabulary was learned from cleaned, stop-word-filtered,
# stemmed text, so a new review must go through the same steps before being
# vectorized; otherwise inflected words silently fail to match any feature.
# Relies on `re`, `ps` and `all_stopwords` from the preprocessing cell.
raw_text = 'I was disgusted because I was pretty sure that was human hair.'
tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
cleaned_text = ' '.join(ps.stem(word) for word in tokens if word not in all_stopwords)

new_review = loaded_vectorizer.transform([cleaned_text])
predictions = loaded_model.predict(new_review)

# predict() returns one label per input row; read the single label explicitly
# rather than relying on the truth value of a whole array comparison.
if predictions[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')

negative sentiment


In [38]:
#Making Predictions on New Data
# The vectorizer's vocabulary was learned from cleaned, stop-word-filtered,
# stemmed text, so a new review must go through the same steps before being
# vectorized; otherwise inflected words silently fail to match any feature.
# Relies on `re`, `ps` and `all_stopwords` from the preprocessing cell.
raw_text = 'I could care less... The interior is just beautiful.'
tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
cleaned_text = ' '.join(ps.stem(word) for word in tokens if word not in all_stopwords)

new_review = loaded_vectorizer.transform([cleaned_text])
predictions = loaded_model.predict(new_review)

# predict() returns one label per input row; read the single label explicitly
# rather than relying on the truth value of a whole array comparison.
if predictions[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')

positive sentiment
