<a href="https://colab.research.google.com/github/emil-jebastin/EDA-retail-/blob/main/SARR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [154]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Read the tab-separated review file into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# ordinary text instead of being treated as field delimiters.
data = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [155]:
# Remove stop words (quick first pass; `corpus` is rebuilt from scratch by the
# cleaning/stemming loop in a later cell).
# The stop-word corpus has to be on disk before it is read, otherwise a fresh
# kernel raises LookupError here; the download is a no-op when it is present.
nltk.download("stopwords", quiet=True)
stop_words = set(nltk.corpus.stopwords.words("english"))

# NLTK's English stop words are all lowercase, so compare on the lowercased
# token; otherwise capitalized words such as "The" slip through the filter.
corpus = [
    " ".join(word for word in text.split() if word.lower() not in stop_words)
    for text in data["Review"]
]

In [156]:
# nltk and re are already imported in the first cell; repeating them here is
# harmless but redundant.
import nltk
import re
# Fetch the NLTK stop-word lists that stopwords.words('english') reads.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [157]:
# Build the cleaned corpus, one string per review:
#   1. replace every non-letter character with a space,
#   2. lowercase,
#   3. drop English stop words,
#   4. reduce each remaining word to its Porter stem.

# Loop invariants: build the stop-word set and the stemmer once instead of
# re-reading the stop-word list for every single word.
english_stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself so the corpus always has one entry per row,
# whatever the size of the dataset (no hard-coded row count).
for raw_review in data['Review']:
    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=raw_review)
    review = review.lower()
    review_words = [word for word in review.split() if word not in english_stop_words]
    review = ' '.join(ps.stem(word) for word in review_words)
    corpus.append(review)

In [181]:


# Bag-of-words features: CountVectorizer produces raw term counts (not TF-IDF
# weights), with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus)
y = data["Liked"]

# Hold out 20% of the reviews for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=2,
)

# Fit the Naive Bayes classifier on the training portion.
model = MultinomialNB(alpha=0.9)
model.fit(x_train, y_train)

# Score the held-out reviews.
y_pred = model.predict(x_test)
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

# Report each metric as a percentage rounded to two decimals.
report = (
    ("Accuracy score is: {} %", score1),
    ("Precision score is: {} %", score2),
    ("Recall score is: {} %", score3),
)
for template, value in report:
    print(template.format(round(value * 100, 2)))


Accuracy score is: 82.5 %
Precision score is: 77.14 %
Recall score is: 88.04 %


In [182]:

from sklearn.metrics import accuracy_score

# Sweep the smoothing parameter alpha and remember the
# value with the highest accuracy (the first one wins on ties).
# NOTE(review): candidates are scored on the test split, so the reported best
# accuracy is an optimistic estimate.
best_accuracy = 0.0
alpha_val = 0.0

for candidate_alpha in np.arange(0.1, 1.1, 0.1):
    candidate_model = MultinomialNB(alpha=candidate_alpha)
    candidate_model.fit(x_train, y_train)
    candidate_accuracy = accuracy_score(y_test, candidate_model.predict(x_test))
    print(
        "Accuracy score for alpha {} is: {:.2f}%".format(
            round(candidate_alpha, 1), round(candidate_accuracy * 100, 2)
        )
    )

    # Strict comparison: a later alpha must beat, not merely match, the best.
    if candidate_accuracy > best_accuracy:
        best_accuracy, alpha_val = candidate_accuracy, candidate_alpha

print('------------------------')
print(
    'The best accuracy is {:.2f}% with alpha value as {}'.format(
        round(best_accuracy * 100, 2), round(alpha_val, 1)
    )
)

Accuracy score for alpha 0.1 is: 80.00%
Accuracy score for alpha 0.2 is: 81.00%
Accuracy score for alpha 0.3 is: 81.00%
Accuracy score for alpha 0.4 is: 81.50%
Accuracy score for alpha 0.5 is: 80.50%
Accuracy score for alpha 0.6 is: 80.50%
Accuracy score for alpha 0.7 is: 80.50%
Accuracy score for alpha 0.8 is: 81.00%
Accuracy score for alpha 0.9 is: 82.50%
Accuracy score for alpha 1.0 is: 82.00%
------------------------
The best accuracy is 82.50% with alpha value as 0.9


In [183]:
def predict_sentiment(sample_review):
  """Classify one review string as "POSITIVE" or "NEGATIVE".

  The review goes through the same preprocessing as the training corpus
  (non-letters replaced by spaces, lowercased, English stop words removed,
  Porter stemming) so that its tokens line up with the fitted vocabulary.
  It is then vectorized with the fitted `cv` and classified with the
  trained `model` defined in the training cell.

  Args:
    sample_review: Raw review text.

  Returns:
    "POSITIVE" if the model predicts label 1, otherwise "NEGATIVE".
  """
  # Same cleaning steps as the corpus-building loop.
  cleaned_review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_review).lower()

  # Build the stop-word set once per call rather than once per word.
  english_stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  sample_review_words = [
      stemmer.stem(word)
      for word in cleaned_review.split()
      if word not in english_stop_words
  ]

  # Vectorize the *preprocessed* text with the bag-of-words model fitted on
  # the training corpus.
  sample_review_vector = cv.transform([' '.join(sample_review_words)]).toarray()

  # predict() returns an array with one label for the single input row.
  prediction = model.predict(sample_review_vector)

  if prediction[0] == 1:
    return "POSITIVE"
  return "NEGATIVE"


# Demo input: one hand-written review to run through the classifier.
sample_review = 'The food was absolutely wonderful, from preparation to presentation , very pleasing'

# Classify it, then report the label in a sentence.
predicted_sentiment = predict_sentiment(sample_review)
print("This is a {} review.".format(predicted_sentiment))

This is a POSITIVE review.
