In [37]:
import numpy as np
import csv
import re
from collections import defaultdict

In [38]:
class NaiveBayes:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    Reviews are treated as bags of whitespace-separated tokens.

    Attributes:
        class_priors: maps each class label to its share of the training reviews.
        word_likelihoods: maps class label -> word -> smoothed P(word | class).
        vocabulary_size: number of distinct words seen during training.
        total_words_per_class: maps class label -> total token count for that class.
        stop_words: tokens removed by ``preprocess_text``.
    """

    def __init__(self):
        self.class_priors = {}
        self.word_likelihoods = defaultdict(lambda: defaultdict(float))
        self.vocabulary_size = 0
        self.total_words_per_class = defaultdict(int)
        self.stop_words = set([
            'the','is','i','as','product'
        ])

    def preprocess_text(self, text):
        """Normalize a raw review for tokenization.

        Lowercases the text, deletes every character that is not ``a-z``,
        ``0-9`` or whitespace, drops stop words, and returns the remaining
        tokens joined by single spaces.
        """
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation and special characters (deleted, not replaced by
        # a space, so "didn't" becomes "didnt")
        text = re.sub(r'[^a-z0-9\s]', '', text)
        # Remove stop words
        text = ' '.join(word for word in text.split() if word not in self.stop_words)
        return text.strip()

    def fit(self, X, y):
        """Estimate class priors and smoothed word likelihoods.

        Args:
            X: sequence of (already preprocessed) review strings.
            y: sequence of class labels, aligned with ``X``.

        Any statistics from a previous ``fit`` call are discarded first, so
        calling ``fit`` again (e.g. re-running a notebook cell) retrains from
        scratch instead of accumulating counts.
        """
        # Reset learned state so repeated calls do not double-count words or
        # keep likelihoods for classes that are no longer present.
        self.class_priors = {}
        self.word_likelihoods = defaultdict(lambda: defaultdict(float))
        self.vocabulary_size = 0
        self.total_words_per_class = defaultdict(int)

        # Calculate prior probabilities
        total_reviews = len(y)
        class_counts = defaultdict(int)
        for label in y:
            class_counts[label] += 1

        self.class_priors = {cls: count / total_reviews for cls, count in class_counts.items()}

        # Count word frequencies for each class
        word_counts = defaultdict(lambda: defaultdict(int))
        for review, label in zip(X, y):
            for word in review.split():
                word_counts[label][word] += 1
                self.total_words_per_class[label] += 1

        # Calculate the vocabulary size (distinct words across all classes)
        all_words = set(word for counts in word_counts.values() for word in counts.keys())
        self.vocabulary_size = len(all_words)

        # Calculate likelihoods with Laplace smoothing: every vocabulary word
        # gets a count of at least 1 in every class that has any words.
        for cls, counts in word_counts.items():
            denominator = self.total_words_per_class[cls] + self.vocabulary_size
            for word in all_words:
                self.word_likelihoods[cls][word] = (counts.get(word, 0) + 1) / denominator

    def predict(self, review):
        """Return the most probable class label for a preprocessed review.

        Args:
            review: review string, expected to be preprocessed the same way
                as the training data.

        Returns:
            The label whose log prior plus summed log word likelihoods is
            largest.

        Raises:
            ValueError: if the classifier has not been fitted on any data.
        """
        if not self.class_priors:
            raise ValueError("NaiveBayes.predict() called before fit(): no classes are known")

        words = review.split()
        log_probs = {}
        for cls, prior in self.class_priors.items():
            # .get() avoids inserting empty entries into the defaultdicts
            # merely by predicting.
            likelihoods = self.word_likelihoods.get(cls, {})
            denominator = self.total_words_per_class.get(cls, 0) + self.vocabulary_size
            # Smoothed log-probability of a word never seen in this class.
            # With an empty training vocabulary the denominator is 0; every
            # word is then equally unknown to every class, so it contributes
            # nothing and the priors alone decide.
            unseen_log_prob = np.log(1 / denominator) if denominator else 0.0

            score = np.log(prior)
            for word in words:
                if word in likelihoods:
                    score += np.log(likelihoods[word])
                else:
                    score += unseen_log_prob
            log_probs[cls] = score
        return max(log_probs, key=log_probs.get)


In [39]:
# Function to read data from CSV
def read_csv(file_path):
    """Load reviews and ratings from a CSV file.

    The file must have a header row containing at least the columns
    ``review`` and ``rating``.

    Args:
        file_path: path of the UTF-8 encoded CSV file.

    Returns:
        A ``(reviews, ratings)`` tuple of two parallel lists: the review
        strings and the ratings converted to ``int``.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a rating cannot be converted to ``int``.
    """
    reviews = []
    ratings = []

    # newline='' is required by the csv module so that it performs its own
    # newline handling; without it, newlines embedded in quoted fields are
    # not interpreted correctly.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            reviews.append(row['review'])
            ratings.append(int(row['rating']))

    return reviews, ratings

# Read data from the CSV file: X holds the raw review strings and y the
# integer ratings, in file order. The relative path is resolved against the
# current working directory.
X, y = read_csv('zomato_reviews.csv')

In [40]:
# Initialize the Naive Bayes classifier. It is untrained until fit() is
# called, but preprocess_text() is usable right away because it only needs
# the stop-word set created in __init__.
nb_classifier = NaiveBayes()

In [41]:
# Clean the reviews (lowercase, strip punctuation, drop stop words)
X_cleaned = list(map(nb_classifier.preprocess_text, X))

In [42]:
# Convert ratings to classes
def classify_rating(rating):
    """Map a 1-5 star rating to a sentiment class.

    Args:
        rating: star rating; must be one of 1, 2, 3, 4, 5.

    Returns:
        'Bad' for 1-2, 'Okay' for 3, 'Good' for 4-5.

    Raises:
        ValueError: if the rating is not one of 1-5. Such values used to
            fall through to 'Good', silently polluting that class.
    """
    if rating in (1, 2):
        return 'Bad'
    if rating == 3:
        return 'Okay'
    if rating in (4, 5):
        return 'Good'
    raise ValueError(f"rating must be one of 1, 2, 3, 4, 5; got {rating!r}")

In [43]:
# Map every numeric rating to its sentiment class label
y_classes = list(map(classify_rating, y))

In [44]:
# Manually split the dataset into training (70%), validation (10%), and test (20%)
def split_dataset(X, y, train_size=0.7, val_size=0.1, shuffle=False, seed=None):
    """Split parallel sequences into training, validation and test parts.

    Args:
        X: sequence of samples.
        y: sequence of labels, same length as ``X``.
        train_size: fraction of the data used for training.
        val_size: fraction of the data used for validation; whatever remains
            after training and validation becomes the test set.
        shuffle: if True, samples are randomly permuted (X and y together)
            before splitting. Defaults to False, which keeps the original
            positional split.
        seed: seed for the shuffle, for reproducible splits. Ignored when
            ``shuffle`` is False.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length, or if the fractions
            are negative or sum to more than 1.
    """
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")
    # Small tolerance so that fractions such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_size < 0 or val_size < 0 or train_size + val_size > 1 + 1e-9:
        raise ValueError(
            "train_size and val_size must be non-negative and sum to at most 1, "
            f"got train_size={train_size!r}, val_size={val_size!r}"
        )

    total_reviews = len(y)

    if shuffle:
        import random

        # Permute indices once so X and y stay aligned.
        order = list(range(total_reviews))
        random.Random(seed).shuffle(order)
        X = [X[i] for i in order]
        y = [y[i] for i in order]

    train_end = int(train_size * total_reviews)
    val_end = train_end + int(val_size * total_reviews)

    X_train = X[:train_end]
    y_train = y[:train_end]
    X_val = X[train_end:val_end]
    y_val = y[train_end:val_end]
    X_test = X[val_end:]
    y_test = y[val_end:]

    return X_train, y_train, X_val, y_val, X_test, y_test

# Split the cleaned reviews and their class labels using the default
# fractions: 70% training, 10% validation, and the remaining 20% test.
X_train, y_train, X_val, y_val, X_test, y_test = split_dataset(X_cleaned, y_classes)

In [45]:
# Train the classifier on the training split only; the validation and test
# splits are held out for evaluation.
nb_classifier.fit(X_train, y_train)

In [46]:
# Accuracy on the validation split: fraction of predictions equal to the label
val_predictions = list(map(nb_classifier.predict, X_val))
val_matches = np.asarray(val_predictions) == np.asarray(y_val)
val_accuracy = val_matches.mean()

print(f'Validation Accuracy: {val_accuracy:.2f}')

# Accuracy on the held-out test split, computed the same way
test_predictions = list(map(nb_classifier.predict, X_test))
test_matches = np.asarray(test_predictions) == np.asarray(y_test)
test_accuracy = test_matches.mean()

print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.51
Test Accuracy: 0.47


In [47]:
# Interactive demo: classify reviews typed by the user until "stop" is entered
while True:
    print("Enter stop to stop giving review : ")
    example_review = input("Enter a review: ")
    if example_review == "stop":
        break

    # Apply the same preprocessing that was used on the training data
    example_review_cleaned = nb_classifier.preprocess_text(example_review)
    predicted_class = nb_classifier.predict(example_review_cleaned)
    print(f'Predicted class for the review "{example_review}": {predicted_class}')
    print()

Enter stop to stop giving review : 
Enter a review: i loved the product
Predicted class for the review "i loved the product": Good

Enter stop to stop giving review : 
Enter a review: i didn't liked the product
Predicted class for the review "i didn't liked the product": Bad

Enter stop to stop giving review : 
Enter a review: stop
