In [16]:
# DO NOT CHANGE this cell
import zipfile
from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

data_path = "./data"
file_path = "./data/imdb.csv"
zip_path = './data/imdb.zip'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(data_path)
print('Unzipping finished')

# Load the IMDb dataset
data = pd.read_csv(file_path)
data["sentiment"] = data["sentiment"].apply(lambda review: 0 if review == 'negative' else 1)

# Get the number of rows in the data frame
num_rows = data.shape[0]

# Calculate the number of rows to be used for training and testing
num_train = int(num_rows * 0.7)
num_test = num_rows - num_train

# Generate a random permutation of the row indices
np.random.seed(1) # for auto grading
permuted_indices = np.random.permutation(num_rows)

# Split the permuted indices into training and testing indices
train_indices = permuted_indices[:num_train]
test_indices = permuted_indices[num_train:]

# Use the indices to split the data into training and testing sets
train_data = data.iloc[train_indices]
test_data = data.iloc[test_indices]

# Define the true labels and the predicted labels
y_train = train_data["sentiment"].to_list()
y_true = test_data["sentiment"].to_list()
x_train = train_data["review"].to_list()
x_test = test_data["review"].to_list()


Unzipping finished


In [39]:
import math
import string
import re
class NaiveBayesClassifier:
    """
    TODO: Implement the Naive Bayes classifier with add-1 smoothing
    (Laplace smoothing)

    Hints:
    - use CountVectorizer to extract words from each review and to store counts
    - use numpy arrays for fast calculation
        (for example np.sum, np.log, np.argmax, np.exp)
    - use log scale to avoid very small numbers while calculating
        joined probability of sentence
    """

    def __init__(self, alpha=1):
        self.alpha = alpha
        # for now simple count vectorizer is enough
        self.vc = CountVectorizer()
        # TODO init other necessary objects

    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)
        
    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts
    
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)
    
    def fit(self, X: List[str], Y: List[int]) -> None:
        """
        TODO: Implement a function that takes in a list of labeled texts
        and trains the classifier.
        """
        self.num_messages = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()
        n = len(X)
        self.num_messages['positive'] = sum(1 for label in Y if label == 1)
        self.num_messages['negative'] = sum(1 for label in Y if label == 0)
        self.log_class_priors['positive'] = math.log(self.num_messages['positive'] / n)
        self.log_class_priors['negative'] = math.log(self.num_messages['negative'] / n)
        self.word_counts['positive'] = {}
        self.word_counts['negative'] = {}
        
        for x, y in zip(X, Y):
            c = 'positive' if y == 1 else 'negative'
            counts = self.get_word_counts(self.tokenize(x))
            for word, count in counts.items():
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0
                self.word_counts[c][word] += count

    def predict(self, X: List[str]) -> List[int]:
        """
        TODO: Implement a function that takes a list of texts and predicts
        their labels using the learned classifier.

        return list of text labels in int format (for example 0 for normal message, 1 for spam)
        """
        # CODE START
        result = []
        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0
            for word, _ in counts.items():
                if word not in self.vocab: continue

                # add Laplace smoothing
                log_w_given_spam = math.log( (self.word_counts['positive'].get(word, 0.0) + 1) / (self.num_messages['positive'] + len(self.vocab)) )
                log_w_given_ham = math.log( (self.word_counts['negative'].get(word, 0.0) + 1) / (self.num_messages['negative'] + len(self.vocab)) )
                spam_score += log_w_given_spam
                ham_score += log_w_given_ham
            spam_score += self.log_class_priors['positive']
            ham_score += self.log_class_priors['negative']
            if spam_score > ham_score:
                result.append(1)
            else:
                result.append(0)
        return result

#         return [0] * len(X)
        # CODE END

In [40]:
"""
DO NOT CHANGE this cell
"""
nb = NaiveBayesClassifier()
nb.fit(x_train, y_train)
y_test = nb.predict(x_test)

In [47]:
"""
TODO: Calculate precision, recall, and F1 in the below cell

Note:
- To make autograder work don't change names of variables defined below.
- Don't use libraries to compute them.
"""
tp = 0
fp = 0
fn = 0
tn = 0

# iterate over each prediction in y_pred
for i in range(len(y_test)):
    if y_true[i] == 1:
        # positive example
        if y_test[i] == 1:
            # true positive
            tp += 1
        else:
            # false negative
            fn += 1
    else:
        # negative example
        if y_test[i] == 1:
            # false positive
            fp += 1
        else:
            # true negative
            tn += 1

       
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Precision: 0.8504918032786886
Recall: 0.8629407850964738
F1: 0.8566710700132101


In [20]:
# DO NOT CHANGE this cell

import os

# Clean up extra files
os.remove(path=file_path)