In [1]:
# DO NOT CHANGE this cell
import zipfile
import pandas as pd

data_path = "./data"
file_path = "./data/imdb.csv"
zip_path = './data/imdb.zip'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(data_path)
print('Unzipping finished')

# Load the IMDb dataset
data = pd.read_csv(file_path)

# Read the TSV file for scoring
word_positivity_df = pd.read_csv("./data/2000.tsv", sep="\t", header=None, names=["word", "positivity", "ampl"])

Unzipping finished


In [2]:
data = data.drop_duplicates()
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
import re
import string
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

[nltk_data] Downloading package omw-1.4 to /Users/damirs/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/damirs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/damirs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
"""
TODO: Calculate y_pred

Hints:
1. Extract words from each review
    Note: you can use regex for that
    Also you can use spacy but it takes an eternity to complete :)
2. Compute mean_sentiment for each review
3. If mean_sentiment < 0 consider it as negative otherwise positive
"""

# Creating a function to remove stopwords
def remove_stopwords(text):
    s = []
    for words in text.split():
        if words not in stopwords:
            s.append(words)
    a = s[:]
    s.clear()
    return ' '.join(a)

def preprocess_text(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    cleaned_text = ' '.join(tokens)
    return cleaned_text
    
data['review'] = data['review'].str.replace(r'<[^<>]*>', '', regex = True)
data['review'] = data['review'].str.replace(r'https ? ://\s+|www\.\s+', '', regex = True)
data['review'] = data['review'].str.lower()
data['review'] = data['review'].str.replace('[{}]'.format(string.punctuation), '', regex = True)
data['review']= data['review'].apply(remove_stopwords)
data['review'] = data['review'].apply(preprocess_text)
data['words'] = data['review'].apply(word_tokenize)

data_exploded = data.explode("words")
merged = pd.merge(data_exploded, word_positivity_df, left_on="words", right_on="word", how="left")
# group by review and calculate mean sentiment
grouped = merged.groupby("review").agg({"positivity": "mean"})

# reset the index to convert the review column back to a regular column
grouped = grouped.reset_index()

# add a new column to indicate whether the review is positive or negative
grouped["sentiment"] = np.where(grouped["positivity"] < 0, "negative", "positive")
data = data.merge(grouped, on='review', how='left')[['review','sentiment_x','words','sentiment_y']] 

y_true = data["sentiment_x"].apply(lambda review: 0 if review == 'negative' else 1)
data['sentiment_x'] = y_true
y_pred = data["sentiment_y"].apply(lambda review: 0 if review == 'negative' else 1)
data['sentiment_y'] = y_pred

In [6]:
data

Unnamed: 0,review,sentiment_x,words,sentiment_y
0,one reviewer mentioned watching 1 oz episode y...,1,"[one, reviewer, mentioned, watching, 1, oz, ep...",0
1,wonderful little production filming technique ...,1,"[wonderful, little, production, filming, techn...",1
2,thought wonderful way spend time hot summer we...,1,"[thought, wonderful, way, spend, time, hot, su...",1
3,basically there family little boy jake think t...,0,"[basically, there, family, little, boy, jake, ...",0
4,petter matteis love time money visually stunni...,1,"[petter, matteis, love, time, money, visually,...",1
...,...,...,...,...
49577,thought movie right good job wasnt creative or...,1,"[thought, movie, right, good, job, wasnt, crea...",1
49578,bad plot bad dialogue bad acting idiotic direc...,0,"[bad, plot, bad, dialogue, bad, acting, idioti...",1
49579,catholic taught parochial elementary school nu...,0,"[catholic, taught, parochial, elementary, scho...",1
49580,im going disagree previous comment side maltin...,0,"[im, going, disagree, previous, comment, side,...",1


In [8]:
"""
TODO: Calculate precision, recall, and F1 in the below cell

Note:
- To make autograder work don't change names of variables defined below,
change their value to proper values.
- Don't use libraries to compute them.
"""
# initialize counts to zero
tp = 0
fp = 0
fn = 0
tn = 0

# iterate over each prediction in y_pred
for i in range(len(y_pred)):
    if y_true[i] == 1:
        # positive example
        if y_pred[i] == 1:
            # true positive
            tp += 1
        else:
            # false negative
            fn += 1
    else:
        # negative example
        if y_pred[i] == 1:
            # false positive
            fp += 1
        else:
            # true negative
            tn += 1

       
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

# precision = 0.6083282159253736
# recall = 0.8359990355248352
# f1 = 0.7042196306765287

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Precision: 0.6083282159253736
Recall: 0.8359990355248352
F1: 0.7042196306765287


In [4]:
# DO NOT CHANGE this cell

import os

# Clean up extra files
os.remove(path=file_path)