In [None]:
import nltk
# Fetch the NLTK data packages for this notebook. The log output of this cell
# shows each package being reported as already up-to-date when it is cached.
# NOTE(review): 'punkt' looks unused by the visible cells, which tokenize with
# RegexpTokenizer — confirm before removing.
nltk.download('punkt')
# Backs `stopwords.words('english')`.
nltk.download('stopwords')
# Backs `WordNetLemmatizer`.
nltk.download('wordnet')
# NOTE(review): presumably the Open Multilingual WordNet data required
# alongside 'wordnet' — confirm against the installed NLTK version.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from collections import defaultdict
import math

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import pandas as pd

In [None]:
from google.colab import files
# Interactive upload through the Colab helper, so this cell needs the
# `google.colab` package (i.e. a Colab runtime). The result is later indexed
# by filename: `uploaded['IMDB Dataset.csv']`.
uploaded = files.upload()

In [None]:
import io

# Parse the uploaded CSV into a DataFrame. The upload result is indexed by
# filename, and the stored bytes are wrapped in BytesIO so pandas can read
# them like a file.
df = pd.read_csv(io.BytesIO(uploaded['IMDB Dataset.csv']))

# A bare `df.shape` in the middle of a cell is evaluated and thrown away, so
# print it explicitly; only the final expression of the cell is displayed.
print(df.shape)
df.head()

In [None]:
lemmatizer = WordNetLemmatizer()

# word_counts[word][0] = occurrences of word in negative reviews
# word_counts[word][1] = occurrences of word in positive reviews
word_counts = defaultdict(lambda: [0, 0]) # returns [0, 0] by default if the key does not exist

# Stored as a set: the stop-word check runs once per token in both the counting
# and the scoring loops, and a set answers `token not in STOP_WORDS` in constant
# time instead of scanning the whole list on every token.
STOP_WORDS = set(stopwords.words('english'))

# Tokens are the matches of r'\w+' (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

# Labels as a plain list, read alongside the reviews in the counting loop.
sentiment = list(df['sentiment'])

# NOTE(review): `done` is not referenced by any of the visible cells.
done = 0

# Number of non-stop-word tokens counted per class (denominators of the
# per-word probabilities).
total_positive_words = 0
total_negative_words = 0

# keep track of the number of positive and negative reviews (prior probabilities)
total_positive_reviews = 0
total_negative_reviews = 0

In [None]:
# One pass over the labelled reviews: tally the reviews per class and, for every
# token that is kept, its per-class occurrence count. Any label other than
# 'positive' is counted as negative.
for review, label in zip(list(df['review']), sentiment):
    is_positive = label == 'positive'
    if is_positive:
        total_positive_reviews += 1
    else:
        total_negative_reviews += 1

    # Index into a word_counts entry: 1 = positive tally, 0 = negative tally.
    column = 1 if is_positive else 0

    for raw_token in tokenizer.tokenize(review):
        # Lowercase, then lemmatize; the stop-word filter is applied to the
        # lemmatized form.
        token = lemmatizer.lemmatize(raw_token.lower())
        if token in STOP_WORDS:
            continue

        word_counts[token][column] += 1
        if is_positive:
            total_positive_words += 1
        else:
            total_negative_words += 1

In [None]:
# Keep only the 5000 most frequent tokens, ranked by combined negative +
# positive occurrences. The result is wrapped in a defaultdict immediately so
# that `word_counts` never stops being a mapping: binding the name to the bare
# sorted list would break `word_counts[word]` lookups and would make re-running
# this cell fail, because a list has no `.items()`.
word_counts = defaultdict(
    lambda: [0, 0],
    sorted(word_counts.items(), key=lambda x: x[1][0] + x[1][1], reverse=True)[:5000],
)


In [None]:
# Rebuild the lookup table so that tokens without an entry read as [0, 0].
# `dict(...)` accepts either (token, counts) pairs or an existing mapping.
retained_counts = dict(word_counts)
word_counts = defaultdict(lambda: [0, 0])
word_counts.update(retained_counts)

In [None]:
def calculate_word_probability(word, sentiment, vocab_size=5000):
    """Return the add-one smoothed log-probability of `word` for a class.

    Args:
        word: Token to look up in the module-level `word_counts` table.
        sentiment: 'positive' selects the positive counts; any other value
            selects the negative counts.
        vocab_size: Value added to the class token total in the denominator
            of the smoothed estimate. Defaults to 5000, the constant that was
            previously hard-coded here.

    Returns:
        log((count + 1) / (class_total + vocab_size)) as a float.
    """
    # Use .get() rather than indexing: `word_counts` is a defaultdict, so
    # `word_counts[word]` would insert every unseen word as a new key and the
    # table would silently grow while scoring reviews.
    counts = word_counts.get(word, (0, 0))
    if sentiment == 'positive':
        return math.log((counts[1] + 1) / (total_positive_words + vocab_size))
    else:
        return math.log((counts[0] + 1) / (total_negative_words + vocab_size))


In [None]:
def calculate_review_probability(review, sentiment):
    """Return the log-score of `review` under the given sentiment class.

    The score starts at the log prior, which is the number of reviews counted
    for the class divided by `len(df)`. It then adds the log-probability of
    every token that survives lowercasing, lemmatization and the stop-word
    filter. Any `sentiment` other than 'positive' is scored as negative.
    """
    if sentiment == 'positive':
        class_review_count = total_positive_reviews
    else:
        class_review_count = total_negative_reviews
    log_score = math.log(class_review_count / len(df))

    for raw_token in tokenizer.tokenize(review):
        normalized = lemmatizer.lemmatize(raw_token.lower())
        if normalized in STOP_WORDS:
            continue
        log_score += calculate_word_probability(normalized, sentiment)
    return log_score


In [None]:
def predict(review):
    """Classify `review` as 'positive' or 'negative'.

    Both class scores are computed with `calculate_review_probability`;
    'positive' is returned only when its score is strictly greater, so a tie
    is labelled 'negative'.
    """
    positive_score = calculate_review_probability(review, 'positive')
    negative_score = calculate_review_probability(review, 'negative')
    return 'positive' if positive_score > negative_score else 'negative'

In [None]:
# Compare the predicted label of every review in `df` with its dataset label.
# NOTE(review): the word counts were built from this same `df['review']`
# column, so this measures accuracy on the training data.
sentiments = list(df['sentiment'])
correct = 0
incorrect = 0
for text, expected in zip(list(df['review']), sentiments):
    if predict(text) != expected:
        incorrect += 1
    else:
        correct += 1

In [None]:
# Accuracy: share of reviews whose predicted label matched the dataset label.
total_evaluated = correct + incorrect
print(correct / total_evaluated)

NameError: ignored