## Import Dependencies

In [17]:
# import dependencies
import re
import nltk
import pandas as pd

# NLTK dependencies
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [7]:
# need to download required NLTK resources -- First Time Only --
import ssl 
try:
    _create_unverified_http_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_http_context

# download all NLTK resources
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_da

[nltk_data]    | Downloading package omw-1.4 to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/opinion_lexicon.zip.
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    | Downloading package paradigms to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/paradigms.zip.
[nltk_data]    | Downloading package pe08 to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/pe08.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Downloading package pil to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/pil.

[nltk_data]    |   Unzipping corpora/wordnet2022.zip.
[nltk_data]    | Downloading package wordnet31 to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    | Downloading package wordnet_ic to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/wordnet_ic.zip.
[nltk_data]    | Downloading package words to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/words.zip.
[nltk_data]    | Downloading package ycoe to
[nltk_data]    |     /Users/gouravsinghbais/nltk_data...
[nltk_data]    |   Unzipping corpora/ycoe.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection all


True

## Load the Dataset

In [105]:
# read the dataset
imdb_df = pd.read_csv('IMDB_Dataset.csv')
# use only first 5000 rows of data
imdb_df = imdb_df[:5000]
# check first few rows
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Preprocess Text

In [106]:
# create preprocess_text function
def preprocess_text(text):

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    
    # filter only the text data and special characters
    processed_text = re.sub('[^A-Za-z0-9]+', ' ', processed_text)

    return processed_text

In [107]:
# apply the function preprocess_text to the dataset
imdb_df['review'] = imdb_df['review'].apply(preprocess_text)
imdb_df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode l...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically s family little boy jake think s zom...,negative
4,petter mattei s love time money visually stunn...,positive


## NLTK Sentiment Analyzer

In [93]:
# initialize NLTK sentiment analyzer
analyzer = SentimentIntensityAnalyzer()


# create get_sentiment function
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    
    if scores['neg'] > scores['pos']:
        sentiment = 'negative'
    elif scores['pos'] > scores['neg']:
        sentiment = 'positive'
    else:
        sentiment = 'neutral'

    return sentiment

In [94]:
# apply get_sentiment function
imdb_df['sentimentPredicted'] = imdb_df['review'].apply(get_sentiment)
imdb_df.head()

Unnamed: 0,review,sentiment,sentimentPredicted
0,one reviewer mentioned watching 1 oz episode l...,positive,negative
1,wonderful little production br br filming tech...,positive,positive
2,thought wonderful way spend time hot summer we...,positive,positive
3,basically s family little boy jake think s zom...,negative,negative
4,petter mattei s love time money visually stunn...,positive,positive


In [95]:
## check performance
from sklearn.metrics import classification_report
print(classification_report(imdb_df['sentiment'], imdb_df['sentimentPredicted']))

              precision    recall  f1-score   support

    negative       0.80      0.48      0.60      2532
     neutral       0.00      0.00      0.00         0
    positive       0.62      0.87      0.73      2468

    accuracy                           0.68      5000
   macro avg       0.47      0.45      0.44      5000
weighted avg       0.71      0.68      0.66      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
