In [1]:
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# 01 - Load the data

In [2]:
# data source: Kaggle - https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
df_original = pd.read_csv('data/01-IMDB Dataset.csv')
df_original.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df_original['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
# Sample 1K each positive and negative reviews
df = df_original.groupby('sentiment').sample(n=50, random_state=1).reset_index(drop=True)
df['sentiment'].value_counts()

sentiment
negative    50
positive    50
Name: count, dtype: int64

In [5]:
df

Unnamed: 0,review,sentiment
0,I recently viewed Manufactured Landscapes at t...,negative
1,I figured that any horror film with Orson Well...,negative
2,Run away from this movie. Even by B-movie stan...,negative
3,Oh dear. I was so disappointed that this movie...,negative
4,Below average blaxpoitation action / melodrama...,negative
...,...,...
95,Just as the new BSG wasn't what fans of the or...,positive
96,Spoiler This movie is about such a concept. Wi...,positive
97,I remember seeing this movie back when it was ...,positive
98,"This is a must see for independant movie fans,...",positive


In [6]:
# Examine a few reviews
df.loc[1, 'review']

"I figured that any horror film with Orson Welles in it would be weird. Necromancy sure was but it was a little too weird for it's own good. The film does indeed have a creepy feel as it deals with a coven of satanists/witches in a small town and a young woman's attempt to escape them. The director though seems to be deliberately trying to confuse the audience by using flashbacks and dream sequences. By the finale, there are too many unanswered questions. What's worse, as the story is so confusing, it's pretty hard to root for any of the characters. It seems odd that Welles would agree to headline this film especially since he doesn't have that much to do. Maybe someday they will put out a tape of the outtakes and bloopers from this movie. Now that would really be fun!"

In [7]:
df.loc[25, 'review']

'let me first say, i watched this movie around midnight, and usually there only is trash around this hour, but this movie broke the record<br /><br />first of all the main character is an old non attractive creepy guy, yet he gets to f*ck all girls that come on his path for example he goes to a shop, talks to a girl and then you see them f*ck<br /><br />secondly there are loads of sex scenes, and in many of them there is no nudity at all, i would not have been surprised if one of the characters in the movie would say: fast put your clothes on so we can f*ck!<br /><br />thirdly this movie should show what a sexual addiction can do to a man or a family, this movie only shows soft bad acted erotica it makes me wonder why those actors agreed to play in such trash'

# 02 - Examine word frequencies in positive and negative reviews

## 02-01-Word processing

In [8]:
# Download stopwords from NLTK
nltk.download('punkt_tab')
nltk.download('stopwords')
print(stopwords.words('english'))
stop_words = set(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/dsrivallabha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dsrivallabha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Tokenizing and stop word removal
inp_text = """This is a sample sentence, showing off </ br> the stop words filtration.""" 
word_tokens = word_tokenize(inp_text)
fs = [w for w in word_tokens if not w.lower() in stop_words]    
print ('word tokens', word_tokens)
print ('filtered sentence', fs)

word tokens ['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', '<', '/', 'br', '>', 'the', 'stop', 'words', 'filtration', '.']
filtered sentence ['sample', 'sentence', ',', 'showing', '<', '/', 'br', '>', 'stop', 'words', 'filtration', '.']


In [10]:
ps = PorterStemmer()
def stopwordremoval(inp_text):
    customlist = ['>', '<', '/', '.', ',', 'br']
    word_tokens = word_tokenize(inp_text)
    sw = [ps.stem(w) for w in word_tokens]
    # by making it lower case
    fs1 = [w for w in sw if not w.lower() in stop_words]
    fs = [w for w in fs1 if w not in customlist]
    return (fs)

In [11]:
df['filtered_words'] = df['review'].apply(lambda x: stopwordremoval(x))

## 02-02-Count Word Frequencies

In [12]:
word_groups = df.groupby('sentiment').agg({'filtered_words': sum})

  word_groups = df.groupby('sentiment').agg({'filtered_words': sum})


In [13]:
pos_words_list = word_groups.loc['positive', 'filtered_words']
neg_words_list = word_groups.loc['negative', 'filtered_words']

In [14]:
pwc = Counter(pos_words_list)
pos_word_count = {k: v for k, v in pwc.items() if v>5}
print (pos_word_count)

{'watch': 24, 'thi': 133, 'movi': 102, 'everi': 6, 'time': 29, 'wa': 101, 'tv': 7, '(': 92, 'lot': 10, ')': 91, 'becaus': 18, 'humor': 7, 'may': 11, 'critic': 9, 'star': 8, 'love': 29, '--': 7, 'fun': 7, 'comedi': 7, 'pleas': 6, 'teen': 9, 'actor': 11, 'probabl': 10, 'much': 12, 'like': 39, 'cast': 12, "'s": 135, 'show': 14, 'famili': 8, 'best': 19, 'though': 8, 'includ': 8, 'mani': 13, 'think': 14, 'recommend': 6, 'saw': 10, 'veri': 24, 'funni': 14, 'especi': 6, 'nice': 16, 'look': 25, 'film': 82, 'stori': 18, 'effect': 9, 'charact': 16, 'realli': 27, 'shine': 8, 'set': 9, 'anim': 7, 'world': 8, 'use': 8, 'base': 7, 'help': 9, 'one': 48, 'enough': 7, 'better': 9, 'doe': 26, 'even': 21, 'plot': 7, "'": 23, 'great': 27, 'act': 12, 'first': 13, '!': 50, 'charm': 7, 'long': 6, 'befor': 16, ';': 23, 'ever': 6, 'sinc': 12, 'well': 21, 'peopl': 9, 'know': 15, 'good': 16, 'bad': 8, 'hi': 59, 'premis': 6, 'noth': 9, 'new': 9, 'make': 22, 'main': 6, 'way': 11, 'made': 16, 'least': 8, '``': 85, 

In [15]:
print ('no of unique words in positive review is', len(pos_word_count.keys()))

no of unique words in positive review is 229


In [16]:
nwc = Counter(neg_words_list)
neg_word_count = {k: v for k, v in nwc.items() if v>5}
print (neg_word_count)

{'film': 71, 'wa': 88, 'movi': 122, 'becaus': 11, "'m": 18, 'fan': 8, "'s": 135, 'work': 18, 'believ': 9, 'doe': 18, 'good': 30, 'get': 29, 'across': 12, 'could': 18, "n't": 57, 'feel': 17, 'made': 15, 'complet': 7, 'book': 13, 'one': 57, 'reason': 6, 'thi': 151, 'use': 14, 'still': 8, 'like': 49, '(': 63, ')': 63, 'hi': 33, 'might': 7, 'better': 15, 'ani': 21, 'horror': 11, 'well': 17, 'would': 35, 'sure': 10, 'littl': 9, 'young': 6, 'director': 14, 'seem': 17, 'tri': 16, 'mani': 10, 'stori': 27, 'pretti': 9, 'charact': 19, 'much': 23, 'put': 11, 'realli': 16, 'fun': 7, '!': 36, 'run': 7, 'even': 29, 'b-movi': 7, 'also': 12, 'peopl': 16, 'worth': 6, 'lot': 8, 'see': 26, 'kid': 7, 'make': 26, 'whi': 10, 'ha': 28, 'person': 7, 'usual': 8, 'thing': 15, 'ridicul': 7, 'act': 22, 'direct': 11, 'present': 6, 'noth': 11, 'interest': 10, 'version': 6, 'least': 12, 'veri': 22, 'watch': 18, 'know': 13, 'great': 13, 'come': 15, 'ca': 8, 'happen': 6, 'big': 6, ':': 20, 'averag': 6, 'love': 20, 'pl

In [17]:
print ('no of unique words in negative review is', len(neg_word_count.keys()))

no of unique words in negative review is 192


## 02-03- Unique Words

In [18]:
# Positive words, not in negative reviews
pk = [k for k in list(pos_word_count.keys()) if not k in list(neg_word_count.keys())]
print (pk)

['tv', 'humor', 'critic', 'comedi', 'pleas', 'teen', 'probabl', 'show', 'best', 'though', 'includ', 'recommend', 'saw', 'funni', 'especi', 'nice', 'effect', 'shine', 'anim', 'world', 'base', 'help', 'enough', 'charm', 'long', 'befor', 'sinc', 'premis', 'new', 'main', 'add', 'life', "'ll", 'moment', 'hand', 'although', 'power', 'beauti', 'branagh', 'viewer', 'without', 'speak', 'keep', 'product', "'re", 'rather', 'alway', 'portray', "'ve", 'man', 'super', 'jack', 'appear', 'job', 'bit', 'special', 'event', 'fortun', 'tell', 'fact', 'part', 'past', 'leav', 'hope', 'enjoy', 'day', 'felt', 'hercul', 'perfect', 'australia', 'polit', 'histori', 'dismiss', 'govern', 'governor-gener', 'australian', 'law', 'section', 'howev', 'hold', 'write', 'final', 'differ', 'sever', 'cute', 'left', 'live', 'dure', 'find', 'matt', 'evil', 'hard', 'friend', 'murder', 'deliv', 'kubrick', 'wendi', 'danni', 'hotel', 'toni', 'cold', 'woman', 'blood', 'wilson', 'superhero', 'g-girl', 'thurman', 'jenni']


In [19]:
# Negative words, not in positive reviews
nk = [k for k in list(neg_word_count.keys()) if not k in list(pos_word_count.keys())]
print (nk)

['across', 'complet', 'reason', 'might', 'sure', 'director', 'put', 'run', 'b-movi', 'worth', 'kid', 'whi', 'person', 'usual', 'ridicul', 'present', 'ca', 'averag', 'eye', 'origin', 'script', 'role', 'old', 'someth', 'anyon', 'els', 'horribl', 'bore', 'shot', 'minut', 'worst', 'stupid', 'call', 'read', 'whole', 'mention', 'phone', 'wast', 'except', 'got', 'approach', 'high', 'olivi', 'kind', 'instead', 'london', 'give', 'pain', 'done', 'lack', 'anoth', '..', 'dull', 'crazi', 'convinc', 'annoy', 'monster', 'budget', 'around', '&', 'croc', 'poor', 'song', 'five', 'hous', 'danc', 'fred', 'joan', 'georg', 'number', '\uf0b7']
