In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df["review"] = df["review"].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Data preprocessing

In [5]:
import re
def remove_html(text):
    pattern = re.compile("<.*?>")
    return pattern.sub(r'', text)

df["review"] = df["review"].apply(remove_html)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [6]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['review'] = df["review"].apply(remove_url)

In [7]:
import string
punc = string.punctuation

def remove_punc(text):
    for char in punc:
        text = text.replace(char, '')
    return text

df["review"] = df["review"].apply(remove_punc)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# Word count and vocabulary

In [8]:
'''
def word_count(text):
    count=0
    for i in range(0, len(text)):
        if text[i] == " ":
            count=count+1
    return count+1
'''

def word_count(text):
    return len(text.split())

In [9]:
df["review"].apply(word_count).sum()

11312631

In [10]:
df.loc[49996, "review"]

'bad plot bad dialogue bad acting idiotic directing the annoying porn groove soundtrack that ran continually over the overacted script and a crappy copy of the vhs cannot be redeemed by consuming liquor trust me because i stuck this turkey out to the end it was so pathetically bad all over that i had to figure it was a fourthrate spoof of springtime for hitlerthe girl who played janis joplin was the only faint spark of interest and that was only because she could sing better than the originalif you want to watch something similar but a thousand times better then watch beyond the valley of the dolls'

In [11]:
def get_unique_words_corpus(series):
    all_text = " ".join(series.astype(str))
    unique_words = set(all_text.split())
    return unique_words

In [12]:
len(get_unique_words_corpus(df["review"]))

222572

# Bag of words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(df['review'])

# Shape: (num_samples, num_unique_words)
print("Bag of Words shape:", bow_matrix.shape)

Bag of Words shape: (50000, 221259)


In [14]:
#print(bow_vectorizer.vocabulary_)
vocab = bow_vectorizer.vocabulary_
for i, (word, index) in enumerate(vocab.items()):
    print(f"{word}: {index}")
    if i == 100:  # stop after 100 entries
        break

one: 139400
of: 138049
the: 194312
other: 141293
reviewers: 163115
has: 88563
mentioned: 123387
that: 194056
after: 7623
watching: 212020
just: 105600
oz: 143147
episode: 64538
youll: 219943
be: 20360
hooked: 93680
they: 195383
are: 14200
right: 163887
as: 15185
this: 195911
is: 101549
exactly: 66494
what: 214178
happened: 87824
with: 216172
methe: 123992
first: 73574
thing: 195504
struck: 187051
me: 122110
about: 4827
was: 211615
its: 102629
brutality: 29025
and: 11373
unflinching: 205163
scenes: 169347
violence: 209701
which: 214490
set: 173676
in: 97851
from: 77634
word: 217076
go: 82277
trust: 202075
not: 136009
show: 176055
for: 75469
faint: 69031
hearted: 89456
or: 140606
timid: 197973
pulls: 155815
no: 134651
punches: 155898
regards: 160512
to: 198356
drugs: 58773
sex: 173942
hardcore: 88121
classic: 38396
use: 207170
wordit: 217095
called: 31236
nickname: 134078
given: 81817
oswald: 141268
maximum: 121411
security: 171547
state: 184633
penitentary: 146037
it: 102050
focuses: 74

In [15]:
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
print(sorted_vocab[4800:4900])  # First 100 items by index

[('abortion', 4800), ('abortionist', 4801), ('abortionistlots', 4802), ('abortionnot', 4803), ('abortions', 4804), ('abortionthe', 4805), ('abortive', 4806), ('aborts', 4807), ('abos', 4808), ('abott', 4809), ('abou', 4810), ('abound', 4811), ('aboundafter', 4812), ('aboundand', 4813), ('aboundbottom', 4814), ('aboundclint', 4815), ('aboundcoming', 4816), ('abounddays', 4817), ('abounded', 4818), ('abounding', 4819), ('abounds', 4820), ('aboundseverything', 4821), ('aboundsnot', 4822), ('aboundsthis', 4823), ('aboundthe', 4824), ('aboundthird', 4825), ('aboundtoday', 4826), ('about', 4827), ('about2', 4828), ('about4they', 4829), ('about800', 4830), ('abouta', 4831), ('aboutafter', 4832), ('aboutagirly', 4833), ('aboutalice', 4834), ('aboutall', 4835), ('aboutalso', 4836), ('aboutamerican', 4837), ('aboutand', 4838), ('aboutanother', 4839), ('aboutanyway', 4840), ('aboutapart', 4841), ('aboutarent', 4842), ('aboutas', 4843), ('aboutat', 4844), ('aboutavp1', 4845), ('aboutbeauty', 4846)

In [16]:
bow_vectorizer.get_feature_names_out()[:50]

array(['00', '000', '0000000000001', '00000001', '000001', '0001',
       '00015', '001', '0010', '002', '00383042', '006', '0069', '007',
       '0079', '007s', '007the', '0080', '0083', '009', '00agent', '00s',
       '00schneider', '01', '010', '01000', '0101', '010606', '010707',
       '010guinea', '010objectionable', '010overall', '010ps', '0110',
       '012310', '0126', '0130', '013007', '02', '0205', '0210', '0230',
       '029', '02i', '03', '030', '03092005', '0310', '039', '03oct2009'],
      dtype=object)

In [17]:
print(bow_matrix[6].toarray())

[[0 0 0 ... 0 0 0]]


In [18]:
bow_vectorizer.transform(["00 i am a good boy 99"]).toarray()

array([[1, 0, 0, ..., 0, 0, 0]], dtype=int64)

# N-grams

## Bi-gram

In [19]:
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_matrix = bigram_vectorizer.fit_transform(df['review'])
print("Bigram shape:", bigram_matrix.shape)

Bigram shape: (50000, 2598869)


In [20]:
bigram_vectorizer.get_feature_names_out()[:50]

array(['00 agent', '00 comes', '00 for', '00 including', '00 its',
       '00 schneider', '000 000', '000 overboard', '000 to',
       '0000000000001 out', '00000001 of', '000001 chances',
       '0001 percent', '00015 seconds', '001 and', '001 on', '001 to',
       '001 was', '0010 and', '002 hope', '002 out', '00383042 kdos',
       '006 but', '0069 tries', '007 about', '007 adventure',
       '007 aficionado', '007 agent', '007 and', '007 appearances',
       '007 as', '007 at', '007 atmosphereon', '007 because', '007 debut',
       '007 didnt', '007 difficulty', '007 dr', '007 facing', '007 fans',
       '007 fatima', '007 films', '007 following', '007 franchise',
       '007 frwl', '007 gadgets', '007 game', '007 games', '007 gamesi',
       '007 in'], dtype=object)

In [21]:
vocab = bigram_vectorizer.vocabulary_
for i, (word, index) in enumerate(vocab.items()):
    print(f"{word}: {index}")
    if i == 100:  # stop after 100 entries
        break

one of: 1613060
of the: 1585778
the other: 2225746
other reviewers: 1642840
reviewers has: 1864423
has mentioned: 991276
mentioned that: 1410178
that after: 2186664
after watching: 65693
watching just: 2468890
just oz: 1232180
oz episode: 1664005
episode youll: 719813
youll be: 2586987
be hooked: 256005
hooked they: 1066566
they are: 2262393
are right: 179710
right as: 1871514
as this: 202559
this is: 2276539
is exactly: 1173210
exactly what: 745024
what happened: 2494019
happened with: 982458
with methe: 2540256
methe first: 1414758
first thing: 827632
thing that: 2267909
that struck: 2196556
struck me: 2113996
me about: 1395615
about oz: 30065
oz was: 1664080
was its: 2457927
its brutality: 1197532
brutality and: 354707
and unflinching: 144013
unflinching scenes: 2386611
scenes of: 1921988
of violence: 1587704
violence which: 2436092
which set: 2506430
set in: 1964470
in right: 1122651
right from: 1871811
from the: 881861
the word: 2240461
word go: 2556889
go trust: 928937
trust me: 

In [22]:
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
print(sorted_vocab[:100])  # First 100 items by index

[('00 agent', 0), ('00 comes', 1), ('00 for', 2), ('00 including', 3), ('00 its', 4), ('00 schneider', 5), ('000 000', 6), ('000 overboard', 7), ('000 to', 8), ('0000000000001 out', 9), ('00000001 of', 10), ('000001 chances', 11), ('0001 percent', 12), ('00015 seconds', 13), ('001 and', 14), ('001 on', 15), ('001 to', 16), ('001 was', 17), ('0010 and', 18), ('002 hope', 19), ('002 out', 20), ('00383042 kdos', 21), ('006 but', 22), ('0069 tries', 23), ('007 about', 24), ('007 adventure', 25), ('007 aficionado', 26), ('007 agent', 27), ('007 and', 28), ('007 appearances', 29), ('007 as', 30), ('007 at', 31), ('007 atmosphereon', 32), ('007 because', 33), ('007 debut', 34), ('007 didnt', 35), ('007 difficulty', 36), ('007 dr', 37), ('007 facing', 38), ('007 fans', 39), ('007 fatima', 40), ('007 films', 41), ('007 following', 42), ('007 franchise', 43), ('007 frwl', 44), ('007 gadgets', 45), ('007 game', 46), ('007 games', 47), ('007 gamesi', 48), ('007 in', 49), ('007 is', 50), ('007 jame

## Tri-gram

In [23]:
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
trigram_matrix = trigram_vectorizer.fit_transform(df['review'])
print("trigram shape:", trigram_matrix.shape)

trigram shape: (50000, 6702694)


In [24]:
trigram_vectorizer.get_feature_names_out()[:50]

array(['00 agent difficult', '00 agent level', '00 comes back',
       '00 for acting', '00 including the', '00 its worth',
       '00 schneider and', '00 schneider directly', '000 000 overboard',
       '000 to produce', '0000000000001 out of', '00000001 of the',
       '000001 chances of', '0001 percent it', '00015 seconds with',
       '001 and 360', '001 on the', '001 to believe', '001 was brief',
       '0010 and would', '002 hope this', '002 out of',
       '00383042 kdos okwwfst', '006 but the', '0069 tries to',
       '007 about the', '007 adventure as', '007 aficionado nevertheless',
       '007 agent and', '007 and decides', '007 and he', '007 and save',
       '007 and their', '007 and this', '007 appearances the',
       '007 as an', '007 at least', '007 atmosphereon the',
       '007 because trust', '007 debut in', '007 didnt swing',
       '007 difficulty our', '007 dr no', '007 facing the',
       '007 fans would', '007 fatima is', '007 films made',
       '007 following

In [25]:
vocab = trigram_vectorizer.vocabulary_
for i, (word, index) in enumerate(vocab.items()):
    print(f"{word}: {index}")
    if i == 100:  # stop after 100 entries
        break

one of the: 4065342
of the other: 3964404
the other reviewers: 5550694
other reviewers has: 4145618
reviewers has mentioned: 4606999
has mentioned that: 2364043
mentioned that after: 3488401
that after watching: 5309986
after watching just: 140380
watching just oz: 6317163
just oz episode: 3084529
oz episode youll: 4197721
episode youll be: 1715816
youll be hooked: 6675952
be hooked they: 725120
hooked they are: 2587155
they are right: 5720725
are right as: 528589
right as this: 4620227
as this is: 598860
this is exactly: 5781418
is exactly what: 2865797
exactly what happened: 1776858
what happened with: 6388439
happened with methe: 2341668
with methe first: 6531824
methe first thing: 3496425
first thing that: 1980846
thing that struck: 5745323
that struck me: 5355995
struck me about: 5160305
me about oz: 3452822
about oz was: 52668
oz was its: 4197862
was its brutality: 6273682
its brutality and: 2998685
brutality and unflinching: 939329
and unflinching scenes: 432262
unflinching scen

In [26]:
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
print(sorted_vocab[:100])  # First 100 items by index

[('00 agent difficult', 0), ('00 agent level', 1), ('00 comes back', 2), ('00 for acting', 3), ('00 including the', 4), ('00 its worth', 5), ('00 schneider and', 6), ('00 schneider directly', 7), ('000 000 overboard', 8), ('000 to produce', 9), ('0000000000001 out of', 10), ('00000001 of the', 11), ('000001 chances of', 12), ('0001 percent it', 13), ('00015 seconds with', 14), ('001 and 360', 15), ('001 on the', 16), ('001 to believe', 17), ('001 was brief', 18), ('0010 and would', 19), ('002 hope this', 20), ('002 out of', 21), ('00383042 kdos okwwfst', 22), ('006 but the', 23), ('0069 tries to', 24), ('007 about the', 25), ('007 adventure as', 26), ('007 aficionado nevertheless', 27), ('007 agent and', 28), ('007 and decides', 29), ('007 and he', 30), ('007 and save', 31), ('007 and their', 32), ('007 and this', 33), ('007 appearances the', 34), ('007 as an', 35), ('007 at least', 36), ('007 atmosphereon the', 37), ('007 because trust', 38), ('007 debut in', 39), ('007 didnt swing', 

In [27]:
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
print(sorted_vocab[38000:38100])  # First 100 items by index

[('9yearoldseven the oneliners', 38000), ('a1 airhead trying', 38001), ('a1 and the', 38002), ('a1 in my', 38003), ('a10 out of', 38004), ('a2 is better', 38005), ('a26 both designations', 38006), ('a2nd story waxworks', 38007), ('a3 skywarriors the', 38008), ('a320 shuttle passengers', 38009), ('a4th story the', 38010), ('a5 vigilantes and', 38011), ('a50 minute episode', 38012), ('a5zo al shai6an', 38013), ('a666333 has articulated', 38014), ('aa and dates', 38015), ('aa antics and', 38016), ('aa as mindless', 38017), ('aa cultrehab many', 38018), ('aa doctor she', 38019), ('aa group but', 38020), ('aa group where', 38021), ('aa is violent', 38022), ('aa jaega each', 38023), ('aa level and', 38024), ('aa meetings and', 38025), ('aa meetings are', 38026), ('aa meetings bill', 38027), ('aa meri life', 38028), ('aa milnes books', 38029), ('aa presentation and', 38030), ('aa rating and', 38031), ('aa the acting', 38032), ('aa uo uo', 38033), ('aa was not', 38034), ('aaa ball for', 38035)

# TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (50000, 221259)


In [30]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['review'])  # Just fit, don't need transform here

# Get IDF values and their corresponding words
idf_values = tfidf_vectorizer.idf_
words = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame for easy viewing
idf_df = pd.DataFrame({'word': words, 'idf': idf_values})

idf_df = idf_df.sort_values(by='idf')

idf_df.head(20)  # Top 20 common words

Unnamed: 0,word,idf
194312,the,1.009303
11373,and,1.036601
138049,of,1.05321
198356,to,1.063002
195911,this,1.107538
101549,is,1.112428
97851,in,1.131268
102050,it,1.17069
194056,that,1.228804
75469,for,1.345896
