In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df.shape

(50000, 2)

In [5]:
df.size

100000

In [6]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [7]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [8]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [9]:
type(df['review'][0])

str

In [10]:
len(df['review'][0])

1761

### Let's now LOWERCASE all the texts for Model Understanding Perspective ###

In [11]:
df['review'].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [12]:
type(df['review'].str.lower())

pandas.core.series.Series

In [13]:
df['lowerCase'] = df['review'].str.lower()

In [14]:
df

Unnamed: 0,review,sentiment,lowerCase
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production. <br /><br />the...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"petter mattei's ""love in the time of money"" is..."
...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,negative,i'm going to have to disagree with the previou...


### Let's now remove some unnecessary letters/words like HTML tags.

In [15]:
import re

In [16]:
pattern = "<.*?>"

In [17]:
df['lowerCase'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

In [18]:
# re.match(pattern=pattern,string=df['lowerCase'][0])

In [19]:
# re.findall(pattern,df['lowerCase'][0])

In [20]:
# html_tags = re.findall(pattern,df['lowerCase'][0])

In [21]:
# sample_string = df['lowerCase'][0]

In [22]:
# c = ""

In [23]:
# for i in html_tags:
#     if i in sample_string:
#         print(sample_string.index(i))
#         c = sample_string.replace(i,"")

In [24]:
def htmlTagRemover(text):
    pattern = re.compile('<.*?>')
    return pattern.sub('',text)

In [25]:
reviews = df['lowerCase'].values
htmlTagProcessed = []
for i in reviews:
    htmlTagProcessed.append(htmlTagRemover(i))

In [26]:
### Adding the new column in the DataFrame ###

In [27]:
df['htmlTagProcessed'] = htmlTagProcessed
df.rename(columns={'lowerCase':'lowerCaseProcessed'}, inplace=True)

In [28]:
df

Unnamed: 0,review,sentiment,lowerCaseProcessed,htmlTagProcessed
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production. <br /><br />the...,a wonderful little production. the filming tec...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"petter mattei's ""love in the time of money"" is...","petter mattei's ""love in the time of money"" is..."
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"bad plot, bad dialogue, bad acting, idiotic di...","bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,negative,i'm going to have to disagree with the previou...,i'm going to have to disagree with the previou...


In [29]:
df['htmlTagProcessed'][2]

'i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. the plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). while some may be disappointed when they realize this is not match point 2: risk addiction, i thought it was proof that woody allen is still fully in control of the style many of us have grown to love.this was the most i\'d laughed at one of woody\'s comedies in years (dare i say a decade?). while i\'ve never been impressed with scarlet johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.this may not be the crown jewel of his career, but it was wittier than "devil wears prada" and more interesting than "superman" a great comedy to go see with friends.'

### Let's see some examples of removing URLs

In [30]:
given_text = 'Github portal of Aditya Das: https://github.com/dasaditya1999'

In [31]:
pattern = re.compile('https.*')

In [32]:
processed_text = pattern.sub(r'',given_text)

In [33]:
processed_text

'Github portal of Aditya Das: '

### Let's now focus on removing punctuation Marks

In [34]:
import string, time

In [35]:
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [36]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [37]:
string.ascii_uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [38]:
string.capwords("ABCDE")

'Abcde'

In [39]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
def removePunctuation(text):
    punc = string.punctuation
    punc = punc.replace(punc[22],'')
    punc = '['+punc+']'
    pattern = re.compile(punc)
    return pattern.sub('',text)

In [41]:
import time
start = time.time()

puncRemovedTexts = []
texts = df['htmlTagProcessed'].values
for i in texts:
    puncRemovedTexts.append(removePunctuation(i))
    
end = time.time()
print('Total time taken to process the 50k data points: ',end-start)

Total time taken to process the 50k data points:  0.510706901550293


In [42]:
# puncRemovedTexts[0]

In [43]:
# df['htmlTagProcessed'][0]

In [44]:
df['removedPunctuation'] = puncRemovedTexts

In [45]:
df2 = df[['removedPunctuation','sentiment']]

In [46]:
# df2

In [47]:
# df

In [48]:
# df2['removedPunctuation'][0]

In [49]:
import string

In [50]:
exclude = string.punctuation
exclude = exclude.replace(exclude[21],'')

def removePunctuationOptimized(text):
    return text.translate(str.maketrans('','',exclude))

In [52]:
# processedText = []
# inputTexts = df['htmlTagProcessed'].values
# for i in inputTexts:
#     processedText.append(removePunctuationOptimized(inputTexts))

### Chat Word Treatment ###

In real life scenario the texts are mess. Those are sometimes are not understandable to a normal person & so
to the model. Hence, we have to convert those special words with the normal wordings/with it's full fledged 
meaning.

In [53]:
slang_full_form_pair = dict()

file = open('sms_slang_translator.txt','r')
data = file.readlines()
for i in data:
    try:
        arr = i.split('=')
        key = arr[0]
        value = arr[1].replace('\n','')
    except IndexError:
        continue
    slang_full_form_pair[key] = value

In [54]:
slang_full_form_pair

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [55]:
inputDataForChatWordProcessing = df2['removedPunctuation'].values

In [91]:
start = time.time()
for i in range(0,len(inputDataForChatWordProcessing)):
    for j in slang_full_form_pair.keys():
        if j in inputDataForChatWordProcessing[i]:
            print(j, slang_full_form_pair[j])
            target = df2['removedPunctuation'][i]
            df2['removedPunctuation'][i] = target.replace(j,slang_full_form_pair[j])
            
end = time.time()
print(end-start)

IMHO In My Honest/Humble Opinion


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['removedPunctuation'][i] = target.replace(j,slang_full_form_pair[j])


3.1396448612213135


In [92]:
# df2['removedPunctuation'][0] = df2['removedPunctuation'][0].replace('one','IMHO')

In [93]:
df2

Unnamed: 0,removedPunctuation,sentiment
0,In My Honest/Humble Opinion of the other revie...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,im going to have to disagree with the previous...,negative


### Let's now Handle the spelling of each word in the text

In [59]:
!pip install textBlob

Collecting textBlob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: textBlob
Successfully installed textBlob-0.17.1


In [60]:
from textblob import TextBlob

In [63]:
incorrectText = 'Food is one of the essntilo as per the scientst?'
textBlobObject = TextBlob(text=incorrectText)
textBlobObject.correct().string

'Good is one of the essntilo as per the scientist?'

In [72]:
# !pip install nltk

In [71]:
# !pip install autocorrect

In [67]:
from autocorrect import Speller

In [73]:
obj = Speller(lang='en')

In [75]:
obj('Mahchine Learning')

'Machine Learning'

In [76]:
obj('convinience')

'convenience'

### Stop Word Removal from the text

Now after doing LowerCasing, HTML tag removal, url link removal, punctuation removal, chat word handling, and Spell checking, we will apply Stop Word removal Technique to remove the stop words.

#### We can do the stop word removal by using the nltk module of python

In [82]:
from nltk.corpus import stopwords

In [86]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityanarayandas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [89]:
stopWords = stopwords.words('english')

In [96]:
inputText = df2['removedPunctuation'].values

In [109]:
def removeStopWords(text):
    processedWords = []
    words = text.split(' ')
    for i in words:
        if i in stopWords:
            i = i.replace(i,'')
            processedWords.append(i)
        else:
            processedWords.append(i)
    
    return ' '.join(processedWords)
            

In [110]:
processedText = []
for i in inputText:
    processedText.append(removeStopWords(i))

df2['StopWordRemovedText'] = processedText

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['StopWordRemovedText'] = processedText


In [112]:
len(df2['removedPunctuation'][0])

1713

In [114]:
len(df2['StopWordRemovedText'][0])

1340

### Now the most important part comes of TextPreProcessing i.e. Tokenisation.

Tokenisation is the process of separating the text as word wise, sentence wise, phrase wise or paragraph wise.

In [118]:
string = "Hey, how are you? Hope all is good. Yesterday match was horrible! They chased any how. Let's see what can we do @India.@"

In [119]:
re.split(pattern='[!,".? ]',string=string)

['Hey',
 '',
 'how',
 'are',
 'you',
 '',
 'Hope',
 'all',
 'is',
 'good',
 '',
 'Yesterday',
 'match',
 'was',
 'horrible',
 '',
 'They',
 'chased',
 'any',
 'how',
 '',
 "Let's",
 'see',
 'what',
 'can',
 'we',
 'do',
 '@India',
 '@']

We can use following methods to complete the Tokenisation.

1) Split() method
2) Regex method
3) NLTK module word_tokenizer & sent_tokensizer
4) Spacy module

### Stemming

Stemming is the process of bringing each & every word to it's original root word. Stemming helps model understanding the data better way.

### Lemmatization

Lemmatization is the same technique as the Stemming. The difference in between stemming & lemmatization is that lemmatization brings the word to root word in the same language as the original one. But stemming does not care about the language. 

In [125]:
import sys
a = int(10)
sys.getsizeof(a)

28

In [128]:
sum1 = 0
for i in range(0,29):
    sum1 = sum1 + 2**i
print(sum1)

536870911


In [137]:
b = 5
sys.getsizeof(b)
b.bit_length()

3

In [139]:
0.1 + 0.2

0.30000000000000004

In [140]:
0.3

0.3

In [142]:
round(0.30000000000000004,1) == 0.3

True

In [147]:
k = 2+ 3J

In [150]:
ord('a')

97

In [152]:
a = 'abuhuh'
type(a)

str

In [153]:
a,b,c = 1,2,3

In [155]:
a=b=c='yyy'

In [158]:
c

'yyy'

In [160]:
sys.version

'3.11.4 (main, Jul  5 2023, 08:54:11) [Clang 14.0.6 ]'

In [161]:
d = {'a':190,'b':180}

In [172]:
d1 = dict.fromkeys(d,[100,200])

In [173]:
d1

{'a': [100, 200], 'b': [100, 200]}

In [174]:
pd.DataFrame(d1)

Unnamed: 0,a,b
0,100,100
1,200,200


In [175]:
d1.items()

dict_items([('a', [100, 200]), ('b', [100, 200])])

In [180]:
a = 10
print('here1',id(a))
def fun():
    a = 100
    b = 20
    print('inside func',id(a))

a=10000
print('here2',id(a))
fun()

here1 4370982080
here2 11111897584
inside func 4370984960


In [204]:
s = {1,2,3}
s2 = {3,5,6}

In [207]:
# s3 = s.intersection(s2)
s2 = s&s2
s3

{3}

In [206]:
s3

{3}

In [192]:
s.add(9)

In [194]:
s.remove(9)

In [210]:
ord('S')

83

In [214]:
l = [1,2,34,5]

In [215]:
l.pop(2)

34

In [221]:
l.insert(0,5563)

In [222]:
l

[5563, 1, 2, 5, 7, 3]

In [249]:
print('{0:0.0f}'.format(7.0/3))

2


In [225]:
7.0/3

2.3333333333333335

In [252]:
print('a','b')

a b


In [232]:
txt = "For only {price:0.4f} dollars!"
print(txt.format(price = 49))

For only 49.0000 dollars!


In [255]:
l = ['a','b','zxc']
' '.join(l)

'a b zxc'

In [257]:
s1 = 'abc'
s2 = 'def'

print("%s%s" %(s1,s2))

abcdef


In [264]:
s = s1[0:2]+'z'

In [268]:
n = s.replace('z','k')

In [272]:
a  = 'Happy Birthday!'
a[-1:-10:-2]

'!ahrB'

In [274]:
type(id(a))

int

In [275]:
v = l.pop()

In [276]:
v

'zxc'

In [277]:
l

['a', 'b']