# 0. Datasets Used
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

# 1. Loading Dataset

In [2]:
import pandas as pd
df = pd.read_csv("./datasets/IMDB Dataset.csv")

In [3]:
print(df.shape)
df.head(5)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# 2. Preprocessing

## 2.1 Basic Preprocessing

### 2.1.1 Lowercase

In [4]:
df.review = df.review.str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### 2.1.2 Removing HTML Tags

In [5]:
import re
# Compiled once at definition time instead of on every call.
# `[^>]*` (unlike `.*?`) also matches newlines, so a tag whose attributes wrap
# onto a second line is still removed.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def remove_html_tags(text):
    """Return `text` with every `<...>` tag removed.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The input with each tag replaced by the empty string; the text
        between tags is left untouched.
    """
    return _HTML_TAG_RE.sub('', text)
remove_html_tags("<br /><h1>Heading</h1>")

'Heading'

In [6]:
df.review = df.review.apply(remove_html_tags)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### 2.1.3 Removing URLs

In [7]:
# Removing URLS
def remove_url(text):
    """Delete every URL from `text`.

    A URL here is any whitespace-free run that starts with ``http://``,
    ``https://`` or ``www.``; each match is replaced by the empty string.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
remove_url("check out the dataset https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

'check out the dataset '

In [8]:
df.review = df.review.apply(remove_url)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### 2.1.4 Removing Punctuation

In [9]:
import string
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Translation table that deletes every character of string.punctuation.
# Built once here rather than inside the function, which is applied to each
# review in turn.
_PUNC_TABLE = str.maketrans("", "", string.punctuation)


def remove_punc(text):
    """Return `text` with all ASCII punctuation characters removed.

    Args:
        text: Input string.

    Returns:
        The input without any character found in ``string.punctuation``;
        letters, digits and whitespace are unchanged.
    """
    return text.translate(_PUNC_TABLE)

remove_punc("Hello, I am Ali's friend!")

'Hello I am Alis friend'

In [11]:
df.review = df.review.apply(remove_punc)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### 2.1.5 Abbreviations

In [12]:
# Abbreviation -> expansion table consumed by abbv_conv.
# Every key is upper-case because abbv_conv looks tokens up with
# `token.upper()`; a lower-case key could never be matched.
abbv = {
    "AFAIK": "as far as I know",
    "IMO": "in my opinion",
    "IMHO": "in my humble opinion",
    "LGTM": "looks good to me",
    "AKA": "also known as",
    "ASAP": "as soon as possible",
    "BTW": "by the way",
    "FAQ": "frequently asked questions",
    "DIY": "do it yourself",
    "DM": "direct message",
    "FYI": "for your information",
    "IC": "i see",
    "IOW": "in other words",
    "IIRC": "If I Remember Correctly",
    "ICYMI": "In case you missed it",
    "CUZ": "because",
    "COS": "because",
    "NV": "nevermind",
    "PLZ": "please",
}

In [13]:
def abbv_conv(text):
    """Expand known abbreviations in `text`, matching them case-insensitively.

    Args:
        text: String whose space-separated tokens are checked against the
            module-level `abbv` table.

    Returns:
        `text` with every token that equals a key of `abbv` (ignoring case)
        replaced by its expansion; all other tokens are kept as they are.
    """
    # Normalise the keys so that an entry stored in lower or mixed case is
    # still reachable; looking up token.upper() in the raw table misses it.
    expansions = {key.upper(): value for key, value in abbv.items()}
    # split(" ") rather than split(): runs of spaces survive the round trip.
    return " ".join(
        expansions.get(token.upper(), token) for token in text.split(" ")
    )

abbv_conv("FYI Islamabad is the second most beautiful city")

'for your information Islamabad is the second most beautiful city'

In [14]:
df.review = df.review.apply(abbv_conv)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### 2.1.6 Spelling Correction

In [15]:
from textblob import TextBlob

def spell_correct(text):
    """Return `text` after passing it through TextBlob's `correct()`."""
    return TextBlob(text).correct().string

spell_correct("hello frm the othr side")

'hello from the other side'

### 2.1.7 Removing Stopwords

In [16]:
from nltk.corpus import stopwords

stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [17]:
# Loaded once. The word list used to be requested again for every single token
# and then searched linearly; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def remove_stopwords(text):
    """Return `text` without English stop words.

    Args:
        text: String of space-separated tokens.

    Returns:
        The tokens whose lower-cased form is not in NLTK's English stop-word
        list, joined by single spaces. Kept tokens retain their original case.
    """
    return " ".join(
        token for token in text.split(" ")
        if token.lower() not in _ENGLISH_STOPWORDS
    )

remove_stopwords("I am felling myself today")

'felling today'

### 2.1.8 Handling Emojis

In [18]:
# Code-point ranges treated as emoji; joined into one character class below.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
)
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Return `text` with every run of characters in the emoji ranges removed."""
    return emoji_pattern.sub('', text)

In [19]:
text = "This dog \U0001f602"
text

'This dog 😂'

In [20]:
remove_emoji(text)

'This dog '

In [21]:
import emoji

emoji.demojize(text)

'This dog :face_with_tears_of_joy:'

## 2.2 Advanced Preprocessing

### 2.2.1 Tokenization

#### 2.2.1.1 Using Split Function

In [22]:
sent1 = "I am from Pakistan"
sent2 = "I am from Pakistan. I was born in Khanewal. Soon we moved to Islambad"
sent3 = "I am going to Pakistan!"

In [23]:
# Word Split
sent1.split(" ")

['I', 'am', 'from', 'Pakistan']

In [24]:
# Sentence Split
sent2.split(".")

['I am from Pakistan', ' I was born in Khanewal', ' Soon we moved to Islambad']

In [25]:
# ! mark included
sent3.split()

['I', 'am', 'going', 'to', 'Pakistan!']

#### 2.2.1.2 Regular Expressions

In [26]:
import re
sent4 = "I am going to Karachi!"
# Raw string: "\w" inside a normal literal is an invalid escape sequence, which
# recent Python versions warn about at compile time.
# The pattern keeps runs of word characters and apostrophes, dropping the "!".
tokens = re.findall(r"[\w']+", sent4)
tokens

['I', 'am', 'going', 'to', 'Karachi']

#### 2.2.1.3 NLTK

In [27]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [28]:
sent5 = "Hi, I am going to Faisalabad!"
word_tokenize(sent5)

['Hi', ',', 'I', 'am', 'going', 'to', 'Faisalabad', '!']

#### 2.2.1.4 Spacy

In [29]:
import spacy

# python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [30]:
nlp(sent1)

I am from Pakistan

In [31]:
nlp(sent2)

I am from Pakistan. I was born in Khanewal. Soon we moved to Islambad

In [32]:
nlp(sent3)

I am going to Pakistan!

In [33]:
nlp(sent4)

I am going to Karachi!

In [34]:
nlp(sent5)

Hi, I am going to Faisalabad!

### 2.2.2 Stemming

In [35]:
"""
Stemming notes:
- the output is sometimes not a valid English word
- not the best option when the resulting word will be shown to the reader
- faster than lemmatization
"""

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(text):
    """Stem every whitespace-separated word of `text` with the stemmer `ps`.

    Returns the stems joined by single spaces.
    """
    stems = []
    for word in text.split():
        stems.append(ps.stem(word))
    return " ".join(stems)

In [36]:
stem_words("walk walking walked walks")

'walk walk walk walk'

### 2.2.3 Lemmatization

In [37]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Geetu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
"""
Lemmatization notes:
- the output is an English word
- the best option when the resulting word will be shown to the reader
- slower than stemming
- the root word it returns is called the lemma
"""

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize_word(text):
    """Lemmatize every whitespace-separated word of `text` as a verb.

    Each word is passed to `wordnet_lemmatizer.lemmatize` with ``pos="v"``;
    the results are joined by single spaces.
    """
    lemmas = []
    for word in text.split():
        lemmas.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
    return " ".join(lemmas)

lemmatize_word("walk walking walked walks walker")

'walk walk walk walk walker'

## 2.3 Feature Extraction / Word Embedding

### 2.3.1 Bag of Words

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: single-word counts over a deliberately tiny vocabulary.
cv = CountVectorizer(
    lowercase=True,         # lower-case the text before tokenizing
    stop_words='english',   # use scikit-learn's built-in English stop-word list
    max_features=10,        # cap the vocabulary at 10 terms
    binary=False,           # keep term counts rather than 0/1 presence flags
)

In [40]:
bow = cv.fit_transform(df.review)

In [41]:
cv.vocabulary_

{'just': 4,
 'great': 3,
 'really': 7,
 'time': 9,
 'movie': 6,
 'film': 1,
 'like': 5,
 'good': 2,
 'story': 8,
 'bad': 0}

### 2.3.2 Bag of N-grams

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-n-grams vectorizer. NOTE(review): this rebinds `cv`, replacing the
# unigram vectorizer created in the Bag of Words section.
cv = CountVectorizer(
    lowercase=True,         # lower-case the text before tokenizing
    stop_words='english',   # use scikit-learn's built-in English stop-word list
    max_features=10,        # cap the vocabulary at 10 terms
    binary=False,           # keep term counts rather than 0/1 presence flags
    ngram_range=(2, 2),     # (min_n, max_n): two-word sequences (bigrams) only
)

In [43]:
bow = cv.fit_transform(df.review[:10])

In [44]:
cv.vocabulary_

{'pulls punches': 8,
 'punches regards': 9,
 'production filming': 5,
 'production great': 6,
 'proof woody': 7,
 'mr mattei': 1,
 'human relations': 0,
 'probably alltime': 3,
 'probably wouldnt': 4,
 'original cast': 2}

### 2.3.3 TF-IDF

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with library defaults.
tfidf = TfidfVectorizer()
# Fit on the first 10 reviews only and convert the result to a dense array.
# NOTE(review): `bow` is reused from the bag-of-words cells but now holds
# TF-IDF weights, not raw counts.
bow = tfidf.fit_transform(df.review[:10]).toarray()

In [46]:
tfidf.idf_[:10]

array([2.29928298, 2.70474809, 2.70474809, 2.70474809, 2.70474809,
       2.70474809, 2.70474809, 1.6061358 , 2.70474809, 2.29928298])

In [47]:
tfidf.get_feature_names_out()[:10]

array(['10', '15', '1990', '1score', '25', '70s', '950', 'about',
       'accustomed', 'acting'], dtype=object)