In [1]:
import pandas as pd

In [9]:
imdb_datasets = pd.read_csv('../data/raw/IMDB Dataset.csv')
imdb_datasets.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [10]:
imdb_datasets['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [11]:
# convert the reviews to lowercase
imdb_datasets['review'].value_counts()

imdb_datasets['review'] = imdb_datasets['review'].fillna('').str.lower()


In [12]:
imdb_datasets.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [13]:
# remove punctuaution
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_review(text):
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Remove very long repeated characters (like aaaaaaa)
    text = re.sub(r'(.)\1{3,}', r'\1', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # Lemmatization (convert words to there base form)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    return text

imdb_datasets['cleaned_review'] = imdb_datasets['review'].apply(clean_review)

[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
imdb_datasets.head(10)

Unnamed: 0,review,sentiment,cleaned_review
0,one of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,a wonderful little production. <br /><br />the...,positive,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"petter mattei's ""love in the time of money"" is...",positive,petter matteis love time money visually stunni...
5,"probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,i sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"this show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,encouraged by the positive comments about this...,negative,encouraged positive comment film looking forwa...
9,if you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


In [15]:
# Check the cleaned text
print(imdb_datasets[['review', 'cleaned_review']].head())

# Check the most common words
from collections import Counter

all_words = ' '.join(imdb_datasets['cleaned_review']).split()
word_counts = Counter(all_words)
print(word_counts.most_common(20))

                                              review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production. <br /><br />the...   
2  i thought this was a wonderful way to spend ti...   
3  basically there's a family where a little boy ...   
4  petter mattei's "love in the time of money" is...   

                                      cleaned_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  
[('movie', 99025), ('film', 89809), ('one', 52677), ('like', 39790), ('time', 29396), ('good', 28615), ('character', 27573), ('get', 24436), ('even', 24286), ('story', 24229), ('would', 24001), ('make', 23564), ('see', 23494), ('really', 22904), ('scene', 20706), ('much', 18897), ('well', 18630), ('people', 17979), ('great', 17805)

In [17]:
imdb_datasets.to_csv('../data/processed/Cleaned IMDB Dataset.csv', index=False)