In [2]:
import re
import nltk
import string
import pandas as pd
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [3]:
# load dataset: read the IMDB reviews CSV into a DataFrame
DATASET_PATH = 'dataset/IMDB Dataset.csv'
dataset = pd.read_csv(DATASET_PATH)

In [4]:
# view dataset: a bare DataFrame as the last expression is rendered by the
# notebook as a table (the first and last rows, with the middle elided)
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
# lower case: normalise every review to lower case, overwriting the
# 'review' column in place (the original casing is not kept)
dataset['review'] = dataset['review'].str.lower()

In [6]:
# remove html tags using regex
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. ``.`` does not cross newlines, so a tag that is
    split over several lines is left untouched.
    """
    return re.sub('<.*?>', r'', text)

# remove html tags using beautifulsoup
def strip_html(text):
    """Strip HTML markup from ``text`` and return only the visible text.

    Args:
        text: a string that may contain HTML tags.

    Returns:
        ``text`` unchanged when it contains no ``<...>`` span; otherwise the
        text content extracted by BeautifulSoup, with a single space inserted
        wherever markup separated two text fragments.
    """
    # cheap pre-check so plain-text input never reaches the parser
    if re.search(r'<.*?>', text) is None:
        return text
    soup = BeautifulSoup(text, "html.parser")
    # Without a separator, "end.<br /><br />the" collapses to "end.the" and the
    # two words fuse into one token once punctuation is removed; a space keeps
    # them apart.
    return soup.get_text(separator=' ')

# run the BeautifulSoup-based cleaner over every review
dataset['review'] = dataset['review'].apply(strip_html)

In [7]:
# remove https, http, www links
def check_and_remove_url(text):
    """Delete ``http://``, ``https://`` and ``www.`` links from ``text``.

    A link runs from its prefix up to the next whitespace character. Text
    without any link comes back unchanged.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# drop links from every review
dataset['review'] = dataset['review'].apply(check_and_remove_url)

In [8]:
# remove punctuation
punc = string.punctuation
# Deletion table for str.translate, built once: it only depends on `punc`, so
# rebuilding it for every review was loop-invariant work.
_PUNC_TABLE = str.maketrans('', '', punc)

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Characters are removed, not replaced by a space, so "don't" becomes
    "dont" and "well-known" becomes "wellknown".
    """
    return text.translate(_PUNC_TABLE)

# strip punctuation from every review
dataset['review'] = dataset['review'].apply(remove_punctuation)

In [9]:
# chatwords handling
# Chat abbreviation -> expansion table. Keys are upper case.
#
# Entries that are also ordinary English words ("TIME", "KISS", "GAL",
# "STATS") are deliberately absent: chat_conversion upper-cases every token
# before the lookup, so such keys would rewrite everyday words (for example
# "time" would become "Tears in my eyes") in otherwise normal text.
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek You",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    # key has no trailing "?": punctuation is stripped before the lookup and
    # a "?" can never sit in front of a \b word boundary
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

def remove_chatwords(text, words=None, flags=0):
    """Delete chat abbreviations that occur as whole words in ``text``.

    Args:
        text: the string to clean.
        words: iterable of abbreviations to delete. Defaults to the keys of
            the module-level ``chat_words`` table.
        flags: ``re`` flags used for matching. The default ``0`` matches
            case-sensitively, so lower-cased text does not match upper-case
            abbreviations; pass ``re.IGNORECASE`` to match any casing.

    Returns:
        ``text`` with each matched abbreviation replaced by the empty string.
        Surrounding whitespace is left as it was.
    """
    if words is None:
        words = chat_words
    alternatives = '|'.join(re.escape(word) for word in words)
    if not alternatives:
        return text
    # one alternation = one scan of the text instead of one scan per abbreviation
    return re.sub(r'\b(?:' + alternatives + r')\b', '', text, flags=flags)

def expand_chatwords(text, mapping=None):
    """Replace chat abbreviations in ``text`` with their full forms.

    Abbreviations are matched case-insensitively and only as whole words.

    Args:
        text: the string to expand.
        mapping: dict of abbreviation -> expansion. Defaults to the
            module-level ``chat_words`` table.

    Returns:
        ``text`` with every matched abbreviation replaced by its expansion.
    """
    if mapping is None:
        mapping = chat_words
    if not mapping:
        return text
    lookup = {abbr.upper(): full_form for abbr, full_form in mapping.items()}
    # longest abbreviations first, so the longer one wins when two could start
    # at the same position
    ordered = sorted(mapping, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(abbr) for abbr in ordered) + r')\b',
        flags=re.IGNORECASE,
    )
    # Single pass: text inserted by one expansion is never scanned again, so an
    # expansion that happens to contain another abbreviation is not rewritten a
    # second time. A callable replacement also inserts the expansion literally
    # (backslashes in it are not treated as regex template escapes).
    return pattern.sub(lambda m: lookup.get(m.group(0).upper(), m.group(0)), text)

def chat_conversion(text, mapping=None):
    """Expand chat abbreviations token by token.

    ``text`` is split on whitespace; each token whose upper-cased form is a
    key of ``mapping`` is replaced by the mapped expansion, every other token
    is kept as is. Tokens are re-joined with single spaces, so runs of
    whitespace (and leading/trailing whitespace) are collapsed.

    Args:
        text: the string to convert.
        mapping: dict of upper-case abbreviation -> expansion. Defaults to
            the module-level ``chat_words`` table.

    Returns:
        The converted string.
    """
    if mapping is None:
        mapping = chat_words
    return " ".join(mapping.get(token.upper(), token) for token in text.split())

# expand chat abbreviations in every review
dataset['review'] = dataset['review'].apply(chat_conversion)

In [None]:
# handling spelling mistakes
def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# spell-correct every review
# NOTE(review): this runs TextBlob once per row; presumably slow on the full
# dataset — confirm the runtime before a full Run All
dataset['review'] = dataset['review'].apply(correct_spelling)

In [None]:
# handling stopwords
# The stopword corpus is a separate download; fetch it when it is missing so
# this cell also runs on a fresh environment instead of raising LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

stopword = stopwords.words('english')
stopword

In [None]:
# split data into training and testing sets:
# reviews are the inputs, sentiment labels the targets; 20% is held out for
# testing and the fixed random_state makes the shuffle repeatable
x, y = dataset['review'], dataset['sentiment']

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)