## Import Dependencies

In [1]:
# import dependencies
import re
import nltk
import pandas as pd

# nltk dependencies 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# preprocessing dependencies
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# model dependencies
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## Load the Dataset

In [2]:
# read the dataset
imdb_df = pd.read_csv('IMDB_Dataset.csv')
# use only first 5000 rows of data
imdb_df = imdb_df[:5000]
# check first few rows
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Preprocess Text

In [3]:
# create preprocess_text function
def preprocess_text(text):

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    
    # filter only the text data and special characters
    processed_text = re.sub('[^A-Za-z0-9]+', ' ', processed_text)

    return processed_text

In [4]:
# apply the function preprocess_text to the dataset
imdb_df['review'] = imdb_df['review'].apply(preprocess_text)
imdb_df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode l...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically s family little boy jake think s zom...,negative
4,petter mattei s love time money visually stunn...,positive


## Label Encode Target Column

In [5]:
# encode the target column
lb = LabelEncoder()
imdb_df['sentiment'] = lb.fit_transform(imdb_df['sentiment'])

# check first few rows
imdb_df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode l...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically s family little boy jake think s zom...,0
4,petter mattei s love time money visually stunn...,1


In [9]:
# check label encoder classes 
lb.classes_

array(['negative', 'positive'], dtype=object)

## Vectorize Input Text for ML Model

In [10]:
# features and target 
preprocessed_data = list(imdb_df['review'])
target = list(imdb_df['sentiment'])

# convert the preprocessed data to feature vectors
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(preprocessed_data)
y = target

## Split Data in Train and Test Datasets

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train Logistic Regression

In [12]:
# Train a logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

## Check Logistic Regression

In [13]:
# Make predictions on the testing data
y_pred = clf.predict(X_test)

# print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87       530
           1       0.84      0.87      0.86       470

    accuracy                           0.86      1000
   macro avg       0.86      0.86      0.86      1000
weighted avg       0.86      0.86      0.86      1000

