In [1]:
#################################### Load and Explore the Data ##############################################################
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv(r"C:\Users\ciana\OneDrive\Documents\DCU\Final Year\MT412 - Professional Business Analytics\IMDB_Dataset.csv")

In [2]:
# Display the first few rows of the dataframe
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
# Display the first review
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [4]:
# Check for any missing values
print(df.isnull().sum())

review       0
sentiment    0
dtype: int64


In [5]:
# Get a basic description of the dataset
print(df.describe())

                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


In [6]:
# Check for duplicates
df.duplicated().sum()

418

In [7]:
# Remove duplicates
df.drop_duplicates(inplace = True)

In [8]:
# Check to see if duplicates are gone
df.duplicated().sum()

0

In [9]:
# What is the breakdown of positive review to negative reviews
df['sentiment'].value_counts()

positive    24884
negative    24698
Name: sentiment, dtype: int64

In [10]:
##################################### Data Processing ####################################################

In [11]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Clean text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    return text

In [13]:
# Clean the reviews
df['review'] = df['review'].apply(clean_text)

In [14]:
# Examining the text
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production The filming tech...  positive
2      I thought this was wonderful way to spend time...  positive
3      Basically there a family where little boy Jake...  negative
4      Petter Mattei Love in the Time of Money is vis...  positive
...                                                  ...       ...
49995  I thought this movie did down right good job I...  positive
49996  Bad plot bad dialogue bad acting idiotic direc...  negative
49997  I am Catholic taught in parochial elementary s...  negative
49998  I going to have to disagree with the previous ...  negative
49999  No one expects the Star Trek movies to be high...  negative

[49582 rows x 2 columns]


In [15]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.25, random_state=0)

# The test set is 25% and the training set is 75%

In [16]:
# Vectorise the text
## We are vectorising the text to convert the tokens into numerical values so we can model the data later.

vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data and transform it
X_train = vectorizer.fit_transform(X_train)

# Transform the test data
X_test = vectorizer.transform(X_test)

In [27]:
# Converting the sentimental labels to numeric values
y_train = y_train.apply(lambda x: 1 if x == 'positive' else 0)
y_test = y_test.apply(lambda x: 1 if x == 'positive' else 0)

# If the review is positive it will return a 1 and if it is negative it will return a 0.

In [18]:
# Ensuring sentiment labels were changed to binary
print(y_test)

166      1
28039    1
35960    1
1872     0
12728    0
        ..
17831    1
43510    0
20732    0
31910    1
47841    1
Name: sentiment, Length: 12396, dtype: int64


In [19]:
# If a review is positive it returns a 1 and if the review is neagtive it returns a 0

In [20]:
############################### Building Model and Training #########################################################

In [21]:
# I used a Random Forest Classifier to build this model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialise the Random Forest classifier with 100 trees
model = RandomForestClassifier(n_estimators=100, random_state=0)


In [22]:
# Train the model on the training data
model.fit(X_train, y_train)

In [23]:
#################################### Prediction and Evaluation ################################################

In [24]:
# Predict the sentiments on the test data
y_pred = model.predict(X_test)
print(y_pred)

[0 1 1 ... 0 1 1]


In [25]:
# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.84


In [26]:
# Detailed classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      6152
           1       0.84      0.83      0.84      6244

    accuracy                           0.84     12396
   macro avg       0.84      0.84      0.84     12396
weighted avg       0.84      0.84      0.84     12396

