## Importing the important libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
colors = ['#235E72']

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,Movie Name,IMDb Rating,Popularity,Release Date,Box Office,Runtime,Genre,Director,Cast,Production Company,Country,Language
0,The Kerala Story Bangla,6.5,,,,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],[],,['Bengali']
1,Generation Z,,,,,,"['Horror', 'Comedy']",[],"['Chris Reilly', 'Ellora Torchia', 'Viola Pret...","['All3Media International', 'The Forge', 'Zwei...",,[]
2,Criminal Justice: Adhura Sach,7.7,,"August 26, 2022 (India)",,45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...","['Applause Entertainment Ltd.', 'BBC Studios']",India,['Hindi']
3,Madhil Mel Kaadhal,,,,,,['Romance'],['Anjana Ali Khan'],"['Mugen Rao', 'Divya Bharathi', 'Sakshi Agarwal']",[],,['Tamil']
4,Saawariya,5.2,,"November 9, 2007 (India)","$18,525,631",2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']","['SPE Films', 'SLB Films Pvt. Ltd.']",,['Hindi']


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10004 entries, 0 to 10003
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Movie Name          9920 non-null   object 
 1   IMDb Rating         8671 non-null   float64
 2   Popularity          187 non-null    object 
 3   Release Date        6249 non-null   object 
 4   Box Office          2570 non-null   object 
 5   Runtime             7699 non-null   object 
 6   Genre               10004 non-null  object 
 7   Director            10004 non-null  object 
 8   Cast                10004 non-null  object 
 9   Production Company  10004 non-null  object 
 10  Country             5998 non-null   object 
 11  Language            10004 non-null  object 
dtypes: float64(1), object(11)
memory usage: 938.0+ KB


## Data Cleaning

In [4]:
# Droping the columns that have many null values or are not relevant to the analysis
df = df.drop(['Movie Name', 'Popularity', 'Production Company', 'Box Office', 'Language', 'Country'], axis=1)

In [5]:
#Drop rows with missing values in IMDB rating
df = df.dropna(subset=['IMDb Rating'])

In [6]:
### Extracting the year of release column from date of release and filling the missing values with mode
df['Release Date'].fillna('July 2023', inplace=True)
years = df['Release Date'].apply(lambda x: (x.split('(')[0])).apply(lambda x: x.split()[-1] if len(x.split(','))==1 else x.split(',')[1]) 
df['Year'] = years
df['Year'] = df['Year'].astype(int)

In [7]:
df.drop('Release Date', axis=1, inplace=True)

In [8]:
# Replacing cast and director columns with the first entry in the list
df['Cast'] = df['Cast'].apply(lambda x: x[1:-1].split(', ')[0] if x != '[]' else np.NaN)
df.dropna(subset=['Cast'], inplace=True)

df['Director'] = df['Director'].apply(lambda x: x[1:-1].split(',')[0] if x!='[]' else np.NaN)
df.dropna(subset=['Director'], inplace=True)

In [9]:
## Cleaning the runtime column by replacing it with number of minutes
df['Runtime'] = df['Runtime'].fillna('120 minutes')
df['Runtime'] = df['Runtime'].apply(lambda x: int(x.split()[0]) if 'hours' not in x else (int(x.split()[0])*60 + int(x.split()[2]) if len(x.split())>2 else int(x.split()[0])*60))

In [10]:
## Opening the Genre column and filling the missing values with mode
df['Genre'] = df['Genre'].apply(lambda x: x[1:-1].split(','))
df = df.explode('Genre')
df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)
df['Genre'] = df['Genre'].apply(lambda x: x.lstrip().rstrip())
df.head()

Unnamed: 0,IMDb Rating,Runtime,Genre,Director,Cast,Year
4,5.2,142,'Drama','Sanjay Leela Bhansali','Ranbir Kapoor',2007
4,5.2,142,'Musical','Sanjay Leela Bhansali','Ranbir Kapoor',2007
4,5.2,142,'Romance','Sanjay Leela Bhansali','Ranbir Kapoor',2007
5,7.1,141,'Comedy','Amit Joshi','Shahid Kapoor',2024
5,7.1,141,'Drama','Amit Joshi','Shahid Kapoor',2024


## Feature Engineering

In [11]:
genre_mean_rating = df.groupby('Genre')['IMDb Rating'].transform('mean')
df['Genre_mean_rating'] = genre_mean_rating
df.drop(['Genre'], axis=1, inplace=True)

director_mean_rating = df.groupby('Director')['IMDb Rating'].transform('mean')
df['Director_mean_rating'] = director_mean_rating
df.drop(['Director'], axis=1, inplace=True)

cast_mean_rating = df.groupby('Cast')['IMDb Rating'].transform('mean')
df['Cast_mean_rating'] = cast_mean_rating
df.drop(['Cast'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,IMDb Rating,Runtime,Year,Genre_mean_rating,Director_mean_rating,Cast_mean_rating
4,5.2,142,2007,6.546046,7.081481,6.33913
4,5.2,142,2007,6.481957,7.081481,6.33913
4,5.2,142,2007,6.30908,7.081481,6.33913
5,7.1,141,2024,6.258643,7.1,5.895714
5,7.1,141,2024,6.546046,7.1,5.895714


### Scaling the data so easily usable by SVM and logistic regression

In [13]:
#Scale the numerical columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['Runtime', 'Year', 'Genre_mean_rating', 'Director_mean_rating', 'Cast_mean_rating']] = scaler.fit_transform(df[['Runtime', 'Year', 'Genre_mean_rating', 'Director_mean_rating', 'Cast_mean_rating']])
df.head()


Unnamed: 0,IMDb Rating,Runtime,Year,Genre_mean_rating,Director_mean_rating,Cast_mean_rating
4,5.2,0.401554,-0.470917,0.628138,0.608887,-0.033658
4,5.2,0.401554,-0.470917,0.394314,0.608887,-0.033658
4,5.2,0.401554,-0.470917,-0.236422,0.608887,-0.033658
5,7.1,0.384253,0.802478,-0.420438,0.624823,-0.463146
5,7.1,0.384253,0.802478,0.628138,0.624823,-0.463146


# Training and Testing

### Defining the target and making a train test split

In [14]:
df['target'] = df['IMDb Rating'].apply(lambda x: 0 if x<3 else (1 if x<5 else (2 if x<7 else 3)))
df.drop('IMDb Rating', axis=1, inplace=True)

In [15]:
X = df.drop(['target'], axis=1)
y = df['target']

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
regr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

0.7274569402228976

### Support Vector Machine

In [18]:


from models.SVM import MulticlassSVM
import time

n_classes = len(np.unique(y_train.to_numpy()))  # Number of classes

# Initialize and train multiclass SVM
svm = MulticlassSVM(n_classes,n_jobs=-1)  # Parallelize using all available CPU cores
start_time = time.time()
svm.fit(X_train.to_numpy(), y_train.to_numpy())
end_time = time.time()
prediction_time = end_time - start_time
print(prediction_time)

y_pred = svm.predict(X_test.to_numpy())
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

230.88930416107178
Accuracy: 0.662613981762918


### Decision Tree Classifier

In [19]:
from models.DecisionTree import DTC
clf = DTC(40)
clf.fit(X_train.to_numpy(), y_train.to_numpy())
y_pred = clf.predict(X_test.to_numpy())
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8844984802431611


### Gaussian Naive Bayes

In [None]:
from models.NaiveBayes import GNB
model = GNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7149611617696724


### XGBoost Classifier

In [None]:
import xgboost as xgb
#Creating an XGBoost classifier
model = xgb.XGBClassifier()

#Training the model on the training data
model.fit(X_train, y_train)

#Making predictions on the test set
predictions = model.predict(X_test)

#Calculating accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8649105032083756
