In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/data.csv')
df.head(5)

Unnamed: 0,Movie Name,IMDb Rating,Popularity,Release Date,Box Office,Runtime,Genre,Director,Cast,Production Company,Country,Language
0,The Kerala Story Bangla,6.5,,,,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],[],,['Bengali']
1,Generation Z,,,,,,"['Horror', 'Comedy']",[],"['Chris Reilly', 'Ellora Torchia', 'Viola Pret...","['All3Media International', 'The Forge', 'Zwei...",,[]
2,Criminal Justice: Adhura Sach,7.7,,"August 26, 2022 (India)",,45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...","['Applause Entertainment Ltd.', 'BBC Studios']",India,['Hindi']
3,Madhil Mel Kaadhal,,,,,,['Romance'],['Anjana Ali Khan'],"['Mugen Rao', 'Divya Bharathi', 'Sakshi Agarwal']",[],,['Tamil']
4,Saawariya,5.2,,"November 9, 2007 (India)","$18,525,631",2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']","['SPE Films', 'SLB Films Pvt. Ltd.']",,['Hindi']


In [3]:
#Drop columns that are not needed
df = df.drop(['Movie Name', 'Popularity', 'Production Company'], axis=1)
df.head(5)

Unnamed: 0,IMDb Rating,Release Date,Box Office,Runtime,Genre,Director,Cast,Country,Language
0,6.5,,,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],,['Bengali']
1,,,,,"['Horror', 'Comedy']",[],"['Chris Reilly', 'Ellora Torchia', 'Viola Pret...",,[]
2,7.7,"August 26, 2022 (India)",,45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...",India,['Hindi']
3,,,,,['Romance'],['Anjana Ali Khan'],"['Mugen Rao', 'Divya Bharathi', 'Sakshi Agarwal']",,['Tamil']
4,5.2,"November 9, 2007 (India)","$18,525,631",2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']",,['Hindi']


In [4]:
#Drop rows with missing values in IMDB rating
df = df.dropna(subset=['IMDb Rating'])

In [5]:
df.head(5)

Unnamed: 0,IMDb Rating,Release Date,Box Office,Runtime,Genre,Director,Cast,Country,Language
0,6.5,,,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],,['Bengali']
2,7.7,"August 26, 2022 (India)",,45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...",India,['Hindi']
4,5.2,"November 9, 2007 (India)","$18,525,631",2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']",,['Hindi']
5,7.1,"February 9, 2024 (India)","$1,309,068",2 hours 21 minutes,"['Comedy', 'Drama', 'Romance']","['Amit Joshi', 'Aradhana Sah']","['Shahid Kapoor', 'Kriti Sanon', 'Dharmendra']",India,['Hindi']
6,8.3,"July 8, 1983 (India)",,2 hours 21 minutes,"['Drama', 'Romance']",['Balu Mahendra'],"['Kamal Haasan', 'Sridevi', 'Gulshan Grover']",India,['Hindi']


In [6]:
for col in df.columns:
    if(col in ['Genre', 'Director', 'Cast', 'Production Company', 'Language']):
        print(f"Empty values in {col} is {df[col].apply(lambda x: x=='[]').sum()} ({df[col].apply(lambda x: x=='[]').sum()/df.shape[0]*100:.2f}%)")
    else: 
        print(f'NaN values in {col} is {df[col].isna().sum()} ({df[col].isna().sum()/df.shape[0]*100:.2f}%)')

NaN values in IMDb Rating is 0 (0.00%)
NaN values in Release Date is 2601 (30.00%)
NaN values in Box Office is 6103 (70.38%)
NaN values in Runtime is 1204 (13.89%)
Empty values in Genre is 72 (0.83%)
Empty values in Director is 1671 (19.27%)
Empty values in Cast is 347 (4.00%)
NaN values in Country is 2918 (33.65%)
Empty values in Language is 96 (1.11%)


In [7]:
df = df.drop(['Box Office'], axis = 1)

In [8]:
df['Country'] = df['Country'].fillna('India')

In [9]:
df['Release Date'].fillna('July 2023', inplace=True)
years = df['Release Date'].apply(lambda x: (x.split('(')[0])).apply(lambda x: x.split()[-1] if len(x.split(','))==1 else x.split(',')[1]) 
df['Year'] = years

In [10]:
df.head(5)

Unnamed: 0,IMDb Rating,Release Date,Runtime,Genre,Director,Cast,Country,Language,Year
0,6.5,July 2023,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],India,['Bengali'],2023
2,7.7,"August 26, 2022 (India)",45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...",India,['Hindi'],2022
4,5.2,"November 9, 2007 (India)",2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']",India,['Hindi'],2007
5,7.1,"February 9, 2024 (India)",2 hours 21 minutes,"['Comedy', 'Drama', 'Romance']","['Amit Joshi', 'Aradhana Sah']","['Shahid Kapoor', 'Kriti Sanon', 'Dharmendra']",India,['Hindi'],2024
6,8.3,"July 8, 1983 (India)",2 hours 21 minutes,"['Drama', 'Romance']",['Balu Mahendra'],"['Kamal Haasan', 'Sridevi', 'Gulshan Grover']",India,['Hindi'],1983


In [11]:
df.drop(['Release Date'], axis=1, inplace=True)

In [12]:
print(df.shape)

(8671, 8)


In [13]:
df.head(5)

Unnamed: 0,IMDb Rating,Runtime,Genre,Director,Cast,Country,Language,Year
0,6.5,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],India,['Bengali'],2023
2,7.7,45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...",India,['Hindi'],2022
4,5.2,2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']",India,['Hindi'],2007
5,7.1,2 hours 21 minutes,"['Comedy', 'Drama', 'Romance']","['Amit Joshi', 'Aradhana Sah']","['Shahid Kapoor', 'Kriti Sanon', 'Dharmendra']",India,['Hindi'],2024
6,8.3,2 hours 21 minutes,"['Drama', 'Romance']",['Balu Mahendra'],"['Kamal Haasan', 'Sridevi', 'Gulshan Grover']",India,['Hindi'],1983


In [14]:
df.drop(['Cast'], axis=1, inplace=True)

In [15]:
mydict = {}
for i in df['Director']:
    j = i[1:-1].split(',')
    for k in j:
        if k.strip() in mydict:
            mydict[k.strip()] += 1
        else:
            mydict[k.strip()] = 1

cols = list(mydict.keys())
cols.remove('')
print(cols)

["'Satyajit Das'", "'Sanjay Leela Bhansali'", "'Amit Joshi'", "'Aradhana Sah'", "'Balu Mahendra'", "'Gurinder Chadha'", "'John Mathew Matthan'", "'S.S. Rajamouli'", "'Shireesh Khemariya'", "'Selvin Raj Xavier'", "'Gulzar'", "'Paresh Mokashi'", "'Sandeep Reddy Vanga'", "'Rajiv Mehra'", "'Paul Mayeda Berges'", "'Syam Sasi'", "'Tamizh'", "'Faruk Kabir'", "'Ram Madhvani'", "'Heavenveer Singh Hayer'", "'Nishikant Kamat'", "'Rajneesh Berry'", "'Yogesh Bhardwaj'", "'Kiran Rao'", "'Simranjit Singh Hundal'", "'Shashank Khaitan'", "'Pulkit'", "'Satyajit Ray'", "'Rajendra Kachru Gaikwad'", "'Dilip Dombe'", "'Shriprasad Pawar'", "'Karthik Subbaraj'", "'Imtiaz Ali'", "'Vishal Pandya'", "'Rakesh Roshan'", "'Ketan Mehta'", "'Vidhu Vinod Chopra'", "'Guddu Dhanoa'", "'Kedar Shinde'", "'Anurag Kashyap'", "'Janjot Singh'", "'M. Padmakumar'", "'Gayatri'", "'Pushkar'", "'Yash Chopra'", "'Vikramaditya Motwane'", "'Hemant Dhome'", "'Siddharth Anand'", "'Abhishek Sharma'", "'Ajay Devgn'", "'Anil Ravipudi'", "

In [16]:
with open('../data/directors.txt', 'w') as f:
    for item in cols:
        item = item.replace("'", "")
        f.write("%s\n" % item)

In [17]:
mydict = {}
for i in df['Language']:
    j = i[1:-1].split(',')
    for k in j:
        if k.strip() in mydict:
            mydict[k.strip()] += 1
        else:
            mydict[k.strip()] = 1

cols = list(mydict.keys())
cols.remove('')
print(cols)

["'Bengali'", "'Hindi'", "'English'", "'Punjabi'", "'Spanish'", "'Telugu'", "'Tamil'", "'Kannada'", "'Malayalam'", "'Marathi'", "'Gujarati'", "'Indonesian'", "'Italian'", "'Cantonese'", "'Russian'", "'Urdu'", "'Bhojpuri'", "'North American Indian'", "'Portuguese'", "'Haryanvi'", "'Chinese'", "'Rajasthani'", "'Oriya'", "'French'", "'German'", "'Japanese'", "'Arabic'", "'Parsee'", "'Mandarin'", "'Korean'", "'Turkish'", "'Maithili'", "'Pashtu'", "'Assamese'", "'Nepali'", "'Apache languages'", "'Indian Sign Language'", "'Tulu'", "'Swahili'", "'Min Nan'", "'Icelandic'", "'Ukrainian'", "'Kashmiri'", "'Romanian'", "'Latin'", "'Sanskrit'", "'Awadhi'", "'Thai'", "'Sinhala'", "'Tibetan'", "'Vietnamese'", "'Polish'", "'Mongolian'", "'None'", "'Luxembourgish'", "'Swedish'", "'Armenian'", "'Abkhazian'", "'Dutch'", "'Chhattisgarhi'", "'Ladakhi'", "'Kikuyu'", "'Akan'", "'Nagpuri'", "'Bodo'", "'Dari'", "'Afrikaans'", "'Hebrew'", "'Burmese'", "'Sindhi'", "'Filipino'", "'Tagalog'", "'Irula'", "'Konkani'

In [18]:
newcols = []
for col in cols:
    if mydict[col] > 5:
        newcols.append(col)
newcols.append('Other')
newcols.remove("'None'")
print(newcols)

["'Bengali'", "'Hindi'", "'English'", "'Punjabi'", "'Spanish'", "'Telugu'", "'Tamil'", "'Kannada'", "'Malayalam'", "'Marathi'", "'Gujarati'", "'Italian'", "'Russian'", "'Urdu'", "'Bhojpuri'", "'Portuguese'", "'Haryanvi'", "'Chinese'", "'Rajasthani'", "'Oriya'", "'French'", "'German'", "'Japanese'", "'Arabic'", "'Mandarin'", "'Korean'", "'Maithili'", "'Assamese'", "'Nepali'", "'Sanskrit'", 'Other']


In [19]:
for i in range(len(df)):
    try:
        j = df.loc[i, 'Language']
        j = j[1:-1].split(',')
        for k in j:
            if k.strip() in newcols:
                df.loc[i, k.strip()] = 1
            else:
                df.loc[i, 'Other'] = 1
    except:
        pass

In [20]:
df.head()
df.drop(['Language'], axis=1, inplace=True)

In [21]:
for col in newcols:
    df[col] = df[col].fillna(0)

In [22]:
df['Runtime'] = df['Runtime'].fillna('120 minutes')
df['Runtime'] = df['Runtime'].apply(lambda x: int(x.split()[0]) if 'hours' not in x else (int(x.split()[0])*60 + int(x.split()[2]) if len(x.split())>2 else int(x.split()[0])*60))

In [23]:
df.head()

Unnamed: 0,IMDb Rating,Runtime,Genre,Director,Country,Year,'Bengali','Hindi','English','Punjabi',...,'French','German','Japanese','Arabic','Mandarin','Korean','Maithili','Assamese','Nepali','Sanskrit'
0,6.5,12,"['Short', 'Drama']",['Satyajit Das'],India,2023,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.7,45,"['Crime', 'Drama', 'Mystery']",[],India,2022,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.2,142,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],India,2007,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7.1,141,"['Comedy', 'Drama', 'Romance']","['Amit Joshi', 'Aradhana Sah']",India,2024,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8.3,141,"['Drama', 'Romance']",['Balu Mahendra'],India,1983,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
mydict = {}
for i in df['Genre']:
    j = i[1:-1].split(',')
    for k in j:
        if k.strip() in mydict:
            mydict[k.strip()] += 1
        else:
            mydict[k.strip()] = 1

cols = list(mydict.keys())
cols.remove('')
print(cols)

["'Short'", "'Drama'", "'Crime'", "'Mystery'", "'Musical'", "'Romance'", "'Comedy'", "'Action'", "'Horror'", "'Thriller'", "'Fantasy'", "'Sport'", "'Adventure'", "'History'", "'Biography'", "'Family'", "'War'", "'Documentary'", "'Animation'", "'Sci-Fi'", "'Reality-TV'", "'Music'", "'Talk-Show'", "'News'", "'Game-Show'", "'Western'"]


In [25]:
for col in cols:
    df[col] = df['Genre'].apply(lambda x: 1 if col in x else 0)

In [26]:
df.head()

Unnamed: 0,IMDb Rating,Runtime,Genre,Director,Country,Year,'Bengali','Hindi','English','Punjabi',...,'War','Documentary','Animation','Sci-Fi','Reality-TV','Music','Talk-Show','News','Game-Show','Western'
0,6.5,12,"['Short', 'Drama']",['Satyajit Das'],India,2023,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,7.7,45,"['Crime', 'Drama', 'Mystery']",[],India,2022,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5.2,142,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],India,2007,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,7.1,141,"['Comedy', 'Drama', 'Romance']","['Amit Joshi', 'Aradhana Sah']",India,2024,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,8.3,141,"['Drama', 'Romance']",['Balu Mahendra'],India,1983,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df.drop(['Genre', 'Country'], axis=1, inplace=True)

In [28]:
df.head()

Unnamed: 0,IMDb Rating,Runtime,Director,Year,'Bengali','Hindi','English','Punjabi','Spanish','Telugu',...,'War','Documentary','Animation','Sci-Fi','Reality-TV','Music','Talk-Show','News','Game-Show','Western'
0,6.5,12,['Satyajit Das'],2023,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,7.7,45,[],2022,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5.2,142,['Sanjay Leela Bhansali'],2007,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,7.1,141,"['Amit Joshi', 'Aradhana Sah']",2024,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,8.3,141,['Balu Mahendra'],1983,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
df['target'] = df['IMDb Rating'].apply(lambda x: 0 if x<3 else (1 if x<5 else (2 if x<7 else 3)))

In [30]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
df[['Runtime']] = scaler.fit_transform(df[['Runtime']])
df.head()
df[['Year']] = df[['Year']].astype(int)
scaler2 = preprocessing.MinMaxScaler()
df[['Year']] = scaler2.fit_transform(df[['Year']])

In [38]:
df.head()
df.drop(['Director'], axis=1, inplace=True)

# Now we do training

In [39]:
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

In [40]:
X = df.drop(['target', 'IMDb Rating'], axis=1)
y = df['target']


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
regr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
regr.fit(X_train, y_train)

LogisticRegression(multi_class='multinomial', solver='newton-cg')

In [43]:
regr.score(X_test, y_test)

0.5193083573487032

In [44]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [45]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5331412103746398


In [46]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4564841498559078


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5279538904899136


In [48]:
import xgboost as xgb
#Creating an XGBoost classifier
model = xgb.XGBClassifier()

#Training the model on the training data
model.fit(X_train, y_train)

#Making predictions on the test set
predictions = model.predict(X_test)

#Calculating accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.5619596541786743
