In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.impute import SimpleImputer

In [3]:
file=r'C:\Users\Dawood MD\OneDrive\Desktop\Codsoft\IMDb Movie ratings\IMDb Movies India.csv'

In [4]:
# To get to know the encoding of the csv file
import chardet

with open(file,'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result    

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [5]:
df = pd.read_csv(file,encoding='ISO-8859-1')

In [6]:
# Shows the first 5 rows of dataframe
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [7]:
# Shows the last % rows of dataframe
df.tail()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11.0,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,
15508,Zulm-O-Sitam,(1998),130 min,"Action, Drama",6.2,20.0,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja


In [8]:
#information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [9]:
# checking for the duplicates
df.duplicated().sum()

6

In [10]:
# To check what are the false duplicates
df[df.duplicated()]

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1250,Arab Ka Sona - Abu Kaalia,(1979),,Action,,,Master Bhagwan,Meena Rai,Dara Singh,
1769,Balidan,(1992),,Drama,,,,,,
4723,First Time - Pehli Baar,(2009),,,,,Raja Bundela,Zeenat Aman,Nitin Arora,Raj Babbar
9713,Musafir,,,Thriller,,,Shiva Dagar,,,
13069,Shivani,(2019),,Crime,,,Ugresh Prasad Ujala,Santosh,,
13308,Slumdog Karodpati,(2019),118 min,Thriller,,,Rajesh Patole,Udhav Garje,Rahul Gavane,Govindrao


In [11]:
df.describe(include='all')

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
count,15509,14981,7240,13632,7919.0,7920.0,14984,13892,13125,12365
unique,13838,102,182,485,,2034.0,5938,4718,4891,4820
top,Anjaam,(2019),120 min,Drama,,8.0,Jayant Desai,Ashok Kumar,Rekha,Pran
freq,7,410,240,2780,,227.0,58,158,83,91
mean,,,,,5.841621,,,,,
std,,,,,1.381777,,,,,
min,,,,,1.1,,,,,
25%,,,,,4.9,,,,,
50%,,,,,6.0,,,,,
75%,,,,,6.8,,,,,


In [12]:
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [13]:
# Removing Nan from target column
df = df.dropna(subset=['Rating'])

In [14]:
df1 = df.copy()

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7919 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      7919 non-null   object 
 1   Year      7919 non-null   object 
 2   Duration  5851 non-null   object 
 3   Genre     7817 non-null   object 
 4   Rating    7919 non-null   float64
 5   Votes     7919 non-null   object 
 6   Director  7914 non-null   object 
 7   Actor 1   7794 non-null   object 
 8   Actor 2   7719 non-null   object 
 9   Actor 3   7627 non-null   object 
dtypes: float64(1), object(9)
memory usage: 680.5+ KB


In [16]:
# Defining the X and Y Varibles i.e, Independent and Trahet variables
X = df[['Genre','Director','Actor 1','Actor 2','Actor 3']]
Y = df['Rating'] 

In [17]:
# Train and test the function
x_train, x_test ,y_train ,y_test = train_test_split(X,Y,test_size=0.3,random_state=42)  

In [18]:
# Defining the necessary columns required of Regression
categories = ['Genre','Director','Actor 1','Actor 2','Actor 3']

In [19]:
preprocessor = ColumnTransformer(
    transformers =[
        ('cat',OneHotEncoder(handle_unknown='ignore'),categories)
                            ])

In [20]:
#Defining Pipelines
model_pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',RandomForestRegressor(n_estimators=100,random_state=42))])

In [21]:
# Train the model
model_pipeline.fit(x_train, y_train)

In [22]:
# Predict on the test set
y_pred = model_pipeline.predict(x_test)

In [23]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.5535212492634678
R-squared: 0.18394617699354465


In [26]:
from sklearn.ensemble import RandomForestClassifier
#Defining Pipelines
model_pipeline_2 = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('imputer', SimpleImputer(strategy='mean')), # if we have any missing values)
    ('model_2',RandomForestRegressor(n_estimators=100,random_state=42))])

In [27]:
# Train the model
model_pipeline_2.fit(x_train, y_train)

In [28]:
# Predict on the test set
y_pred_2 = model_pipeline_2.predict(x_test)

In [29]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_2)
r2 = r2_score(y_test, y_pred_2)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.5535212492634678
R-squared: 0.18394617699354465


In [31]:
# considering all the features  in x and rating in target varible
x = df.dropna(subset='Rating')
y = df['Rating']

In [36]:
x.drop(columns="Name")

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,,109.0,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,,110.0,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,,147.0,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,,142.0,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,,82.0,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...,...,...,...
15501,,,"Action, Crime, Drama",5.3,135.0,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda
15503,,125.0,"Action, Crime, Drama",5.8,44.0,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15504,,,Action,4.6,11.0,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,,129.0,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [35]:
# Convert 'Year' and 'Votes' to numeric, and 'Duration' to minutes if it's in the format 'h m'
x['Year']     = pd.to_numeric(x['Year'],errors='coerce')
x['Votes']    = x['Votes'].str.replace(',','').astype(float)
x['Duration'] = x['Duration'].str.extract('(\d+)').astype(float)

In [37]:
# Fill missing or empty values in categorical features
categorical_features = ['Genre','Director','Actor 1','Actor 2','Actor 3']
for feature in categorical_features:
    x[feature] = x[feature].replace(' ','unknown').fillna('unknown')

In [50]:
# Fill missing values in numerical features with the median
numerical_features = ['Year','Duration','Votes']
for feature in numerical_features:
    x[feature] = x[feature].fillna(x[feature].median()).replace('NaN',x[feature].median())

In [51]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [52]:
# Create a column transformer for both categorical and numerical features
from sklearn.preprocessing import StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])


In [57]:
# Create a preprocessing and training pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(strategy='mean')), # if we have any missing values)
    ('model', RandomForestRegressor(random_state=42))
])

In [58]:
# Hyperparameter tuning using Grid Search
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

In [55]:
x_train.isna().sum()

Name           0
Year        6335
Duration       0
Genre          0
Rating         0
Votes          0
Director       0
Actor 1        0
Actor 2        0
Actor 3        0
dtype: int64

In [59]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(x_train, y_train)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [60]:
# Best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

Best parameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 10, 'model__n_estimators': 300}


In [62]:
# Predict on the test set using the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)



In [63]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.4241758023150524
R-squared: 0.23396201233495673
