In [1]:
from google.colab import files
uploaded = files.upload()

Saving IMDb Movies India.csv to IMDb Movies India.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import re

In [6]:
df = pd.read_csv("IMDb Movies India.csv", encoding='latin-1')

In [7]:
print(df.columns)
print(df.head())

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kap

In [11]:
df.loc[:, 'Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df = df.dropna(subset=['Rating'])

In [10]:
df.loc[:, 'Genre'] = df['Genre'].fillna('Unknown')
df.loc[:, 'Director'] = df['Director'].fillna('Unknown')
df.loc[:, 'Actor 1'] = df['Actor 1'].fillna('Unknown')
df.loc[:, 'Actor 2'] = df['Actor 2'].fillna('Unknown')
df.loc[:, 'Actor 3'] = df['Actor 3'].fillna('Unknown')

In [12]:
df['Genre'] = df['Genre'].apply(lambda x: re.split(',\s*', x))

In [14]:
df['Actor 1'] = df['Actor 1'].apply(lambda x: re.split(',\s*', x))
df['Actor 2'] = df['Actor 2'].apply(lambda x: re.split(',\s*', x))
df['Actor 3'] = df['Actor 3'].apply(lambda x: re.split(',\s*', x))

In [16]:
top_directors = df['Director'].value_counts().head(20).index.tolist()
all_actors = df['Actor 1'] + df['Actor 2'] + df['Actor 3']
top_actors = pd.Series([actor for sublist in all_actors for actor in sublist])
top_actors = top_actors.value_counts().head(30).index.tolist()

In [18]:
df['Director'] = df['Director'].apply(lambda x: x if x in top_directors else 'Other')

# Create a new column indicating if any of the actors are in the top_actors list
df['Has_Top_Actor'] = df.apply(lambda row: any(actor in top_actors for actor_list in [row['Actor 1'], row['Actor 2'], row['Actor 3']] for actor in actor_list), axis=1)

In [25]:
mlb_genre = MultiLabelBinarizer()
# Apply MultiLabelBinarizer to the 'Genre' column of the cleaned DataFrame
genre_encoded = pd.DataFrame(mlb_genre.fit_transform(df['Genre']), columns=mlb_genre.classes_, index=df.index)

# The 'Star' column does not exist. Actor information is in 'Actor 1', 'Actor 2', and 'Actor 3'.
# Depending on your goal, you might want to encode top actors or use the 'Has_Top_Actor' column.
# mlb_star = MultiLabelBinarizer()
# star_encoded = pd.DataFrame(mlb_star.fit_transform(df['Star']), columns=mlb_star.classes_)

In [21]:
le_director = LabelEncoder()
df['Director_encoded'] = le_director.fit_transform(df['Director'])


In [26]:
X = pd.concat([genre_encoded, df[['Director_encoded']], df[['Has_Top_Actor']]], axis=1)
y = df['Rating']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [29]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [30]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)


In [31]:
def evaluate(model_name, y_true, y_pred):
    print(f"🔍 {model_name} Performance:")
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print("R² Score:", r2_score(y_true, y_pred))
    print("-" * 40)

In [32]:
evaluate("Linear Regression", y_test, y_pred_lr)
evaluate("Random Forest", y_test, y_pred_rf)
evaluate("XGBoost", y_test, y_pred_xgb)

🔍 Linear Regression Performance:
MAE: 1.0200368807304028
RMSE: 1.271784318505152
R² Score: 0.1300112522486464
----------------------------------------
🔍 Random Forest Performance:
MAE: 1.0344474938225945
RMSE: 1.289782893432238
R² Score: 0.10521245715355909
----------------------------------------
🔍 XGBoost Performance:
MAE: 1.0234107943195285
RMSE: 1.284511467077626
R² Score: 0.11251161999989323
----------------------------------------
