In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [26]:
# 1️⃣ Load data
csv_path = 'IMDb-Movies-India.csv'  # Correct filename
df = pd.read_csv(csv_path)
df.columns = [c.strip() for c in df.columns]  # strip whitespace

print('Loaded dataframe shape:', df.shape)
display(df.head())

Loaded dataframe shape: (15509, 10)


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [32]:
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df = df.dropna(subset=['Rating'])

In [None]:
df['Genre'] = df['Genre'].fillna('Unknown').str.split(',').str[0].str.strip()
df['Director'] = df['Director'].fillna('Unknown')
df['Actor 1'] = df['Actor 1'].fillna('Unknown')

In [None]:
def top_n_or_other(series, n=50):
    top = series.value_counts().nlargest(n).index
    return series.where(series.isin(top), 'Other')


In [35]:
df['Director'] = top_n_or_other(df['Director'])
df['Actor 1'] = top_n_or_other(df['Actor 1'])

In [45]:
X = pd.get_dummies(df[['Genre', 'Director', 'Actor 1']], prefix_sep='==')
y = df['Rating']
print(X,y)


       Genre==Action  Genre==Adventure  Genre==Animation  Genre==Biography  \
1              False             False             False             False   
3              False             False             False             False   
5              False             False             False             False   
6              False             False             False             False   
8              False             False             False             False   
...              ...               ...               ...               ...   
15501           True             False             False             False   
15503           True             False             False             False   
15504           True             False             False             False   
15505           True             False             False             False   
15508           True             False             False             False   

       Genre==Comedy  Genre==Crime  Genre==Documentary  Genre==

In [None]:
#Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression().fit(X_train, y_train)
print('Model trained successfully!')

Model trained successfully! LinearRegression()


In [None]:
# Prediction
new = pd.DataFrame({'Genre': ['Action'], 'Director': ['John Doe'], 'Actor 1': ['Jane Smith']})
new['Director'] = top_n_or_other(new['Director'])
new['Actor 1'] = top_n_or_other(new['Actor 1'])
new = pd.get_dummies(new, prefix_sep='==').reindex(columns=X.columns, fill_value=0)
print("predictions:", model.predict(new))
print("Predicted rating:", model.predict(new)[0])

predictions: [5.70060972]
Predicted rating: 5.700609716891661
