# TASK 2

# MOVIE RATING PREDICTION IN PYTHON 

In [46]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [47]:
# Step 1: Load the dataset
# The path is relative to the notebook's working directory; the file is
# decoded as ISO-8859-1 (Latin-1).
file_path = "IMDb Movies India.csv"
df = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")



In [48]:
# Step 2: Keep only the rows that have a 'Rating' (these are used for training).
# `.copy()` detaches the result from `df` so later column assignments on
# `df_train` do not touch (or warn about) the original frame.
has_rating = df['Rating'].notna()
df_train = df.loc[has_rating].copy()

In [49]:
# Step 3: Clean 'Year' and 'Duration' columns
# Each column is reduced to the first run of digits matching its pattern
# (four digits for 'Year', any digit run for 'Duration') and converted to a
# numeric dtype; values with no match become NaN.
#
# The values are cast to `str` first so this cell can be re-run safely: after
# one execution the columns are already numeric, and the `.str` accessor would
# otherwise raise AttributeError on them. Missing values turn into the text
# "nan", which matches neither pattern and therefore ends up as NaN again.
# `expand=False` makes `extract` return a Series rather than a one-column
# DataFrame.
for column, pattern in (('Year', r'(\d{4})'), ('Duration', r'(\d+)')):
    digits = df_train[column].astype(str).str.extract(pattern, expand=False)
    df_train[column] = pd.to_numeric(digits, errors='coerce')


In [50]:
# Step 4: Fill missing values
# Text features get the placeholder label "Unknown"; numeric features get the
# median of their own column in `df_train`.
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df_train[categorical_cols] = df_train[categorical_cols].fillna("Unknown")

for numeric_col in ('Year', 'Duration'):
    column_median = df_train[numeric_col].median()
    df_train[numeric_col] = df_train[numeric_col].fillna(column_median)

In [51]:

# Step 5: Define features and target
# `features` fixes the column order fed to the model; `Rating` is the value
# being predicted.
features = [
    'Genre',
    'Director',
    'Actor 1',
    'Actor 2',
    'Actor 3',
    'Year',
    'Duration',
]
X = df_train.loc[:, features]
y = df_train.loc[:, 'Rating']


In [52]:
# Step 6: Train-test split
# 20% of the rated rows are held out for evaluation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Preprocessing
# Categorical columns are one-hot encoded (`max_categories=20` is passed to the
# encoder, and categories not seen during fit are ignored instead of raising);
# 'Year' and 'Duration' are handed to the model unchanged.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', max_categories=20)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_cols),
        ('num', 'passthrough', ['Year', 'Duration']),
    ]
)

# Step 8: Build model pipeline
# Preprocessing and the regressor are chained so that fit/predict always apply
# the same transformations.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', forest),
    ]
)

# Step 9: Train the model
pipeline.fit(X_train, y_train)

# Step 10: Evaluate model on the held-out rows
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n📊 Model Evaluation:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.2f}")


📊 Model Evaluation:
RMSE: 1.35
MAE: 1.05
R² Score: 0.02


In [53]:
# Step 11: Predict ratings for unrated movies
# Work on a copy of the rows whose 'Rating' is missing so `df` stays untouched.
df_unrated = df.loc[df['Rating'].isna()].copy()

# Clean 'Year' and 'Duration': keep the first digit run matching each pattern
# and convert it to a number (no match -> NaN).
for column, pattern in (('Year', r'(\d{4})'), ('Duration', r'(\d+)')):
    digits = df_unrated[column].str.extract(pattern, expand=False)
    df_unrated[column] = pd.to_numeric(digits, errors='coerce')

# Fill missing values in unrated data: "Unknown" for the text features, and the
# medians taken from `df_train` (not from the unrated rows) for the numeric ones.
df_unrated[categorical_cols] = df_unrated[categorical_cols].fillna("Unknown")
for column in ('Year', 'Duration'):
    train_median = df_train[column].median()
    df_unrated[column] = df_unrated[column].fillna(train_median)

# Prepare features and predict
X_unrated = df_unrated.loc[:, features]
predicted_ratings = pipeline.predict(X_unrated)
df_unrated['Predicted Rating'] = predicted_ratings

# Step 12: Show predicted results
preview_columns = ['Name', 'Genre', 'Director', 'Predicted Rating']
print("\n🎬 Predicted Ratings for Movies with Missing Ratings:")
print(df_unrated[preview_columns].head(10))

# Optional: Save to CSV (written to the working directory, without the index)
df_unrated.to_csv("Predicted_IMDb_Ratings.csv", index=False)


🎬 Predicted Ratings for Movies with Missing Ratings:
                 Name                   Genre            Director  \
0                                       Drama       J.S. Randhawa   
2         #Homecoming          Drama, Musical  Soumyajit Majumdar   
4   ...And Once Again                   Drama        Amol Palekar   
7      .in for Motion             Documentary       Anirban Datta   
14         101 Ratein                Thriller              Harish   
16        108 Limited                 Unknown         Anand Anddy   
17    108 Teerthyatra  Comedy, Drama, Fantasy             Rajpati   
19         11 O'Clock                 Unknown          Homi Wadia   
20         12 Bulbule   Comedy, Drama, Family          Jagat Joon   
23           12 Years               Biography       Sohail Tatari   

    Predicted Rating  
0           5.153133  
2           5.487400  
4           5.110583  
7           6.770000  
14          5.474643  
16          5.987500  
17          5.573050  
19