In [None]:
# Build a model that predicts the rating of a movie based on features like genre, director, and actors. You can use regression techniques to tackle this problem.
# The goal is to analyze historical movie data and develop a model that accurately estimates the rating given to a movie by users or critics.
# Movie Rating Prediction project enables you to explore data analysis, preprocessing, feature engineering, and machine learning modeling techniques.
# It provides insights into the factors that influence movie ratings and allows you to build a model that can estimate the ratings of movies accurately.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Load data
data = pd.read_csv('/content/IMDb_Movies_India.csv', encoding='latin-1')

In [None]:
data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor1,Actor2,Actor3
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [None]:
data.tail()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor1,Actor2,Actor3
15501,Zulm Ki Hukumat,-1992.0,,"Action, Crime, Drama",5.3,135,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda
15503,Zulm Ki Zanjeer,-1989.0,125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15504,Zulm Ko Jala Doonga,-1988.0,,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,-1999.0,129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15508,Zulm-O-Sitam,-1998.0,130 min,"Action, Drama",6.2,20,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja


In [None]:
# Inspect column names to ensure correct referencing
print(data.columns)

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor1', 'Actor2', 'Actor3'],
      dtype='object')


In [None]:
# After inspecting the columns, set the correct column names
target_column = 'Rating'
genre_column = 'Genre'
director_column = 'Director'
actors_column1 = 'Actor1'
actors_column2 = 'Actor2'
actors_column3 = 'Actor3'

In [None]:
# Data preprocessing
data = data.dropna(subset=[target_column])  # Ensure target variable has no missing values

In [None]:
# Define features and target
features = data[[genre_column, director_column, actors_column1, actors_column2, actors_column3]]
target = data[target_column]

In [None]:
# Preprocess categorical data
categorical_features = [genre_column, director_column, actors_column1, actors_column2, actors_column3]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[('cat', categorical_transformer, categorical_features)])

In [None]:
# Create a pipeline with preprocessor and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
# Train model
model.fit(X_train, y_train)

In [None]:
# Predict and evaluate
y_pred = model.predict(X_test)
print('MSE:', mean_squared_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

MSE: 7.377081584966801
R2 Score: -2.9679965935404153


In [None]:
# Output predicted values and actual values for comparison
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions.head())

       Actual  Predicted
9456      3.3   4.676216
14816     5.3   7.449256
3213      5.7  13.026991
3778      7.2  -8.003002
5775      3.5   5.908172
