**IMPORTING LIBRARIES**

In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

**LOADING DATASET**

In [3]:
df = pd.read_csv('IMDb Movies India.csv', encoding='latin1')

**DATA ANALYSIS**

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [5]:
print(df.isnull().sum())

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


**Convert 'Year', 'Duration' and 'Votes' column to numeric**

In [6]:
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

**Fill numeric columns with mean**

In [7]:
df['Year'].fillna(df['Year'].mean(), inplace=True)
df['Duration'].fillna(df['Duration'].mean(), inplace=True)
df['Rating'].fillna(df['Rating'].mean(), inplace=True)
df['Votes'].fillna(df['Votes'].mean(), inplace=True)

**Fill categorical columns with mode**

In [8]:
df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)
df['Director'].fillna(df['Director'].mode()[0], inplace=True)
df['Actor 1'].fillna(df['Actor 1'].mode()[0], inplace=True)
df['Actor 2'].fillna(df['Actor 2'].mode()[0], inplace=True)
df['Actor 3'].fillna(df['Actor 3'].mode()[0], inplace=True)

**Drop 'Name' column as it's not informative for prediction**

In [9]:
df = df.drop('Name', axis=1)

**Separate features and target variable**

In [10]:
X = df.drop('Rating', axis=1)
y = df['Rating']

**Split the data into training and testing sets**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Define preprocessing steps for numeric and categorical features**

In [12]:
numeric_features = ['Year', 'Duration', 'Votes']
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

**Combine numeric and categorical transformers**

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

**Define the model**

In [14]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])


**Train the model**

In [15]:
model.fit(X_train, y_train)

**Make predictions on the test set**

In [16]:
y_pred = model.predict(X_test)

**Evaluate the model**

In [17]:
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.7908534256862633


**Calculate R-squared**

In [18]:
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

R-squared: 0.17831478692054603
