In [None]:
# MOVIE RATING PREDICTION 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the raw dataset. The encoding is given explicitly as latin1
# (presumably the file is not valid UTF-8 -- confirm against the data source).
file_path = "IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding="latin1")

# 'Year' -> float: keep the first run of four digits found in each value.
# .astype(str) lets .str work even if pandas inferred a numeric dtype, and
# expand=False makes extract return a Series instead of a one-column DataFrame.
# Values without a four-digit run become NaN.
df["Year"] = df["Year"].astype(str).str.extract(r"(\d{4})", expand=False).astype(float)

# 'Duration' -> float: keep the first run of digits found in each value.
# Values without any digit become NaN.
df["Duration"] = df["Duration"].astype(str).str.extract(r"(\d+)", expand=False).astype(float)

# 'Votes' -> numeric: strip thousands separators, then let pandas decide what
# parses. errors="coerce" turns anything unparseable into NaN rather than
# raising, so the validity test and the actual parser can no longer disagree
# (the previous isnumeric() pre-check accepted strings such as "1.2.3" that
# pd.to_numeric then rejected with a ValueError).
df["Votes"] = pd.to_numeric(
    df["Votes"].astype(str).str.replace(",", "", regex=False),
    errors="coerce",
)

# Keep only rows that have a target value. The .copy() gives an independent
# frame, so the assignments below never write back into `df`.
df_cleaned = df[df["Rating"].notna()].copy()

# Impute every numeric feature with that column's median, taken over the
# rows that survived the filter above.
for numeric_col in ("Year", "Duration", "Votes"):
    column_median = df_cleaned[numeric_col].median()
    df_cleaned.loc[:, numeric_col] = df_cleaned[numeric_col].fillna(column_median)

# Missing categorical values are replaced by the placeholder label 'Unknown'.
categorical_cols = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
df_cleaned.loc[:, categorical_cols] = df_cleaned.loc[:, categorical_cols].fillna(value="Unknown")

# Numeric features are used as-is; the index is reset to 0..n-1 so it lines up
# with the freshly built (default-indexed) one-hot frame below.
numerical_cols = ["Year", "Duration", "Votes"]
numeric_features = df_cleaned[numerical_cols].reset_index(drop=True)

# One-hot encode the categorical columns of the rated rows into a dense array,
# then wrap it in a DataFrame labelled with the encoder's feature names.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_encoded = ohe.fit_transform(df_cleaned[categorical_cols])
categorical_encoded_df = pd.DataFrame(
    categorical_encoded,
    columns=ohe.get_feature_names_out(categorical_cols),
)

# Feature matrix: numeric columns first, one-hot columns after them.
X = pd.concat([numeric_features, categorical_encoded_df], axis="columns")

# Target vector, re-indexed the same way so X and y share one index.
y = df_cleaned["Rating"].reset_index(drop=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (seeded for repeatability).
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true ratings and report the result.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 1.1700251231060603
