# Movie Business Success Model Training
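This notebook preprocesses the IMDb/TMDB regression dataset and trains a linear regression model that predicts a combined movie-success score (the first principal component of min-max-scaled popularity and revenue) from the scaled budget and one-hot-encoded genres.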

In [45]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import warnings
import joblib
import os
warnings.filterwarnings('ignore')

In [46]:
# Load the regression dataset (read_csv already returns a DataFrame).
df = pd.read_csv('../data/imdb_tmdb_regression.csv')

# Columns whose cells are parsed with ast.literal_eval below
# (presumably list/dict literals stored as text in the CSV -- verify against the data).
nested_columns = [
    'production_companies',
    'production_countries',
    'keywords',
    'actor_list',
    'director',
    'genre',
]

# Parse every cell of each nested column into its Python object.
for col in nested_columns:
    df[col] = df[col].map(ast.literal_eval)

In [47]:
# Extract date components and filter runtime
# release_date is split on '-' once; pieces 1 and 2 become 'month' and 'day'
# (kept as strings, exactly as produced by str.split).
date_parts = df['release_date'].apply(lambda x: x.split('-'))
df['month'] = date_parts.apply(lambda parts: parts[1])
df['day'] = date_parts.apply(lambda parts: parts[2])
df.drop(columns=['release_date'], inplace=True)

# Keep only rows with runtime <= 300. The explicit .copy() makes the result an
# independent frame: later cells assign new/overwritten columns on `df`, and
# without the copy those assignments target a filtered slice of the original
# frame (a SettingWithCopyWarning that the global warnings filter would hide).
df = df[df['runtime'] <= 300].copy()

In [48]:
# Min-max scale runtime in place; this scaler instance is not stored anywhere.
df['runtime'] = MinMaxScaler().fit_transform(df[['runtime']])

# Build the PCA target from popularity and revenue:
# 1) min-max scale both columns (the fitted scaler is kept in `pca_scaler`),
# 2) fit a PCA with all components on the scaled values,
# 3) store the first component as df['pca'].
# NOTE(review): the scaler and the PCA are fitted on every row, i.e. before the
# train/test split performed in a later cell -- confirm this is intended.
pca_features = df.loc[:, ['popularity', 'revenue']]
pca_scaler = MinMaxScaler().fit(pca_features)
pca_features_scaled = pca_scaler.transform(pca_features)

pca = PCA()
pca_result = pca.fit_transform(pca_features_scaled)
# Share of variance captured by each component.
explained_variance = pca.explained_variance_ratio_

first_component = pca_result[:, 0]
df['pca'] = first_component

In [49]:
# Min-max scale the budget column. The fitted scaler stays available as
# `budget_scaler` because a later cell writes it to disk.
budget_scaler = MinMaxScaler().fit(df[['budget']])
df['budget'] = budget_scaler.transform(df[['budget']])

In [50]:
# Remove genre labels that should not become feature columns.
excluded_genres = ('Unknown', 'TV Movie')
df['genre'] = df['genre'].apply(
    lambda genres: [g for g in genres if g not in excluded_genres]
)

# One-hot encode the remaining genres: one 0/1 column per distinct label,
# aligned to df's index so the concat below lines up row by row.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genre'])
encoded_genres = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df.index)
df = pd.concat([df, encoded_genres], axis=1)

# Remove the raw nested columns and other fields that are not used from here on.
columns_to_drop = [
    'genre',
    'production_companies',
    'production_countries',
    'keywords',
    'actor_list',
    'director',
    'original_language',
    'month',
    'day',
]
df = df.drop(columns=columns_to_drop)

In [51]:
# Columns that must not be fed to the model: identifiers, the target itself
# ('pca'), the two columns the target was derived from ('revenue',
# 'popularity'), and other fields left out of the feature set.
non_feature_columns = [
    'id', 'title', 'vote_count', 'pca', 'runtime',
    'overview_sentiment', 'vote_average', 'revenue', 'popularity',
]

# Feature matrix: everything that remains after removing the columns above.
X = df.drop(columns=non_feature_columns)
# Target kept as a single-column DataFrame (2-D), not a Series.
y = df.loc[:, ['pca']]

In [52]:
# Preview the first five rows of the feature matrix.
X.head(5)

Unnamed: 0,budget,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western
0,0.18018,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0.185811,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,0.208333,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0.266892,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,0.247748,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [53]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an ordinary least squares regression on the training split.
linear_regressor = LinearRegression().fit(X_train, y_train)

# Write the training features to disk (without the index); per the original
# note they are used for genre suggestions.
X_train.to_csv('../data/processed_features.csv', index=False)

In [54]:
# Save models and scalers
# exist_ok=True creates the directory only when it is missing, without the
# separate exists() check (which could race with another process creating it).
os.makedirs('models', exist_ok=True)

# Persist every fitted object needed to reproduce the preprocessing and the
# prediction outside this notebook.
joblib.dump(linear_regressor, 'models/linear_regressor.joblib')
joblib.dump(pca_scaler, 'models/pca_scaler.joblib')
joblib.dump(budget_scaler, 'models/budget_scaler.joblib')
joblib.dump(pca, 'models/pca.joblib')
# Last expression of the cell: joblib.dump returns the list of written paths,
# which the notebook displays as the cell output.
joblib.dump(mlb, 'models/multilabel_binarizer.joblib')

['models/multilabel_binarizer.joblib']

In [55]:
# Score the fitted model on the held-out split.
y_pred = linear_regressor.predict(X_test)

# Compute each metric once and reuse it in the derived values below.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"RMSE: {np.sqrt(mse)}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred)}")
print(f"R²: {r2}")

# Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), with n test rows and
# p feature columns.
n = len(y_test)
p = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R²: {adjusted_r2}")

Mean Squared Error: 3.955664108119022e-05
RMSE: 0.006289407053227691
MAPE: 1.2727541740087815
R²: 0.48795302392004936
Adjusted R²: 0.48784568009886364


In [56]:
# The model was fitted on a single-column 2-D target, so coef_ and intercept_
# carry a leading axis of length one; index 0 selects that single output.
coefficients = linear_regressor.coef_[0]
intercept = linear_regressor.intercept_[0]

# One "<coef>*x<i>" term per feature, numbered from 1 in feature-column order,
# followed by the intercept; all values are shown with four decimals.
terms = [f"{coeff:.4f}*x{i}" for i, coeff in enumerate(coefficients, start=1)]
equation = "y = " + " + ".join(terms + [f"{intercept:.4f}"])

print("Regression Line Equation:")
print(equation)

Regression Line Equation:
y = 0.8840*x1 + -0.0001*x2 + 0.0005*x3 + 0.0001*x4 + -0.0000*x5 + -0.0003*x6 + 0.0000*x7 + -0.0000*x8 + -0.0003*x9 + 0.0001*x10 + -0.0003*x11 + 0.0002*x12 + 0.0000*x13 + -0.0000*x14 + 0.0000*x15 + 0.0001*x16 + -0.0004*x17 + -0.0006*x18 + -0.0008*x19 + -0.0004
