# Applied Data Analysis: Movies
This project analyzes the key factors influencing box-office success for films released between 1915 and 2015, seeking to understand how audience preferences and industry trends have evolved over time.

We will work with two datasets:
- CMU Movie Summary Corpus
- IMDb dataset

## Loading the Datasets

In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter
import os
import sys

from importlib import reload
import src.utils.utils
reload(src.utils.utils)
from src.utils.utils import top_n_by_interval, top_n_total_revenue, top_n_average_rating

In [2]:
# Root data directory; every sub-folder path below is derived from it.
# Trailing slashes are kept so file names can be appended directly.
data_folder = './data/'
pickle_folder = f'{data_folder}pickle/'
imdb_folder = f'{data_folder}IMDB/'
cmu_folder = f'{data_folder}CMU/'

# Modeling

In [61]:
import statsmodels.api as sm
import random
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Seed NumPy's global RNG and Python's built-in `random` module with the
# same value, for reproducibility.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## Training model on complete dataframe without seasons

In [52]:
# Load the pickled `movies_clean` table.
# NOTE: pickle.load can execute arbitrary code, so only use it on pickle
# files produced by this project.
movies_clean_path = pickle_folder + "movies_clean.p"
with open(movies_clean_path, 'rb') as pickle_file:
    movies_clean = pickle.load(pickle_file)

# One-hot encode the Year_Interval column into "Interval_*" indicator
# columns; drop_first=True omits the first level.
movies_clean = pd.get_dummies(
    movies_clean,
    columns=["Year_Interval"],
    prefix="Interval",
    drop_first=True,
)

### Average Rating

In [54]:
# Predictors: every column except the identifiers, the nb_of_* count
# columns and the two candidate targets. Target: the average rating.
X = movies_clean.drop(columns=["Wikipedia_movie_ID","Movie_name","nb_of_Genres","nb_of_Languages","nb_of_Countries","Movie_box_office_revenue","averageRating"])
y = movies_clean["averageRating"]

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned on the training split only, rebuild
# DataFrames that keep the original index/columns, and add the intercept
# column for statsmodels.
# has_constant='add' forces the `const` column even when a scaled feature is
# constant within a split (e.g. a rare dummy that never occurs in the test
# rows); the default 'skip' would silently leave `const` out in that case.
scaler = StandardScaler()
scaled_X_train = sm.add_constant(pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index,columns=X_train.columns),has_constant='add')
scaled_X_test = sm.add_constant(pd.DataFrame(scaler.transform(X_test),index=X_test.index,columns=X_test.columns),has_constant='add')

In [55]:
# Ridge regression (alpha=1.0) trained on the training split.
# NOTE(review): the model is fit on the unscaled X_train / X_test; the
# standardised frames are only used by the OLS cell. Confirm this is
# intended, since the ridge penalty is sensitive to feature scale.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Held-out error and goodness of fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Root Mean Squared Error: 0.8303988105844023
R-squared: 0.37322909460073017


In [68]:
# OLS regression of the average rating on the standardised training
# features (intercept column included).
# NOTE: y_train / scaled_X_train are rebound by the Box Office Revenue cells
# further down, so run this cell right after the Average Rating preparation
# cell; otherwise it fits a different target.
model = sm.OLS(y_train, scaled_X_train).fit()
# Display the regression table, as the other OLS cells in this notebook do.
model.summary()

SyntaxError: invalid syntax (3728329989.py, line 2)

### Box Office Revenue

In [57]:
# Keep only films with a known box-office revenue. The filtered rows go into
# a new frame instead of mutating `movies_clean` in place, so the Average
# Rating cells above still see the full table if they are re-run.
movies_clean_revenue = movies_clean.dropna(subset=["Movie_box_office_revenue"])

# Predictors: every column except the identifiers, the nb_of_* count
# columns and the two candidate targets. Target: box-office revenue.
X = movies_clean_revenue.drop(columns=["Wikipedia_movie_ID","Movie_name","nb_of_Genres","nb_of_Languages","nb_of_Countries","Movie_box_office_revenue","averageRating"])
y = movies_clean_revenue["Movie_box_office_revenue"]

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned on the training split only, rebuild
# DataFrames that keep the original index/columns, and add the intercept
# column for statsmodels.
# has_constant='add' forces the `const` column even when a scaled feature is
# constant within a split (e.g. a rare dummy that never occurs in the test
# rows); the default 'skip' would silently leave `const` out in that case.
scaler = StandardScaler()
scaled_X_train = sm.add_constant(pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index,columns=X_train.columns),has_constant='add')
scaled_X_test = sm.add_constant(pd.DataFrame(scaler.transform(X_test),index=X_test.index,columns=X_test.columns),has_constant='add')

In [58]:
# Ridge regression (alpha=1.0) for box-office revenue, trained on the
# training split.
# NOTE(review): the model is fit on the unscaled X_train / X_test; the
# standardised frames are only used by the OLS cell. Confirm this is
# intended, since the ridge penalty is sensitive to feature scale.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Held-out error and goodness of fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Root Mean Squared Error: 101408870.78079669
R-squared: 0.4135733944098068


In [59]:
# OLS regression of box-office revenue on the standardised training
# features (intercept column included), followed by its summary table.
ols_spec = sm.OLS(endog=y_train, exog=scaled_X_train)
model = ols_spec.fit()
model.summary()

0,1,2,3
Dep. Variable:,Movie_box_office_revenue,R-squared:,0.522
Model:,OLS,Adj. R-squared:,0.516
Method:,Least Squares,F-statistic:,86.23
Date:,"Fri, 13 Dec 2024",Prob (F-statistic):,0.0
Time:,17:29:42,Log-Likelihood:,-108120.0
No. Observations:,5528,AIC:,216400.0
Df Residuals:,5458,BIC:,216800.0
Df Model:,69,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.161e+07,1.02e+06,50.498,0.000,4.96e+07,5.36e+07
Year,1.607e+07,4.13e+06,3.896,0.000,7.98e+06,2.42e+07
Genre_Action,7.346e+06,1.53e+06,4.812,0.000,4.35e+06,1.03e+07
Genre_Action/Adventure,2.733e+06,1.48e+06,1.848,0.065,-1.66e+05,5.63e+06
Genre_Adventure,8.241e+06,1.22e+06,6.757,0.000,5.85e+06,1.06e+07
Genre_Animation,7.71e+06,1.2e+06,6.426,0.000,5.36e+06,1.01e+07
Genre_Biographical film,3.065e+04,1.17e+06,0.026,0.979,-2.27e+06,2.33e+06
Genre_Biography,-7.855e+05,1.2e+06,-0.657,0.511,-3.13e+06,1.56e+06
Genre_Black comedy,-5.32e+06,1.08e+06,-4.937,0.000,-7.43e+06,-3.21e+06

0,1,2,3
Omnibus:,3442.085,Durbin-Watson:,1.959
Prob(Omnibus):,0.0,Jarque-Bera (JB):,214757.846
Skew:,2.24,Prob(JB):,0.0
Kurtosis:,33.204,Cond. No.,24.8


## Training model on complete dataframe with seasons

In [44]:
# Load the pickled `movies_season` table.
# NOTE: pickle.load can execute arbitrary code, so only use it on pickle
# files produced by this project.
movies_season_path = pickle_folder + "movies_clean_with_season.p"
with open(movies_season_path, 'rb') as pickle_file:
    movies_season = pickle.load(pickle_file)

# One-hot encode release_season ("Season_*") and Year_Interval
# ("Interval_*"); drop_first=True omits the first level of each.
movies_season = pd.get_dummies(
    movies_season,
    columns=["release_season", "Year_Interval"],
    prefix=["Season", "Interval"],
    drop_first=True,
)

### Average Rating

In [45]:
# Predictors: every column except the identifiers, the nb_of_* count
# columns and the two candidate targets. Target: the average rating.
X = movies_season.drop(columns=["Wikipedia_movie_ID","Movie_name","nb_of_Genres","nb_of_Languages","nb_of_Countries","Movie_box_office_revenue","averageRating"])
y = movies_season["averageRating"]

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned on the training split only, rebuild
# DataFrames that keep the original index/columns, and add the intercept
# column for statsmodels.
# has_constant='add' forces the `const` column even when a scaled feature is
# constant within a split (e.g. a rare dummy that never occurs in the test
# rows); the default 'skip' would silently leave `const` out in that case.
scaler = StandardScaler()
scaled_X_train = sm.add_constant(pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index,columns=X_train.columns),has_constant='add')
scaled_X_test = sm.add_constant(pd.DataFrame(scaler.transform(X_test),index=X_test.index,columns=X_test.columns),has_constant='add')

In [46]:
# Ridge regression (alpha=1.0) for the average rating, using the
# season-augmented features from the training split.
# NOTE(review): the model is fit on the unscaled X_train / X_test; the
# standardised frames are only used by the OLS cell. Confirm this is
# intended, since the ridge penalty is sensitive to feature scale.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Held-out error and goodness of fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Root Mean Squared Error: 0.8356196075815406
R-squared: 0.33304360728170546


In [47]:
# OLS regression of the average rating on the standardised,
# season-augmented training features (intercept column included),
# followed by its summary table.
ols_spec = sm.OLS(endog=y_train, exog=scaled_X_train)
model = ols_spec.fit()
model.summary()

0,1,2,3
Dep. Variable:,averageRating,R-squared:,0.332
Model:,OLS,Adj. R-squared:,0.328
Method:,Least Squares,F-statistic:,85.5
Date:,"Fri, 13 Dec 2024",Prob (F-statistic):,0.0
Time:,17:25:49,Log-Likelihood:,-15163.0
No. Observations:,12460,AIC:,30470.0
Df Residuals:,12387,BIC:,31010.0
Df Model:,72,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.3063,0.007,859.020,0.000,6.292,6.321
Year,-0.2465,0.037,-6.634,0.000,-0.319,-0.174
Genre_Action,-0.0813,0.010,-8.187,0.000,-0.101,-0.062
Genre_Action/Adventure,-0.0033,0.010,-0.338,0.736,-0.022,0.016
Genre_Adventure,-0.0267,0.009,-3.134,0.002,-0.043,-0.010
Genre_Animation,0.0930,0.008,11.224,0.000,0.077,0.109
Genre_Biographical film,0.0205,0.008,2.579,0.010,0.005,0.036
Genre_Biography,0.0209,0.008,2.544,0.011,0.005,0.037
Genre_Black comedy,0.0369,0.008,4.802,0.000,0.022,0.052

0,1,2,3
Omnibus:,2164.957,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5362.96
Skew:,-0.97,Prob(JB):,0.0
Kurtosis:,5.562,Cond. No.,22.4


### Box Office Revenue

In [48]:
# Keep only films with a known box-office revenue. The filtered rows go into
# a new frame instead of mutating `movies_season` in place, so the Average
# Rating cells above still see the full table if they are re-run.
movies_season_revenue = movies_season.dropna(subset=["Movie_box_office_revenue"])

# Predictors: every column except the identifiers, the nb_of_* count
# columns and the two candidate targets. Target: box-office revenue.
X = movies_season_revenue.drop(columns=["Wikipedia_movie_ID","Movie_name","nb_of_Genres","nb_of_Languages","nb_of_Countries","Movie_box_office_revenue","averageRating"])
y = movies_season_revenue["Movie_box_office_revenue"]

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned on the training split only, rebuild
# DataFrames that keep the original index/columns, and add the intercept
# column for statsmodels.
# has_constant='add' forces the `const` column even when a scaled feature is
# constant within a split (e.g. a rare dummy that never occurs in the test
# rows); the default 'skip' would silently leave `const` out in that case.
scaler = StandardScaler()
scaled_X_train = sm.add_constant(pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index,columns=X_train.columns),has_constant='add')
scaled_X_test = sm.add_constant(pd.DataFrame(scaler.transform(X_test),index=X_test.index,columns=X_test.columns),has_constant='add')

In [49]:
# Ridge regression (alpha=1.0) for box-office revenue, using the
# season-augmented features from the training split.
# NOTE(review): the model is fit on the unscaled X_train / X_test; the
# standardised frames are only used by the OLS cell. Confirm this is
# intended, since the ridge penalty is sensitive to feature scale.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Held-out error and goodness of fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Root Mean Squared Error: 85990580.7713244
R-squared: 0.5411346580894989


In [50]:
# OLS regression of box-office revenue on the standardised,
# season-augmented training features (intercept column included),
# followed by its summary table.
ols_spec = sm.OLS(endog=y_train, exog=scaled_X_train)
model = ols_spec.fit()
model.summary()

0,1,2,3
Dep. Variable:,Movie_box_office_revenue,R-squared:,0.483
Model:,OLS,Adj. R-squared:,0.475
Method:,Least Squares,F-statistic:,58.57
Date:,"Fri, 13 Dec 2024",Prob (F-statistic):,0.0
Time:,17:27:11,Log-Likelihood:,-90272.0
No. Observations:,4580,AIC:,180700.0
Df Residuals:,4507,BIC:,181200.0
Df Model:,72,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.866e+07,1.31e+06,44.830,0.000,5.61e+07,6.12e+07
Year,1.462e+07,4.97e+06,2.940,0.003,4.87e+06,2.44e+07
Genre_Action,7.422e+06,1.99e+06,3.730,0.000,3.52e+06,1.13e+07
Genre_Action/Adventure,3.304e+06,1.92e+06,1.722,0.085,-4.57e+05,7.07e+06
Genre_Adventure,7.961e+06,1.57e+06,5.064,0.000,4.88e+06,1.1e+07
Genre_Animation,7.446e+06,1.54e+06,4.821,0.000,4.42e+06,1.05e+07
Genre_Biographical film,-7.767e+05,1.47e+06,-0.527,0.598,-3.67e+06,2.11e+06
Genre_Biography,-9.548e+05,1.5e+06,-0.635,0.525,-3.9e+06,1.99e+06
Genre_Black comedy,-6.076e+06,1.39e+06,-4.376,0.000,-8.8e+06,-3.35e+06

0,1,2,3
Omnibus:,5407.312,Durbin-Watson:,1.953
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2757474.163
Skew:,5.609,Prob(JB):,0.0
Kurtosis:,122.682,Cond. No.,24.6
