In [27]:
import sys
import os

import pandas as pd
import numpy as np

import random
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm



In [28]:
# Load the movie dataset from CSV; the bare expression below renders the frame as the cell output
final_df = pd.read_csv('../data/movie_final_dataset.csv')
final_df

Unnamed: 0,Name,Genres,Budget(USD)_Inflated,Domestic(USD)_Inflated,Domestic_Percentage,Foreign(USD)_Inflated,Foreign_Percentage,Worldwide(USD)_Inflated,Runtime(mins),Rating,...,period piece,animation,teen,film adaptation,musical,history,coming of age,sports,war,Worldwide_profit
0,10 cloverfield lane,"drama, thriller, horror, sci-fi",6.076746e+06,8.760602e+07,0.654010,4.634613e+07,0.345990,1.339521e+08,103,pg-13,...,0,0,0,0,0,0,0,0,0,22.043400
1,"10,000 bc","drama, thriller, action, adventure, romance fi...",1.441563e+08,1.301309e+08,0.351333,2.402606e+08,0.648667,3.703914e+08,109,pg-13,...,0,0,0,0,0,0,0,0,0,2.569373
2,12 rounds,"thriller, action, crime",2.726275e+07,1.667757e+07,0.708013,6.877890e+06,0.291987,2.355546e+07,108,pg-13,...,0,0,0,0,0,0,0,0,0,0.864016
3,12 strong,"drama, action, history, war",3.979050e+07,5.209113e+07,0.644274,2.876133e+07,0.355726,8.085246e+07,130,r,...,0,0,0,0,0,1,0,0,1,2.031954
4,12 years a slave,"drama, biography, history",2.539260e+07,7.195247e+07,0.301875,1.663993e+08,0.698125,2.383517e+08,134,r,...,0,0,0,0,0,1,0,0,0,9.386660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2684,zoolander,comedy,4.851315e+07,7.826600e+07,0.743197,2.704388e+07,0.256803,1.053099e+08,90,pg-13,...,0,0,0,0,0,0,0,0,0,2.170749
2685,zoolander 2,"comedy, action, adventure, romance film, mystery",6.076746e+07,3.506124e+07,0.508592,3.387664e+07,0.491408,6.893788e+07,101,pg-13,...,0,0,0,0,0,0,0,0,0,1.134454
2686,zoom,"action, adventure, fantasy, sci-fi",5.014200e+07,1.717625e+07,0.958658,7.407176e+05,0.041342,1.791697e+07,93,pg,...,0,0,0,0,0,0,0,0,0,0.357325
2687,zootopia,"comedy, action, adventure, mystery, crime, fam...",1.823024e+08,4.147601e+08,0.333340,8.294952e+08,0.666660,1.244255e+09,108,pg,...,0,1,0,0,0,0,0,0,0,6.825228


In [29]:
# Inspect the column names of the loaded dataset
final_df.columns

Index(['Name', 'Genres', 'Budget(USD)_Inflated', 'Domestic(USD)_Inflated',
       'Domestic_Percentage', 'Foreign(USD)_Inflated', 'Foreign_Percentage',
       'Worldwide(USD)_Inflated', 'Runtime(mins)', 'Rating', 'Audience_Score',
       'Critics_Score', 'plot_summary', 'Month', 'Day', 'Year',
       'Foreign_higher', 'emotion', 'emotion_score', 'drama', 'comedy',
       'thriller', 'action', 'adventure', 'romance film', 'horror', 'fantasy',
       'mystery', 'crime', 'indie', 'biography', 'family', 'sci-fi',
       'period piece', 'animation', 'teen', 'film adaptation', 'musical',
       'history', 'coming of age', 'sports', 'war', 'Worldwide_profit'],
      dtype='object')

## Pre-processing for OLS 

In [30]:
# Dropping columns deemed irrelevant for the model. Each column is listed exactly once,
# grouped by kind so the list is easy to audit.
columns_to_drop = [
    # text / identifier columns
    "Name", "Genres", "plot_summary",
    # release day (Month and Year are kept)
    "Day",
    # revenue figures
    "Domestic(USD)_Inflated", "Foreign(USD)_Inflated", "Worldwide(USD)_Inflated",
    # remaining columns not used by the model
    "Foreign_higher", "emotion_score", "Worldwide_profit",
]
df_ols = final_df.drop(columns=columns_to_drop)

In [31]:
# Preview the first rows of the reduced frame
df_ols.head(5)

Unnamed: 0,Budget(USD)_Inflated,Domestic_Percentage,Foreign_Percentage,Runtime(mins),Rating,Audience_Score,Critics_Score,Month,Year,emotion,...,sci-fi,period piece,animation,teen,film adaptation,musical,history,coming of age,sports,war
0,6076746.0,0.65401,0.34599,103,pg-13,0.79,0.91,Mar,2016,fear,...,1,0,0,0,0,0,0,0,0,0
1,144156300.0,0.351333,0.648667,109,pg-13,0.37,0.1,Mar,2008,fear,...,0,0,0,0,0,0,0,0,0,0
2,27262750.0,0.708013,0.291987,108,pg-13,0.45,0.31,Mar,2009,anger,...,0,0,0,0,0,0,0,0,0,0
3,39790500.0,0.644274,0.355726,130,r,0.62,0.5,Jan,2018,fear,...,0,0,0,0,0,0,1,0,0,1
4,25392600.0,0.301875,0.698125,134,r,0.9,0.95,Oct,2013,sadness,...,0,0,0,0,0,0,1,0,0,0


Standardizing columns

In [32]:
# Standardize the continuous columns with scikit-learn's StandardScaler and write
# the scaled values back into df_ols.
# Note: the two percentage columns that later serve as regression targets are scaled too.
columns_to_scale = [
    'Budget(USD)_Inflated',
    'Domestic_Percentage',
    'Foreign_Percentage',
    'Runtime(mins)',
    'Audience_Score',
    'Critics_Score',
]
scaler = StandardScaler().fit(df_ols[columns_to_scale])
scaled_data = scaler.transform(df_ols[columns_to_scale])
df_ols[columns_to_scale] = scaled_data


One hot encoding columns

In [33]:
# One-hot encode the Rating, Month and emotion columns in a single call.
# drop_first=True omits the first category of each variable to avoid introducing
# multicollinearity between the dummies and the intercept.
df_ols = pd.get_dummies(df_ols, columns=['Rating', 'Month', 'emotion'], drop_first=True)

# Only 5 nc-17 movies, not considered for analysis.
# NOTE(review): this removes the indicator column only; the nc-17 rows stay in df_ols
# and can then no longer be told apart from the omitted baseline rating - confirm this is intended.
df_ols = df_ols.drop(columns="Rating_nc-17")


Checking for multicollinearity with VIF

In [34]:
# Function calculating the VIF (Variance inflation factor)
def calculate_vif(dataframe):
    """
    Computes the variance inflation factor (VIF) for the numeric columns of a DataFrame.

    Non-numeric columns are ignored. ``sm.add_constant`` is applied before the
    computation, and any row named "const" is removed from the result.

    Parameters:
        dataframe (pd.DataFrame): Candidate predictor columns.

    Returns:
        pd.DataFrame: Two columns, "Feature" (column name) and "VIF" (its variance inflation factor).
    """
    # Keep numeric columns only and add the constant column expected by the VIF computation
    design = sm.add_constant(dataframe.select_dtypes(include=['number']))
    design_values = design.values

    # One VIF per column of the design matrix, addressed by column position
    vif_table = pd.DataFrame({
        "Feature": design.columns,
        "VIF": [
            variance_inflation_factor(design_values, position)
            for position in range(design.shape[1])
        ],
    })

    # The constant is a helper column, not a feature of interest
    return vif_table[vif_table["Feature"] != "const"]



In [35]:
# Build the predictor matrix for the VIF check: drop the two target variables and Year, then cast to float
predictors = df_ols.drop(columns=["Foreign_Percentage", "Domestic_Percentage", "Year"]).astype(float)
# Saving for later use in final_results.ipynb
# NOTE(review): this file is written before "Rating_r" is dropped further down, so it still contains that column - confirm this is intended
predictors.to_csv("../src/data/OLS_data/predictors.csv", index=False)
vif_results = calculate_vif(predictors)
print(vif_results)

                 Feature        VIF
1   Budget(USD)_Inflated   2.357069
2          Runtime(mins)   1.994242
3         Audience_Score   2.190690
4          Critics_Score   2.107627
5                  drama   1.616573
6                 comedy   1.587771
7               thriller   1.816373
8                 action   1.958124
9              adventure   1.942613
10          romance film   1.304259
11                horror   1.499739
12               fantasy   1.331925
13               mystery   1.283100
14                 crime   1.406517
15                 indie   1.140862
16             biography   1.312084
17                family   2.368413
18                sci-fi   1.296999
19          period piece   1.202336
20             animation   1.651701
21                  teen   1.164074
22       film adaptation   1.106382
23               musical   1.089489
24               history   1.231568
25         coming of age   1.215961
26                sports   1.105325
27                   war   1

The one-hot encoded rating columns seem to be multicollinear; removing one of the rating columns is therefore necessary.

In [36]:
# Repeat the VIF check with "Rating_r" removed in addition to the target variables and Year
predictors = df_ols.drop(columns=["Foreign_Percentage", "Domestic_Percentage", "Year", "Rating_r"]).astype(float)
vif_results = calculate_vif(predictors)
print(vif_results)

                 Feature       VIF
1   Budget(USD)_Inflated  2.338089
2          Runtime(mins)  1.964846
3         Audience_Score  2.189151
4          Critics_Score  2.107565
5                  drama  1.611259
6                 comedy  1.568428
7               thriller  1.812405
8                 action  1.918804
9              adventure  1.914379
10          romance film  1.304259
11                horror  1.491241
12               fantasy  1.331849
13               mystery  1.283076
14                 crime  1.403300
15                 indie  1.139938
16             biography  1.312038
17                family  2.339399
18                sci-fi  1.293860
19          period piece  1.202083
20             animation  1.632124
21                  teen  1.163777
22       film adaptation  1.105897
23               musical  1.089058
24               history  1.231371
25         coming of age  1.215783
26                sports  1.103321
27                   war  1.201854
28             Ratin

Removing the "Rating_r" column thus resolves the issue.

In [37]:
# Make the removal permanent so every model below is fitted without "Rating_r"
df_ols = df_ols.drop(columns = "Rating_r")

## Defining features and labels for the 4 different models
Labels: foreign or domestic percentage (the prediction target)
Predictors: the same feature set, fitted on all movies and separately on movies before 2010 and from 2010 onward

In [38]:
# Split the data by release year; Year is dropped from every frame because it is not used as a feature

# Movies before 2010
df_before_2010 = df_ols[df_ols['Year'] < 2010].drop(columns= "Year")

# Movies including and after 2010
df_2010_and_after = df_ols[df_ols['Year'] >= 2010].drop(columns= "Year")

# Full dataset (all years), likewise without the Year column
df_ols = df_ols.drop(columns= "Year")


Function to train OLS model

In [39]:
def train_ols_model(dataframe, target_column, drop_columns):
    """
    Fits an ordinary least squares regression of one column on the remaining ones.

    Parameters:
        dataframe (pd.DataFrame): Input data holding both the predictors and the target.
        target_column (str): Name of the column used as the dependent variable.
        drop_columns (list): Columns removed from the DataFrame before fitting. The target
            column is only kept out of the predictors if it is listed here.

    Returns:
        ols_model: The fitted model, i.e. the object returned by ``sm.OLS(...).fit()``.
    """
    # Predictors are whatever is left after dropping, plus the constant column for the intercept
    design = sm.add_constant(dataframe.drop(columns=drop_columns))
    predictor_names = list(design.columns)

    # Force both sides to float so statsmodels receives purely numeric input;
    # the response keeps the target's name so it shows up in the model summary
    design = pd.DataFrame(design, columns=predictor_names, dtype=float)
    response = pd.Series(dataframe[target_column], name=target_column, dtype=float)

    return sm.OLS(response, design).fit()

Function to save the feature statistics for later use

In [40]:
def extract_feature_statistics(ols_model, output_csv_path=None):
    """
    Collects the per-feature statistics of a fitted OLS model in one table.

    Parameters:
        ols_model: Fitted model exposing ``params``, ``bse``, ``tvalues`` and ``pvalues``.
        output_csv_path (str, optional): If given, the table is also written to this CSV path
            (without the index). Defaults to None.

    Returns:
        feature_stats (pd.DataFrame): One row per model term with the columns
        "Feature", "Coefficient", "Std_Error", "t_value" and "p_value".
    """
    # Output column name -> model attribute that supplies its values
    stat_sources = (
        ("Coefficient", "params"),   # coef
        ("Std_Error", "bse"),        # std err
        ("t_value", "tvalues"),      # t
        ("p_value", "pvalues"),      # P>|t|
    )

    # Feature names come from the index of the coefficient series
    table = {"Feature": ols_model.params.index}
    for column_name, attribute in stat_sources:
        table[column_name] = getattr(ols_model, attribute).values
    feature_stats = pd.DataFrame(table)

    # Persist only when a path was supplied
    if output_csv_path:
        feature_stats.to_csv(output_csv_path, index=False)

    return feature_stats

### 1st model: Foreign Percentage prediction 

In [41]:
# Defining the target column as well as which columns to drop
# (both percentage columns are dropped so that neither ends up among the predictors)
target_column = "Foreign_Percentage"
drop_columns = ["Foreign_Percentage", "Domestic_Percentage"]
# Path to save CSV file containing the feature statistics
# NOTE(review): the same path is written again later in this notebook with only the significant, renamed features
output_csv_path = "../src/data/OLS_data/feature_stats_Foreign.csv"

# Train the model on the full dataset (all years)
ols_model = train_ols_model(df_ols, target_column, drop_columns)

# Extract feature statistics and save to CSV
feature_stats_Foreign = extract_feature_statistics(ols_model, output_csv_path)

# Displaying the model summary
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:     Foreign_Percentage   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     18.52
Date:                Fri, 20 Dec 2024   Prob (F-statistic):          7.91e-127
Time:                        23:40:55   Log-Likelihood:                -3439.8
No. Observations:                2689   AIC:                             6974.
Df Residuals:                    2642   BIC:                             7251.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.0547 

### "2nd" model: Domestic Percentage prediction 

In [42]:
# Defining the target column as well as which columns to drop
# (both percentage columns are dropped so that neither ends up among the predictors)
target_column = "Domestic_Percentage"
drop_columns = ["Foreign_Percentage", "Domestic_Percentage"]
# Path to save CSV file containing the feature statistics
output_csv_path = "../src/data/OLS_data/feature_stats_Domestic.csv"

# Train the model on the full dataset (all years)
ols_model = train_ols_model(df_ols, target_column, drop_columns)

# Extract feature statistics and save to CSV
feature_stats_domestic = extract_feature_statistics(ols_model, output_csv_path)

# Displaying the model summary
print(ols_model.summary())

                             OLS Regression Results                            
Dep. Variable:     Domestic_Percentage   R-squared:                       0.244
Model:                             OLS   Adj. R-squared:                  0.231
Method:                  Least Squares   F-statistic:                     18.52
Date:                 Fri, 20 Dec 2024   Prob (F-statistic):          7.91e-127
Time:                         23:40:56   Log-Likelihood:                -3439.8
No. Observations:                 2689   AIC:                             6974.
Df Residuals:                     2642   BIC:                             7251.
Df Model:                           46                                         
Covariance Type:             nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 

The only difference between the two models is the sign of the coefficients (the domestic and foreign percentages sum to 1, so one target mirrors the other). Subsequent analysis therefore uses only Foreign_Percentage as the target column.

### Before 2010, foreign prediction

In [43]:
# Defining the target column as well as which columns to drop
# (both percentage columns are dropped so that neither ends up among the predictors)
target_column = "Foreign_Percentage"
drop_columns = ["Foreign_Percentage", "Domestic_Percentage"]
# Path to save CSV file containing the feature statistics
# NOTE(review): the same path is written again later in this notebook with only the significant, renamed features
output_csv_path = "../src/data/OLS_data/feature_stats_pre_2010.csv"

# Train the model on movies released before 2010
ols_model = train_ols_model(df_before_2010, target_column, drop_columns)

# Extract feature statistics and save to CSV
feature_stats_pre_2010 = extract_feature_statistics(ols_model, output_csv_path)

# Displaying the model summary
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:     Foreign_Percentage   R-squared:                       0.262
Model:                            OLS   Adj. R-squared:                  0.233
Method:                 Least Squares   F-statistic:                     9.123
Date:                Fri, 20 Dec 2024   Prob (F-statistic):           2.27e-51
Time:                        23:40:56   Log-Likelihood:                -1481.4
No. Observations:                1232   AIC:                             3057.
Df Residuals:                    1185   BIC:                             3297.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -0.0654 

### Including and after 2010, foreign prediction

In [44]:
# Defining the target column as well as which columns to drop
# (both percentage columns are dropped so that neither ends up among the predictors)
target_column = "Foreign_Percentage"
drop_columns = ["Foreign_Percentage", "Domestic_Percentage"]
# Path to save CSV file containing the feature statistics
# NOTE(review): the same path is written again later in this notebook with only the significant, renamed features
output_csv_path = "../src/data/OLS_data/feature_stats_post_2010.csv"

# Train the model on movies released in 2010 or later
ols_model = train_ols_model(df_2010_and_after, target_column, drop_columns)

# Extract feature statistics and save to CSV
feature_stats_post_2010 = extract_feature_statistics(ols_model, output_csv_path)

# Displaying the model summary
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:     Foreign_Percentage   R-squared:                       0.265
Model:                            OLS   Adj. R-squared:                  0.241
Method:                 Least Squares   F-statistic:                     11.06
Date:                Fri, 20 Dec 2024   Prob (F-statistic):           6.55e-66
Time:                        23:40:56   Log-Likelihood:                -1873.8
No. Observations:                1457   AIC:                             3842.
Df Residuals:                    1410   BIC:                             4090.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -0.0423 

## Comparing the statistically significant coefficients

### Filtering significant features

In [45]:
# Preview the statistics table of the all-years model before filtering
feature_stats_Foreign.head(5)

Unnamed: 0,Feature,Coefficient,Std_Error,t_value,p_value
0,const,0.054742,0.090505,0.604854,0.5453278
1,Budget(USD)_Inflated,0.231861,0.025869,8.962917,5.839609e-19
2,Runtime(mins),0.061409,0.023714,2.589517,0.009663843
3,Audience_Score,-0.156608,0.025031,-6.256467,4.579128e-10
4,Critics_Score,0.116545,0.024561,4.745215,2.193754e-06


Defining a function that removes all features with a p-value > 0.05

In [46]:
def filter_significant_features(feature_stats, p_value_threshold=0.05):
    """
    Keeps only the features whose p-value does not exceed a threshold.

    Parameters:
        feature_stats (pd.DataFrame): Feature statistics containing a "p_value" column.
        p_value_threshold (float): Largest p-value still treated as significant (inclusive).
            Defaults to 0.05.

    Returns:
        significant_features (pd.DataFrame): An independent copy of the rows with
        p_value <= p_value_threshold. The input DataFrame is left untouched.
    """
    # Filter the DataFrame based on the p-value threshold
    is_significant = feature_stats["p_value"] <= p_value_threshold

    # Return an explicit copy rather than a slice of the input, so that callers can
    # assign to columns of the result without pandas raising SettingWithCopyWarning
    significant_features = feature_stats[is_significant].copy()

    return significant_features

In [47]:
# Keep only the significant rows; from here on these names refer to the filtered tables
feature_stats_Foreign  = filter_significant_features(feature_stats_Foreign)
feature_stats_pre_2010 = filter_significant_features(feature_stats_pre_2010)
feature_stats_post_2010 = filter_significant_features(feature_stats_post_2010)


Renaming the statistically significant features for plotting:

In [48]:
# List the raw names of the significant features of the all-years model
extracted_feature_names = feature_stats_Foreign["Feature"].tolist()
extracted_feature_names

['Budget(USD)_Inflated',
 'Runtime(mins)',
 'Audience_Score',
 'Critics_Score',
 'comedy',
 'thriller',
 'action',
 'adventure',
 'romance film',
 'mystery',
 'animation',
 'teen',
 'sports',
 'Rating_pg',
 'Rating_pg-13',
 'Month_Dec',
 'Month_Nov',
 'emotion_surprise']

In [49]:
def clean_feature_names(dataframe):
    """
    Replaces raw model feature names with human-readable labels for plotting.

    Feature names without an entry in the mapping are kept unchanged.

    Parameters:
        dataframe (pd.DataFrame): Feature statistics containing a "Feature" column.

    Returns:
        pd.DataFrame: A new DataFrame whose "Feature" column holds the display labels.
        The input DataFrame is not modified.
    """
    # Raw feature name -> display label (each key appears exactly once)
    mapping = {
        # Continuous features
        'Budget(USD)_Inflated': 'Budget (USD) Inflated',
        'Runtime(mins)': 'Runtime (Minutes)',
        'Audience_Score': 'Audience Score',
        'Critics_Score': 'Critics Score',
        # Genre indicators
        'drama': 'Drama',
        'comedy': 'Comedy',
        'thriller': 'Thriller',
        'action': 'Action',
        'adventure': 'Adventure',
        'romance film': 'Romance Film',
        'horror': 'Horror',
        'fantasy': 'Fantasy',
        'mystery': 'Mystery',
        'crime': 'Crime',
        'indie': 'Indie',
        'biography': 'Biography',
        'family': 'Family',
        'sci-fi': 'Sci-Fi',
        'period piece': 'Period Piece',
        'animation': 'Animation',
        'teen': 'Teen',
        'film adaptation': 'Film Adaptation',
        'musical': 'Musical',
        'history': 'History',
        'coming of age': 'Coming Of Age',
        'sports': 'Sports',
        'war': 'War',
        # One-hot encoded month, rating and emotion columns
        'Month_Dec': 'Month: December',
        'Month_Nov': 'Month: November',
        'Month_Oct': 'Month: October',
        'Rating_pg-13': 'Rating: PG 13',
        'Rating_pg': 'Rating: PG',
        'emotion_surprise': 'Emotion: Surprise',
        'emotion_disgust': 'Emotion: Disgust',
    }

    # assign() builds a new frame, so the caller's DataFrame (possibly a filtered
    # slice of a larger table) is never written to
    return dataframe.assign(Feature=dataframe["Feature"].replace(mapping))


In [50]:
# Apply the display labels to the three filtered tables
feature_stats_Foreign = clean_feature_names(feature_stats_Foreign)
feature_stats_pre_2010 = clean_feature_names(feature_stats_pre_2010)
feature_stats_post_2010 = clean_feature_names(feature_stats_post_2010)

# Write the filtered, relabelled tables; these paths were already written once when the
# models were trained, so the earlier unfiltered files are replaced here
feature_stats_Foreign.to_csv("../src/data/OLS_data/feature_stats_Foreign.csv", index=False)
feature_stats_pre_2010.to_csv("../src/data/OLS_data/feature_stats_pre_2010.csv", index=False)
feature_stats_post_2010.to_csv("../src/data/OLS_data/feature_stats_post_2010.csv", index=False)

### Plotting the coefficients for the statistically significant features

In [51]:
def plot_feature_coefficients(dataframe, title, output_path = None):
    """
    Draws the feature coefficients as a Plotly bar chart, largest coefficient first.

    Parameters:
        dataframe (pd.DataFrame): Feature statistics to plot. The columns 'Feature',
            'Coefficient', 'Std_Error', 't_value' and 'p_value' are read.
        title (str): Title of the bar chart.
        output_path (str, optional): Path of the HTML file to write. If None, the plot is not saved.

    Returns:
        fig: The Plotly bar chart figure.
    """
    # Arrange the features in descending order of their coefficients
    ordered = dataframe.sort_values(by="Coefficient", ascending=False)

    # Bars are coloured by coefficient value; the remaining statistics appear on hover
    axis_labels = {'Feature': 'Feature Names', 'Coefficient': 'Coefficient Value'}
    fig = px.bar(
        ordered,
        x='Feature',
        y='Coefficient',
        hover_data=['Std_Error', 't_value', 'p_value'],
        color='Coefficient',
        labels=axis_labels,
        title=title,
        height=400,
    )

    # Centre the title horizontally, anchor it near the top, and set its font
    centred_title = {
        'text': title,
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    fig.update_layout(title=centred_title, title_font=dict(size=20, color="black"))

    # Without an output path there is nothing to write
    if not output_path:
        print("Plot not saved.")
        return fig

    # Save the plot as a standalone HTML file
    fig.write_html(output_path)
    print(f"Plot saved to {output_path}")
    return fig

In [52]:
# Significant coefficients of the all-years model; the chart is also written to an HTML file
plot_feature_coefficients(feature_stats_Foreign, "OLS Coefficients 2000-2019", "../_includes/OLS_features_all_years.html")


Plot saved to ../_includes/OLS_features_all_years.html


In [53]:
# Significant coefficients of the model fitted on movies before 2010; the chart is also written to an HTML file
plot_feature_coefficients(feature_stats_pre_2010, "OLS Coefficients 2000-2009", "../_includes/OLS_features_pre_2010.html")

Plot saved to ../_includes/OLS_features_pre_2010.html


In [54]:
# Significant coefficients of the model fitted on movies from 2010 onward; the chart is also written to an HTML file
plot_feature_coefficients(feature_stats_post_2010, "OLS Coefficients 2010-2019", "../_includes/OLS_features_post_2010.html")

Plot saved to ../_includes/OLS_features_post_2010.html
