In [129]:
import pandas as pd
from statsmodels.tsa.ardl import ardl_select_order
from statsmodels.tsa.ardl import ARDL

import numpy as np

import sys
sys.path.append("../src/model")

from ARDL_model import ARDL_model_func

# Folder holding the cleaned datasets (relative path); every read_csv call in
# this notebook builds its file name from it.
DATA_PATH = "../data/CLEAN"

# Violent movie data

In [10]:
# Load the violent-movie table; the file is tab-separated.
df_box_offices = pd.read_csv(
    DATA_PATH + "/Violent_Movies_final.tsv",
    sep="\t",
)

In [11]:
# Run the project helper ARDL_model_func on the raw film table.
# NOTE(review): despite the helper's name, the frame displayed in the next cell
# has one row per (Year, Week) with a film count and a revenue figure rather
# than a fitted model -- confirm against ARDL_model.py in ../src/model.
weekly_revenue_films = ARDL_model_func(df_box_offices)

In [12]:
# OPEN QUESTIONS
# 1) Should the analysis be restricted to the years 1950-2012? Outside that
#    range there are almost no weekly values (possibly because of the wars?).
# 2) Proposal: fill every missing week with 0 films and 0 box-office revenue.


weekly_revenue_films.head()

Unnamed: 0,Year,Week,no. films released,Box office revenue
0,1913,33,1,980000.0
1,1914,46,1,87028.0
2,1915,6,1,50000000.0
3,1923,47,1,4168790.0
4,1924,49,1,274827.0


# Real world violence data

In [16]:
# Load the merged FBI violence scores (comma-separated) and preview the first rows.
df_real_violence = pd.read_csv(
    DATA_PATH + "/FBI_91_12/Scores/0_violence_scores_merged.csv",
    sep=",",
)
df_real_violence.head()

Unnamed: 0,Year,Week,Violence_score
0,1991,1,3798
1,1991,2,2869
2,1991,3,2842
3,1991,4,2871
4,1991,5,3302


In [29]:
# Collapse the individual rows (one per state, according to the original note)
# into a single USA-wide violence score per (Year, Week), ordered chronologically.
weekly_violence_USA = (
    df_real_violence
    .groupby(["Year", "Week"])["Violence_score"]
    .sum()
    .reset_index()
    .sort_values(["Year", "Week"], ascending=True)
)
weekly_violence_USA.head()

Unnamed: 0,Year,Week,Violence_score
0,1991,1,9679
1,1991,2,6439
2,1991,3,6556
3,1991,4,6651
4,1991,5,7336


# Match both datasets in timespan

In [40]:
# The violence series defines the analysis window (1991 to 2012 according to
# the original note): take its first and last year as bounds.
violence_years = weekly_violence_USA["Year"]
year_start, year_stop = violence_years.min(), violence_years.max()

# The film revenue for 2012 is incomplete (the original note says it stops at
# week 42, included), so look up the last 2012 week that has film data.
df_temp = weekly_revenue_films.loc[weekly_revenue_films["Year"] == 2012]
week_stop_2012 = df_temp["Week"].max()

In [None]:
# Keep only the film weeks whose year lies inside the violence window
# (year_start and year_stop are both inclusive).
weekly_revenue_films_cut = weekly_revenue_films[
    (weekly_revenue_films["Year"] >= year_start)
    & (weekly_revenue_films["Year"] <= year_stop)
]

# Trim the violence series so it ends on the last 2012 week that has film data.
# The cut-off is week_stop_2012 (computed in the previous cell from the film
# table) instead of a hard-coded 42, so both frames stay aligned if the film
# data is ever extended.
weekly_violence_USA_cut = weekly_violence_USA[
    (weekly_violence_USA["Year"] < 2012)
    | (
        (weekly_violence_USA["Year"] == 2012)
        & (weekly_violence_USA["Week"] <= week_stop_2012)
    )
]


Unnamed: 0,Year,Week,no. films released,Box office revenue
862,1991,2,2,38867309.0
863,1991,3,2,20038851.0
864,1991,5,2,8614328.0
865,1991,6,2,175738109.0
866,1991,7,1,272742922.0


# Merge the two dataframes

In [69]:
# Left-join on the violence weeks so that every week of the violence series is
# kept, including weeks in which no film was released.
merged_violence = pd.merge(
    weekly_violence_USA_cut,
    weekly_revenue_films_cut,
    on=["Year", "Week"],
    how="left",
)

# Weeks without a release come out of the join as NaN: treat them as
# "0 films, 0 revenue". The cast uses an explicit 64-bit integer because plain
# `int` maps to a 32-bit integer on Windows with NumPy < 2, and weekly revenue
# totals can exceed 2**31 - 1.
for column in ("no. films released", "Box office revenue"):
    merged_violence[column] = merged_violence[column].fillna(0).astype("int64")

merged_violence = merged_violence.sort_values(["Year", "Week"], ascending=True)

merged_violence.head()

Unnamed: 0,Year,Week,Violence_score,no. films released,Box office revenue
0,1991,1,9679,0,0
1,1991,2,6439,2,38867309
2,1991,3,6556,2,20038851
3,1991,4,6651,0,0
4,1991,5,7336,2,8614328


# Add the time dummies

In [64]:
# Build a temporary "<Year>-<Week>" text label per row; the next cell turns it
# into indicator columns.
merged_violence["Year-Week"] = (
    merged_violence["Year"].astype(str)
    .str.cat(merged_violence["Week"].astype(str), sep="-")
)

In [67]:
# One 0/1 indicator column per Year-Week label (the first label is dropped as
# the reference level); the helper label column itself is removed again.
# NOTE(review): the merged frame appears to hold exactly one row per
# (Year, Week), which would make this one indicator per observation -- such a
# set of dummies cannot be estimated in a regression. Confirm whether
# week-of-year (seasonal) dummies were intended instead.
time_dummies = pd.get_dummies(merged_violence["Year-Week"], drop_first=True, dtype=int)
merged_violence_with_dummies = merged_violence.drop(columns="Year-Week").join(time_dummies)
merged_violence_with_dummies.head()

Unnamed: 0,Year,Week,Violence_score,no. films released,Box office revenue,1991-10,1991-11,1991-12,1991-13,1991-14,...,2012-39,2012-4,2012-40,2012-41,2012-42,2012-5,2012-6,2012-7,2012-8,2012-9
0,1991,1,9679,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1991,2,6439,2,38867309,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1991,3,6556,2,20038851,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1991,4,6651,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1991,5,7336,2,8614328,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Try out ARDL model

In [123]:
# Upper bound for the auto-regressive part (lags of the dependent variable)
# that the order selection is allowed to consider.
max_auto_lag = 6            # take into account max. 6 previous timesteps

# Upper bound for the distributed-lag part (lags of the film regressor).
max_film_lag = 6            # take into account max. 6 previous timesteps
#max_unemployment_lag = 1    # take into account max 1 previous timestep

In [124]:
# Search for the lag structure with the lowest AIC: up to max_auto_lag lags of
# the violence score and up to max_film_lag lags of the box-office revenue.
#
# Only "Box office revenue" is handed over as exog. `maxorder` names that column
# alone, and statsmodels leaves out every exog column that has no entry in the
# order dictionary (emitting a SpecificationWarning about the missing keys).
# Passing the whole frame, time dummies included, therefore only produced the
# warning; the candidate models are the same.
# NOTE(review): no `trend` is given here, so the statsmodels default applies,
# while the final fit uses trend="ct" -- confirm whether the selection should
# use the same trend specification.
selected_order = ardl_select_order(
    endog=merged_violence_with_dummies["Violence_score"],
    exog=merged_violence_with_dummies[["Box office revenue"]],
    maxlag=max_auto_lag,
    maxorder={"Box office revenue": max_film_lag},
    ic="aic",
)

  return _format_order(self.data.orig_exog, order, self._causal)
  return _format_order(self.data.orig_exog, order, self._causal)


In [125]:
# Auto-regressive lags of Violence_score kept by the order selection.
print(selected_order.ar_lags)

[1, 2, 3, 4]


In [126]:
# Distributed lags kept by the order selection (mapping: exog column -> lags).
print(selected_order.dl_lags)

{'Box office revenue': [0, 1, 2, 3, 4]}


In [134]:
# Fit the final ARDL with the lag structure found by the order selection, plus
# a constant and a linear trend (trend="ct"). Note that `model` holds the
# fitted results object returned by .fit().
#
# Only "Box office revenue" is handed over as exog. `order` names that column
# alone, and statsmodels leaves out every exog column that has no entry in
# `order` (emitting a SpecificationWarning about the missing keys). This is why
# the time dummies never showed up in the summary when the whole frame was
# passed. The estimates are unchanged; only the warning goes away.
# NOTE(review): to actually include deterministic time effects, pass them via
# `fixed=` (or use `seasonal=True` with a `period`). The Year-Week dummies built
# earlier appear to be one indicator per observation, which could not be
# estimated -- confirm the intended specification before wiring them in.
model = ARDL(
    endog=merged_violence_with_dummies["Violence_score"],
    exog=merged_violence_with_dummies[["Box office revenue"]],
    lags=selected_order.ar_lags,
    order=selected_order.dl_lags,
    trend="ct",
).fit()

# Display model summary
print(model.summary())

                              ARDL Model Results                              
Dep. Variable:         Violence_score   No. Observations:                 1142
Model:                     ARDL(4, 4)   Log Likelihood              -10546.136
Method:               Conditional MLE   S.D. of innovations           2561.417
Date:                Wed, 11 Dec 2024   AIC                          21116.271
Time:                        18:42:03   BIC                          21176.716
Sample:                             4   HQIC                         21139.100
                                 1142                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   281.2043    163.127      1.724      0.085     -38.863     601.271
trend                     3.7009      0.853      4.336      0.000       2.026       5.375
Violence

  return _format_order(self.data.orig_exog, order, self._causal)
