In [1]:
import pandas as pd
from statsmodels.tsa.ardl import ardl_select_order
import numpy as np

import sys
sys.path.append("../src/model")

from ARDL_model import ARDL_model_func

In [2]:
DATA_PATH = "../data/CLEAN"

df_box_offices = pd.read_csv(DATA_PATH+"/Violent_Movies_final.tsv", sep="\t")

In [3]:
# Testing out the ARDL model: 
weekly_revenue_films = ARDL_model_func(df_box_offices)

In [4]:
# QUESTIONS
# 1) Only take the years 1950-2012 ? In other years we have nearly no weekly values ? Due to wars ? 
# 2) Fill all missing weeks with 0 films and 0 box_office_revenue


weekly_revenue_films.head()

Unnamed: 0,Year,Week,no. films released,Box office revenue
0,1913,33,1,980000.0
1,1914,46,1,87028.0
2,1915,6,1,50000000.0
3,1923,47,1,4168790.0
4,1924,49,1,274827.0


In [5]:
EXOG = weekly_revenue_films
EXOG

Unnamed: 0,Year,Week,no. films released,Box office revenue
0,1913,33,1,980000.0
1,1914,46,1,87028.0
2,1915,6,1,50000000.0
3,1923,47,1,4168790.0
4,1924,49,1,274827.0
...,...,...,...,...
1824,2012,36,5,312845733.0
1825,2012,38,1,10473039.0
1826,2012,39,1,136513833.0
1827,2012,41,1,2005099.0


In [13]:
# Setting the time frame for the auto-regressive part
max_auto_lag = 4            # take into account max. 4 previous timesteps

# Setting the time span for the distributed lag part
max_film_lag = 4            # take into account max. 4 previous timesteps
max_unemployment_lag = 1    # take into account max 1 previous timestep

# Include time-fixed effects
#time_fixed = True

# Include additional confounding factors
#include_confounding = True

# Get indicator variables for the year-week
EXOG["Year-Week"] = EXOG["Year"].astype(str) + "-" + EXOG["Week"].astype(str)
EXOG

Unnamed: 0,Year,Week,no. films released,Box office revenue,Year-Week
0,1913,33,1,980000.0,1913-33
1,1914,46,1,87028.0,1914-46
2,1915,6,1,50000000.0,1915-6
3,1923,47,1,4168790.0,1923-47
4,1924,49,1,274827.0,1924-49
...,...,...,...,...,...
1824,2012,36,5,312845733.0,2012-36
1825,2012,38,1,10473039.0,2012-38
1826,2012,39,1,136513833.0,2012-39
1827,2012,41,1,2005099.0,2012-41


In [18]:
# Create time dummies for weekly time-fixed effects
time_dummies = pd.get_dummies(EXOG["Year-Week"], drop_first=True).astype(int)
EXOG_with_dummies = pd.concat([EXOG.drop(columns="Year-Week"), time_dummies], axis=1)
EXOG_with_dummies.head()

Unnamed: 0,Year,Week,no. films released,Box office revenue,1914-46,1915-6,1923-47,1924-49,1925-53,1926-52,...,2012-35,2012-36,2012-38,2012-39,2012-4,2012-41,2012-42,2012-6,2012-7,2012-8
0,1913,33,1,980000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1914,46,1,87028.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1915,6,1,50000000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,47,1,4168790.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1924,49,1,274827.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
random_integers = np.random.randint(0, 100, size=1829)  # Adjust the range (0, 100) as needed

# Create the DataFrame
violence = pd.DataFrame({'violence_score': random_integers})

In [20]:
print(EXOG_with_dummies.dtypes)

Year                    int64
Week                    int64
no. films released      int64
Box office revenue    float64
1914-46                 int64
                       ...   
2012-41                 int64
2012-42                 int64
2012-6                  int64
2012-7                  int64
2012-8                  int64
Length: 1832, dtype: object


In [21]:
# Automatically select lag order based on AIC
# Call ardl_select_order function
selected_order = ardl_select_order(
    endog=violence['violence_score'], 
    exog=EXOG_with_dummies, 
    maxlag=max_auto_lag, 
    maxorder={"Box office revenue": max_film_lag}, 
    ic='aic'
)

  return _format_order(self.data.orig_exog, order, self._causal)
  return _format_order(self.data.orig_exog, order, self._causal)


In [28]:
print(selected_order.ar_lags)

[1, 2]


In [5]:
box_revenues_clean = df_box_offices.dropna()

In [9]:
weekly_revenues = box_revenues_clean.groupby(["Year", "Week"])["Box office revenue"].sum().reset_index()
weekly_no_films = box_revenues_clean.groupby(["Year", "Week"]).size().reset_index(name="no. films released")

weekly_films_revenues = pd.merge(weekly_no_films, weekly_revenues, on=["Year", "Week"], how="left") 
weekly_films_revenues_sorted = weekly_films_revenues.sort_values(["Year", "Week"], ascending=True)

In [10]:
weekly_films_revenues_sorted.head(50)

Unnamed: 0,Year,Week,no. films released,Box office revenue
0,1913,33,1,980000.0
1,1914,46,1,87028.0
2,1915,6,1,50000000.0
3,1923,47,1,4168790.0
4,1924,49,1,274827.0
5,1925,53,1,10738000.0
6,1926,52,1,1000000.0
7,1930,46,1,8000000.0
8,1930,52,1,780000.0
9,1931,47,1,12000000.0
