In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm


In [2]:
# Folder holding the pipeline's CSV outputs. Kept in a single constant so the
# notebook can be pointed at another machine's copy by editing one line.
DATA_DIR = r"C:\Users\lihou\Box\BudgetAnnouncements\data"

# Load the step-3 dataset that the rest of the notebook works from.
df = pd.read_csv(rf"{DATA_DIR}\step_3.csv")

In [4]:
def regression(df, y_col, x_col, hkm_col=None, verbose=False):
    """Fit an OLS regression of ``y_col`` on ``x_col`` (plus an optional interaction).

    Without ``hkm_col`` the model is ``y_col ~ const + x_col``. With ``hkm_col``
    it is ``y_col ~ const + x_col + hkm_col + x_col * hkm_col``, where the
    product is stored in a column named ``"interaction"``.

    Only the columns ``"year"``, ``"article_count"``, ``x_col``, ``y_col`` and
    (if given) ``hkm_col`` are used, and any row with a missing value in *any*
    of them is discarded before fitting. The caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain ``"year"`` and ``"article_count"`` in
        addition to the requested columns.
    y_col : str
        Name of the dependent variable column.
    x_col : str
        Name of the main regressor column.
    hkm_col : str, optional
        Name of a second regressor that is also interacted with ``x_col``.
    verbose : bool, default False
        When true, print one line per discarded row naming its index label,
        its year and the columns that were missing.

    Returns
    -------
    statsmodels regression results or str
        The fitted results object on success. Otherwise one of the status
        strings ``"no data"`` (nothing left after dropping missing values),
        ``"too few data points"`` (fewer than 5 complete rows) or
        ``"value error"`` (statsmodels raised ``ValueError``). Callers should
        check for a string before using the result as a model.
    """
    needed = ["year", "article_count", x_col, y_col]
    if hkm_col:
        needed.append(hkm_col)
    df_temp = df[needed]

    if verbose:
        na_info = df_temp[df_temp.isna().any(axis=1)]
        # iterrows() yields (index label, row); the label identifies the row,
        # whereas row.index would only list the column names.
        rows_dropped = [
            f"Year {row['year']}, Index {idx}: Column {row[row.isna()].index.tolist()}"
            for idx, row in na_info.iterrows()
        ]
        if rows_dropped:
            print(f"Dropped {len(rows_dropped)} row(s) with missing values:")
            for line in rows_dropped:
                print(f"  {line}")

    df_temp = df_temp.dropna()

    if len(df_temp) == 0:
        return "no data"
    if len(df_temp) < 5:
        return "too few data points"

    if hkm_col:
        # assign() returns a new frame, so the interaction column is never
        # written into an object derived from a slice of the caller's data.
        df_temp = df_temp.assign(interaction=df_temp[x_col] * df_temp[hkm_col])
        X = df_temp[[x_col, hkm_col, "interaction"]]
    else:
        X = df_temp[x_col]

    y = df_temp[y_col]
    X = sm.add_constant(X)

    try:
        model = sm.OLS(y, X).fit()
    except ValueError:
        print(f"ValueError: {y_col} ~ {x_col}")
        return "value error"

    print("Regression:")
    if not hkm_col:
        print(f"{y_col} ~ {x_col}")
    else:
        print(f"{y_col} ~ {x_col} + {hkm_col} + interaction")
    return model


In [8]:
# Work on a copy so the frame loaded from disk stays available unchanged.
df1 = df.copy()
# Label each row "<date_n5>_to_<date_p1>". Column-wise string concatenation
# replaces a Python-level row-by-row apply and also works on an empty frame.
df1["time_period"] = df1["date_n5"].astype(str) + "_to_" + df1["date_p1"].astype(str)

In [9]:
# Move "time_period" to the front. Selecting the remaining columns by name
# (instead of slicing off the last one) keeps this cell idempotent: running it
# a second time no longer duplicates "time_period" or drops the final column.
df1 = df1[["time_period"] + [col for col in df1.columns if col != "time_period"]]

In [10]:
import re

# name_change = {}

# for col in df1.columns:
#     if re.search(r"\_d$", col):
#         new_name = re.sub(r"\_d$", "", col)
#         name_change[col] = new_name
        
# df1 = df1.rename(columns=name_change)

In [11]:
# Missing-value count per column.
# Lift pandas' display limits (None = no limit) so the whole table is printed;
# these are global options and stay in effect for later cells.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(df1.isna().sum().rename("NA Count").to_frame())

               NA Count
time_period           0
date                  0
year                  0
dow                   0
note                  0
date_n5               0
date_p1               0
icr_d                37
icrf_d               37
ivwir_d              37
ilr_sq_d             37
2.5_n_d               0
5.0_n_d               0
10.0_n_d              0
15.0_n_d              0
20.0_n_d              1
30.0_n_d             72
2.5_r_d               8
5.0_r_d               8
10.0_r_d              8
15.0_r_d              8
20.0_r_d             11
30.0_r_d             72
2.5_i_d               8
5.0_i_d               8
10.0_i_d              8
15.0_i_d              8
20.0_i_d             11
30.0_i_d             72
gbpusd_o_d           18
gbpusd_c_d           18
ftse_o_d             18
ftse_c_d             18
gbpeur_o_d           18
gbpeur_c_d           18
rpi_1m_d             80
rpi_3m_d             80
rpi_6m_d             80
rpi_9m_d             80
rpi_1y_d             46
rpi_2y_d        

In [12]:
# Columns to remove, written as prefix + suffix groups; every generated name
# gets the "_d" ending used by the columns in df1.
to_drop_cols = [
    f"{prefix}{suffix}_d"
    for prefix, suffixes in (
        ("30.0_", ("n", "r", "i")),
        ("rpi_", ("1m", "3m", "6m", "9m")),
        (
            "cpi_",
            (
                "1m", "3m", "6m", "9m",
                "1y", "3y", "4y", "6y", "7y", "8y", "9y", "15y", "25y",
            ),
        ),
        (
            "ois_",
            (
                "2m", "4m", "5m", "7m", "8m", "9m", "10m", "11m", "18m",
                "2y", "3y", "4y", "6y", "7y", "8y", "9y", "12y", "25y",
            ),
        ),
    )
    for suffix in suffixes
]

df1 = df1.drop(to_drop_cols, axis="columns")

In [None]:
# Missing-value count per column of df1 as it stands at this point.
print(df1.isna().sum().rename("NA Count").to_frame())

               NA Count
time_period           0
date                  0
year                  0
dow                   0
note                  0
date_n5               0
date_p1               0
icr_d                37
icrf_d               37
ivwir_d              37
ilr_sq_d             37
2.5_n_d               0
5.0_n_d               0
10.0_n_d              0
15.0_n_d              0
20.0_n_d              1
2.5_r_d               8
5.0_r_d               8
10.0_r_d              8
15.0_r_d              8
20.0_r_d             11
2.5_i_d               8
5.0_i_d               8
10.0_i_d              8
15.0_i_d              8
20.0_i_d             11
gbpusd_o_d           18
gbpusd_c_d           18
ftse_o_d             18
ftse_c_d             18
gbpeur_o_d           18
gbpeur_c_d           18
rpi_1y_d             46
rpi_2y_d             46
rpi_3y_d             48
rpi_4y_d             48
rpi_5y_d             48
rpi_6y_d             48
rpi_7y_d             48
rpi_8y_d             48
rpi_9y_d        

In [64]:
# Record of every significant regression result found so far.

# reg = regression(df1, "10.0_n_d", "q1_ratio", "gbpeur_o_d")

# reg = regression(df1, "10.0_n_d", "q1_ratio")
reg = regression(df1, "10.0_n_d", "q1_ratio", "gbpusd_o_d")
# reg = regression(df1, "10.0_n_d", "q1_ratio", "rpi_1y_d")
# reg = regression(df1, "10.0_n_d", "q1_ratio", "2.5_r_d")

# regression() returns a status string ("no data", "too few data points",
# "value error") instead of a fitted model when it cannot estimate; calling
# .summary2() on that string would raise AttributeError.
if isinstance(reg, str):
    print(f"Regression not estimated: {reg}")
else:
    print(reg.summary2())

Regression:
10.0_n_d ~ q1_ratio + gbpusd_o_d + interaction
                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.447   
Dependent Variable: 10.0_n_d         AIC:                -91.7672
Date:               2024-11-03 13:27 BIC:                -82.6053
No. Observations:   73               Log-Likelihood:     49.884  
Df Model:           3                F-statistic:        20.41   
Df Residuals:       69               Prob (F-statistic): 1.42e-09
R-squared:          0.470            Scale:              0.015793
------------------------------------------------------------------
               Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
------------------------------------------------------------------
const          0.0448    0.0199   2.2543  0.0274   0.0051   0.0844
q1_ratio       0.1729    0.0475   3.6426  0.0005   0.0782   0.2675
gbpusd_o_d    -3.5446    0.6382  -5.5544  0.0000  -4.8177  -2.2715
interaction   -6.6575    1.5