In [1]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
import pickle as pkl
import matplotlib.pyplot as plt
import random
import pyfixest as pf
import statsmodels.api as sm
import itertools as it
import warnings

# Silence pandas PerformanceWarning specifically ...
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# ... and then ignore every warning category for the rest of the session.
# NOTE(review): this blanket filter also covers the PerformanceWarning filter
# above and hides numerical warnings from the regressions below; confirm this
# is intended.
warnings.filterwarnings('ignore')

In [20]:
# Candidate covariate groups for the model comparison. Every group maps
# "temp_vars" / "precip_vars" / "humidity_vars" to three column names: the base
# column plus its "_2" and "_3" variants (presumably polynomial terms; confirm
# against the data-preparation step).
# Groups are ordered: levels, daily-std, annual-std, then the same three with
# the "fd_" prefix.
model_groups = [
    {
        f"{climate_var}_vars": [
            f"{prefix}{climate_var}{transform}_unweighted{power_suffix}"
            for power_suffix in ("", "_2", "_3")
        ]
        for climate_var in ("temp", "precip", "humidity")
    }
    for prefix in ("", "fd_")
    for transform in ("", "_daily_std", "_annual_std")
]

In [3]:
def calculate_prediction_interval_accuracy(x, y, predictions, cov_mat, z_value=1.9603795):
    """Return the share of observations that fall inside their interval.

    For every row ``x_i`` the standard error is ``sqrt(x_i' @ cov_mat @ x_i)``
    and the interval is ``prediction_i +/- z_value * se_i``. The result is the
    fraction of ``y`` values lying inside their interval (bounds inclusive).

    NOTE(review): the standard error uses only the coefficient covariance; no
    residual-variance term is added, so the interval is narrower than a
    textbook prediction interval. Confirm this is intended.

    Parameters
    ----------
    x : DataFrame or 2-D array-like
        Design matrix; columns must be in the same order as ``cov_mat``.
    y : Series or 1-D array-like
        Observed values, aligned with ``x`` by position.
    predictions : Series or 1-D array-like
        Predicted values, aligned with ``x`` by position.
    cov_mat : DataFrame or 2-D array-like
        Covariance matrix of the coefficient estimates.
    z_value : float, optional
        Interval half-width multiplier; defaults to the original hard-coded
        constant.

    Returns
    -------
    float
        Coverage rate in [0, 1], or ``nan`` when ``x`` has no rows.
    """
    # Convert everything to plain arrays so alignment is positional. Indexing a
    # pandas Series with ``predictions[i]`` is label-based and breaks (or
    # silently misaligns) whenever the Series does not carry a 0..n-1 index.
    design = np.asarray(x, dtype=float)
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(predictions, dtype=float)
    covariance = np.asarray(cov_mat, dtype=float)

    if design.shape[0] == 0:
        return float("nan")

    # Row-wise quadratic form x_i' C x_i, computed for all rows at once.
    se_pred = np.sqrt(np.einsum("ij,jk,ik->i", design, covariance, design))
    lower = predicted - se_pred * z_value
    upper = predicted + se_pred * z_value
    covered = (observed >= lower) & (observed <= upper)
    return covered.mean()

In [5]:
def run_regression_with_cv(results, num_folds, target_var, temp_var_list=None, precip_var_list=None, humidity_var_list=None, incremental_effects=None, fe_string = "country + year"):
    """Fit an OLS model on each CV fold and record the mean in/out-of-sample MSE.

    For ``fold`` in ``range(num_folds)`` the train/test CSVs
    ``../data/regression/cross_validation/gdp_{train,test}_data_{fold}.csv`` are
    read, an OLS with intercept is fitted on the training data and the mean
    squared error is computed on both files. The fold averages are appended to
    ``results``.

    Parameters
    ----------
    results : dict
        Must contain the list-valued keys ``"covariate_string"``,
        ``"in_sample_mse"`` and ``"out_sample_mse"``; one entry is appended to
        each. The dict is mutated in place and also returned.
    num_folds : int
        Number of fold files to iterate over.
    target_var : str
        Name of the dependent-variable column.
    temp_var_list, precip_var_list, humidity_var_list : list of str or None
        Covariate column names; at least one must be non-empty.
    incremental_effects : int or None
        When given, every training column ending in
        ``incremental_effect_1`` .. ``incremental_effect_<n>`` is added as a
        covariate.
    fe_string : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        The same ``results`` object.

    Raises
    ------
    AssertionError
        If no covariate list is supplied.
    """
    # The original ``any([...]) != None`` compared a bool to None and could never fail.
    assert any([temp_var_list, precip_var_list, humidity_var_list]), \
        "at least one of temp_var_list, precip_var_list, humidity_var_list is required"

    base_vars = [
        var
        for var_group in (temp_var_list, precip_var_list, humidity_var_list)
        if var_group is not None
        for var in var_group
    ]

    # Accumulate across folds; resetting these inside the loop would keep only
    # the last fold's metrics.
    in_sample_mse, out_sample_mse = [], []
    var_list = list(base_vars)

    for fold in range(num_folds):
        train_data = pd.read_csv(f"../data/regression/cross_validation/gdp_train_data_{fold}.csv")
        test_data = pd.read_csv(f"../data/regression/cross_validation/gdp_test_data_{fold}.csv")

        # Rebuild the covariate list per fold from the fold's own columns so the
        # function does not depend on a notebook-global DataFrame.
        var_list = list(base_vars)
        if incremental_effects is not None:
            for i in range(incremental_effects):
                suffix = f"incremental_effect_{i + 1}"
                var_list.extend(col for col in train_data.columns if col.endswith(suffix))

        # has_constant="add" forces the intercept column even when some covariate
        # happens to be constant within a fold; the default ("skip") would leave
        # it out and make train/test designs inconsistent.
        x = sm.add_constant(train_data[var_list], has_constant="add")
        x_test = sm.add_constant(test_data[var_list], has_constant="add")

        regression = sm.OLS(train_data[target_var], x).fit()
        in_sample_predictions = regression.predict()
        out_sample_predictions = regression.predict(x_test)

        in_sample_mse.append(np.mean(np.square(in_sample_predictions - train_data[target_var])))
        out_sample_mse.append(np.mean(np.square(out_sample_predictions - test_data[target_var])))

    results["covariate_string"].append(",".join(var_list))
    results["in_sample_mse"].append(np.mean(in_sample_mse))
    results["out_sample_mse"].append(np.mean(out_sample_mse))
    return results

In [24]:
def run_fe_regression_with_cv(results, num_folds, target_var, temp_var_list=None, precip_var_list=None, humidity_var_list=None, incremental_effects=2):
    """Fit a country fixed-effects regression and record in/out-of-sample MSE.

    Each iteration fits ``target_var ~ covariates | country`` with
    ``pyfixest.feols`` on the in-sample CSV and scores it on the in-sample and
    out-of-sample CSVs. The mean MSE over iterations is appended to ``results``.

    Parameters
    ----------
    results : dict
        Must contain the list-valued keys ``"covariate_string"``,
        ``"in_sample_mse"`` and ``"out_sample_mse"``; one entry is appended to
        each. The dict is mutated in place and also returned.
    num_folds : int
        Number of iterations. Every iteration currently reads the same pair of
        files (the per-fold cross-validation files are not used).
    target_var : str
        Name of the dependent-variable column.
    temp_var_list, precip_var_list, humidity_var_list : list of str or None
        Covariate column names; at least one must be non-empty.
    incremental_effects : int or None
        When given, every training column ending in
        ``incremental_effect_1`` .. ``incremental_effect_<n>`` is added as a
        covariate. ``None`` or ``0`` adds none.

    Returns
    -------
    dict
        The same ``results`` object.

    Raises
    ------
    AssertionError
        If no covariate list is supplied.
    """
    print(temp_var_list, precip_var_list, humidity_var_list, f"incremental_effects: {incremental_effects}")
    # The original ``any([...]) != None`` compared a bool to None and could never fail.
    assert any([temp_var_list, precip_var_list, humidity_var_list]), \
        "at least one of temp_var_list, precip_var_list, humidity_var_list is required"

    base_vars = [
        var
        for var_group in (temp_var_list, precip_var_list, humidity_var_list)
        if var_group is not None
        for var in var_group
    ]

    in_sample_mse, out_sample_mse = [], []

    for fold in range(num_folds):
        # The per-fold files (cross_validation/gdp_{train,test}_data_{fold}.csv)
        # are intentionally not used here; the fixed in/out-of-sample split is.
        train_data = pd.read_csv("../data/regression/gdp_regression_data_insample.csv")
        test_data = pd.read_csv("../data/regression/gdp_regression_data_outsample.csv")

        # Build a fresh covariate list for this iteration. Appending to a list
        # shared across iterations would duplicate the incremental-effect
        # columns in the formula from the second iteration on.
        fold_vars = list(base_vars)
        if incremental_effects is not None:
            for i in range(incremental_effects):
                suffix = f"incremental_effect_{i + 1}"
                fold_vars.extend(col for col in train_data.columns if col.endswith(suffix))

        covariate_string = " + ".join(fold_vars)
        regression = pf.feols(
            f"{target_var} ~ {covariate_string} | country",
            data=train_data
        )

        in_sample_predictions = regression.predict(train_data)
        in_sample_mse.append(np.mean(np.square(in_sample_predictions - train_data[target_var])))

        out_sample_predictions = regression.predict(test_data)
        out_sample_mse.append(np.mean(np.square(out_sample_predictions - test_data[target_var])))

    # Report the weather covariates plus one compact tag for the incremental
    # effects instead of listing every incremental-effect column.
    label_vars = [var for var in base_vars if "incremental_effect" not in var]
    if incremental_effects:  # skips both 0 and None (None used to yield "..._None")
        label_vars.append(f"incremental_effects_{incremental_effects}")
    results["covariate_string"].append(",".join(label_vars))
    results["in_sample_mse"].append(np.mean(in_sample_mse))
    results["out_sample_mse"].append(np.mean(out_sample_mse))
    return results

In [25]:
# baseline model test
# Column accumulators for the comparison table written at the end of this cell.
results = {"covariate_string":[],"out_sample_mse":[],"in_sample_mse":[]}
for group in model_groups:
    temp_vars, precip_vars, humidity_vars = [], [], []
    growing_lists = (
        ("temp_vars", temp_vars),
        ("precip_vars", precip_vars),
        ("humidity_vars", humidity_vars),
    )
    # Single-family models: grow each family one term at a time and refit after
    # every added term (temperature first, then precipitation, then humidity).
    for slot, (group_key, family_vars) in enumerate(growing_lists):
        for var in group[group_key]:
            family_vars.append(var)
            family_args = [None, None, None]
            family_args[slot] = family_vars
            results = run_fe_regression_with_cv(results, 1, "fd_ln_gdp", *family_args)
    # Combined models, each using the full three-term list of the families involved.
    for family_args in (
        (temp_vars, precip_vars, None),
        (None, precip_vars, humidity_vars),
        (temp_vars, None, humidity_vars),
        (temp_vars, precip_vars, humidity_vars),
    ):
        results = run_fe_regression_with_cv(results, 1, "fd_ln_gdp", *family_args)
# Persist the table ordered by out-of-sample MSE (ascending).
(
    pd.DataFrame.from_dict(results)
    .sort_values(["out_sample_mse"])
    .to_csv("../output/model_comparison_results/baseline_model_test_with_fe_ie2_no_cv_no_yfe.csv")
)

['temp_unweighted'] None None incremental_effects: 2
['temp_unweighted', 'temp_unweighted_2'] None None incremental_effects: 2
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3'] None None incremental_effects: 2
None ['precip_unweighted'] None incremental_effects: 2
None ['precip_unweighted', 'precip_unweighted_2'] None incremental_effects: 2
None ['precip_unweighted', 'precip_unweighted_2', 'precip_unweighted_3'] None incremental_effects: 2
None None ['humidity_unweighted'] incremental_effects: 2
None None ['humidity_unweighted', 'humidity_unweighted_2'] incremental_effects: 2
None None ['humidity_unweighted', 'humidity_unweighted_2', 'humidity_unweighted_3'] incremental_effects: 2
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3'] ['precip_unweighted', 'precip_unweighted_2', 'precip_unweighted_3'] None incremental_effects: 2
None ['precip_unweighted', 'precip_unweighted_2', 'precip_unweighted_3'] ['humidity_unweighted', 'humidity_unweighted_2', 'humidity_unweight

In [26]:
# Reload a saved comparison table and print the covariate strings of the rows
# ranked 2nd to 5th by out-of-sample MSE (iloc[1:5] skips the top-ranked row).
# NOTE(review): this path is not the file written by the baseline-test cell
# ("..._with_fe_ie2_no_cv_no_yfe.csv"); confirm this is the intended input.
baseline_model_results = pd.read_csv("../output/model_comparison_results/baseline_model_test_with_fe.csv")
print(
    baseline_model_results
    .sort_values(by="out_sample_mse")
    .iloc[1:5]["covariate_string"]
)

1                    fd_humidity_annual_std_unweighted
2                       humidity_annual_std_unweighted
3    humidity_annual_std_unweighted,humidity_annual...
4                                   fd_temp_unweighted
Name: covariate_string, dtype: object


In [16]:
# Display the current contents of the `results` dict (last expression of the cell).
results

{'covariate_string': ['temp_unweighted,incremental_effects_2',
  'temp_unweighted,temp_unweighted_2,incremental_effects_2',
  'temp_unweighted,temp_unweighted_2,temp_unweighted_3,incremental_effects_2'],
 'out_sample_mse': [0.02245547157262562,
  0.02243552628368742,
  0.022445592380040413],
 'in_sample_mse': [0.022007153916081833,
  0.022001057148123976,
  0.02200084261540005],
 'in_sample_pred_int_acc': [],
 'out_sample_pred_int_acc': []}