In [1]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
import pickle as pkl
import matplotlib.pyplot as plt
import random
import pyfixest as pf
import statsmodels.api as sm
import itertools as it
import warnings

# Suppress pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# NOTE(review): this blanket filter ignores *every* warning category, which
# makes the targeted filter above redundant and also hides deprecation and
# pandas copy warnings raised by the cells below -- confirm this is intended.
warnings.filterwarnings('ignore')

In [117]:
class RegressionResult:
    """Plain container for the variables and evaluation metrics of one model.

    A freshly constructed instance is "empty": it has no model variables and
    every metric is NaN. Callers fill in the attributes after fitting.
    """

    def __init__(self):
        # Names of the covariates used by the model; empty until populated.
        self.model_vars = []
        # Metrics default to NaN. ``np.nan`` is used instead of the ``np.NaN``
        # alias because that alias was removed in NumPy 2.0.
        self.out_sample_mse = np.nan
        self.in_sample_mse = np.nan
        self.out_sample_pred_int_acc = np.nan
        self.in_sample_pred_int_acc = np.nan

    def print_result(self):
        """Print the comma-joined variable names followed by each metric."""
        print(",".join(self.model_vars))
        print("out_sample_mse:", self.out_sample_mse)
        print("in_sample_mse:", self.in_sample_mse)
        print("out_sample_pred_int_acc:", self.out_sample_pred_int_acc)
        print("in_sample_pred_int_acc:", self.in_sample_pred_int_acc)

    def is_empty(self):
        """Return True when no model variables have been assigned yet."""
        return self.model_vars == []

In [3]:
def calculate_prediction_interval_accuracy(x, y, predictions, cov_mat, critical_value=1.9603795):
    """Return the share of observations that fall inside their interval.

    For every row ``x_i`` the standard error is ``sqrt(x_i' @ cov_mat @ x_i)``
    and the interval is ``prediction_i +/- critical_value * se_i``. The result
    is the mean of the 0/1 indicators "``y_i`` lies inside its interval"
    (bounds inclusive).

    NOTE(review): the standard error is built from ``cov_mat`` alone, so it
    contains no residual-variance term -- confirm that this is the interval
    the analysis intends to evaluate.

    Parameters
    ----------
    x : DataFrame or 2-D array-like
        Covariate rows; columns must be in the same order as ``cov_mat``.
    y : Series or 1-D array-like
        Observed target values, aligned with ``x`` by position.
    predictions : Series or 1-D array-like
        Predicted values, aligned with ``x`` by position.
    cov_mat : 2-D array-like
        Covariance matrix used in the quadratic form.
    critical_value : float, optional
        Multiplier applied to the standard error (default 1.9603795, the
        value previously hard-coded).

    Returns
    -------
    float
        Fraction of rows whose observed value is inside the interval.
    """
    # Convert everything to positional arrays. Indexing a pandas Series with
    # ``predictions[index]`` is label-based and breaks (or silently misaligns)
    # when the Series does not carry a default 0..n-1 index.
    design = np.asarray(x, dtype=float)
    observed = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(predictions, dtype=float).ravel()
    covariance = np.asarray(cov_mat, dtype=float)

    # Row-wise quadratic form x_i' V x_i for all rows at once.
    variance = np.einsum("ij,jk,ik->i", design, covariance, design)
    half_width = np.sqrt(variance) * critical_value

    inside = (observed >= predicted - half_width) & (observed <= predicted + half_width)
    return np.mean(inside)

In [121]:
def choose_best_model(model1, model2):
    """Pick between two results, keeping ``model1`` unless ``model2`` wins twice.

    ``model2`` is returned only when ``model1`` has a strictly larger
    ``out_sample_mse`` AND ``model1``'s ``out_sample_pred_int_acc`` is strictly
    further from .95 than ``model2``'s. In every other case (ties included)
    ``model1`` is returned.
    """
    # First criterion: model1 must have the strictly larger out-of-sample MSE.
    if not (model1.out_sample_mse > model2.out_sample_mse):
        return model1

    # Second criterion: model1's coverage must be strictly further from .95.
    coverage_gap_1 = abs(.95 - model1.out_sample_pred_int_acc)
    coverage_gap_2 = abs(.95 - model2.out_sample_pred_int_acc)
    if coverage_gap_1 > coverage_gap_2:
        return model2
    return model1

In [118]:
def run_fe_regression_with_cv(num_folds, target_var, model_vars, incremental_effects=0,
                              data_dir="../data/regression/cross_validation"):
    """Fit a country fixed-effects regression on each CV fold and average metrics.

    For every fold the in-sample and out-of-sample CSV files are read from
    ``data_dir``, ``target_var ~ covariates | country`` is fitted with
    ``pf.feols`` on the in-sample data, and mean squared error plus
    interval coverage (via ``calculate_prediction_interval_accuracy``) are
    computed on both files.

    Parameters
    ----------
    num_folds : int
        Number of folds; fold files are numbered ``0 .. num_folds - 1``.
    target_var : str
        Name of the dependent-variable column.
    model_vars : list of str
        Covariate column names. The list is not modified.
    incremental_effects : int or None, optional
        For each ``i`` in ``1 .. incremental_effects``, every training column
        whose name ends with ``incremental_effect_{i}`` is added as a
        covariate. ``None`` is treated like ``0``.
    data_dir : str, optional
        Directory holding the fold CSV files (defaults to the path that was
        previously hard-coded).

    Returns
    -------
    RegressionResult
        ``model_vars`` holds the non-incremental variable names (plus an
        ``incremental_effects_{n}`` label when ``n`` is non-zero); the metric
        attributes hold the means across folds.
    """
    print(model_vars)
    # Treat None like 0 so it neither adds columns nor produces a bogus
    # "incremental_effects_None" label in the result.
    n_incremental = incremental_effects or 0
    # Work on a copy: the caller's list must not grow as a side effect.
    base_vars = list(model_vars)
    in_sample_mse, out_sample_mse, in_sample_pred_int_acc, out_sample_pred_int_acc = [], [], [], []

    for fold in range(num_folds):
        train_data = pd.read_csv(f"{data_dir}/gdp_regression_data_insample_{fold}.csv")
        test_data = pd.read_csv(f"{data_dir}/gdp_regression_data_outsample_{fold}.csv")

        # Rebuild the covariate list for every fold so incremental-effect
        # columns are not appended again (duplicated) on each iteration.
        fold_vars = list(base_vars)
        for i in range(n_incremental):
            suffix = f"incremental_effect_{i + 1}"
            for incremental_col in train_data.columns:
                if incremental_col.endswith(suffix) and incremental_col not in fold_vars:
                    fold_vars.append(incremental_col)

        covariate_string = " + ".join(fold_vars)

        regression = pf.feols(
            f"{target_var} ~ {covariate_string} | country",
            data=train_data
        )

        cov_mat = regression._vcov
        # Keep only the variables that stayed in the regression (some may be
        # removed due to multicollinearity), ordered like the coefficient
        # names so the columns line up with the rows/columns of cov_mat.
        vars_in_model = [name for name in regression._coefnames if name in fold_vars]
        # Select fresh frames rather than dropping columns in place on a slice.
        train_data_covariates = train_data[vars_in_model]
        test_data_covariates = test_data[vars_in_model]

        in_sample_predictions = regression.predict(train_data)
        in_sample_mse.append(np.mean(np.square(in_sample_predictions - train_data[target_var])))
        in_sample_pred_int_acc.append(
            calculate_prediction_interval_accuracy(
                train_data_covariates, train_data[target_var], in_sample_predictions, cov_mat
            )
        )

        out_sample_predictions = regression.predict(test_data)
        out_sample_mse.append(np.mean(np.square(out_sample_predictions - test_data[target_var])))
        out_sample_pred_int_acc.append(
            calculate_prediction_interval_accuracy(
                test_data_covariates, test_data[target_var], out_sample_predictions, cov_mat
            )
        )

    # Report the plain variables only; incremental effects are summarised by
    # a single label instead of listing every generated column.
    result_vars = [var for var in base_vars if "incremental_effect" not in var]
    if n_incremental != 0:
        result_vars.append(f"incremental_effects_{str(n_incremental)}")

    reg_result = RegressionResult()
    reg_result.model_vars = result_vars
    reg_result.in_sample_mse = np.mean(in_sample_mse)
    reg_result.out_sample_mse = np.mean(out_sample_mse)
    reg_result.in_sample_pred_int_acc = np.mean(in_sample_pred_int_acc)
    reg_result.out_sample_pred_int_acc = np.mean(out_sample_pred_int_acc)
    return reg_result

In [119]:
# Candidate covariates grouped by climate variable. Each group lists the
# unweighted series followed by its "_2" and "_3" columns.
_POLYNOMIAL_SUFFIXES = ("", "_2", "_3")
model_variations = {
    f"{stem}_vars": [f"{stem}_unweighted{suffix}" for suffix in _POLYNOMIAL_SUFFIXES]
    for stem in ("temp", "precip", "humidity")
}

In [120]:
# Within each variable group, evaluate growing prefixes of the group's
# variables (first one, first two, ...) and keep whichever model
# choose_best_model prefers as the running baseline.
num_folds = 10
base_model = RegressionResult()
# `group_vars` is used instead of `vars` so the builtin is not shadowed.
for group_vars in model_variations.values():
    for n_terms in range(1, len(group_vars) + 1):
        # A fresh slice per call, so the callee never sees a shared list.
        candidate_vars = group_vars[:n_terms]
        new_model = run_fe_regression_with_cv(num_folds, "fd_ln_gdp", candidate_vars)
        if base_model.is_empty():
            base_model = new_model
        else:
            base_model = choose_best_model(base_model, new_model)
base_model.print_result()

['temp_unweighted']
['temp_unweighted', 'temp_unweighted_2']
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3']
['precip_unweighted']
['precip_unweighted', 'precip_unweighted_2']
['precip_unweighted', 'precip_unweighted_2', 'precip_unweighted_3']
['humidity_unweighted']
['humidity_unweighted', 'humidity_unweighted_2']
['humidity_unweighted', 'humidity_unweighted_2', 'humidity_unweighted_3']
temp_unweighted,temp_unweighted_2,temp_unweighted_3
out_sample_mse: 0.027243456137445714
in_sample_mse: 0.027152303127506487
out_sample_pred_int_acc: 0.6850059031877213
in_sample_pred_int_acc: 0.8278783110411749


In [132]:
# Try adding each remaining variable on its own to the baseline model and
# remember those for which choose_best_model prefers the extended model.
model_vars_to_add = []
# `group_vars` is used instead of `vars` so the builtin is not shadowed.
for group_vars in model_variations.values():
    for candidate_var in group_vars:
        if candidate_var in base_model.model_vars:
            continue
        # Concatenation builds a new list; base_model.model_vars is untouched.
        new_model_vars = base_model.model_vars + [candidate_var]
        new_model = run_fe_regression_with_cv(num_folds, "fd_ln_gdp", new_model_vars)
        # Identity check: choose_best_model returns one of its two arguments.
        if choose_best_model(base_model, new_model) is new_model:
            model_vars_to_add.append(candidate_var)
print(model_vars_to_add)

['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'precip_unweighted']
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'precip_unweighted_2']
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'precip_unweighted_3']
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'humidity_unweighted']
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'humidity_unweighted_2']
['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'humidity_unweighted_3']
['humidity_unweighted', 'humidity_unweighted_2', 'humidity_unweighted_3']


In [135]:
# Fit one model containing the baseline variables plus every variable that
# was selected individually, and compare it against the baseline.
second_round_model_vars = list(base_model.model_vars) + list(model_vars_to_add)
new_model = run_fe_regression_with_cv(num_folds, "fd_ln_gdp", second_round_model_vars)
# `compare_model_result` is not defined in this notebook (NameError on a
# fresh kernel); choose_best_model is the comparison function defined above.
print("New model better?", choose_best_model(base_model, new_model) is new_model)

['temp_unweighted', 'temp_unweighted_2', 'temp_unweighted_3', 'humidity_unweighted', 'humidity_unweighted_2', 'humidity_unweighted_3']
New model better? False


In [136]:
# Print the variables and metrics of the current baseline model.
base_model.print_result()

temp_unweighted,temp_unweighted_2,temp_unweighted_3
out_sample_mse: 0.027243456137445714
in_sample_mse: 0.027152303127506487
out_sample_pred_int_acc: 0.6850059031877213
in_sample_pred_int_acc: 0.8278783110411749


In [137]:
# Print the variables and metrics of the most recently fitted `new_model`.
new_model.print_result()

temp_unweighted,temp_unweighted_2,temp_unweighted_3,humidity_unweighted,humidity_unweighted_2,humidity_unweighted_3
out_sample_mse: 0.027276446160989187
in_sample_mse: 0.027135880381679533
out_sample_pred_int_acc: 0.9948051948051949
in_sample_pred_int_acc: 0.9847102019407291
