In [154]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
import pickle as pkl
import matplotlib.pyplot as plt
import random
import pyfixest as pf
import statsmodels.api as sm
import itertools as it
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore')

In [184]:
regression_data_full = pd.read_csv("../data/regression/gdp_regression_data.csv").dropna().reset_index()
regression_data_insample = pd.read_csv("../data/regression/gdp_regression_data_insample.csv")
regression_data_outsample = pd.read_csv("../data/regression/gdp_regression_data_outsample.csv")

In [145]:
model_groups = [
    {
        "temp_vars":["temp_unweighted","temp_unweighted_2","temp_unweighted_3"],
        "precip_vars":["precip_unweighted","precip_unweighted_2","precip_unweighted_3"],
        "humidity_vars":["humidity_unweighted","humidity_unweighted_2","humidity_unweighted_3"]
    },
    # {
    #     "temp_vars":["temp_daily_std_unweighted","temp_daily_std_unweighted_2","temp_daily_std_unweighted_3"],
    #     "precip_vars":["precip_daily_std_unweighted","precip_daily_std_unweighted_2","precip_daily_std_unweighted_3"],
    #     "humidity_vars":["humidity_daily_std_unweighted","humidity_daily_std_unweighted_2","humidity_daily_std_unweighted_3"]
    # },
    # {
    #     "temp_vars":["temp_annual_std_unweighted","temp_annual_std_unweighted_2","temp_annual_std_unweighted_3"],
    #     "precip_vars":["precip_annual_std_unweighted","precip_annual_std_unweighted_2","precip_annual_std_unweighted_3"],
    #     "humidity_vars":["humidity_annual_std_unweighted","humidity_annual_std_unweighted_2","humidity_annual_std_unweighted_3"]
    # },
    {
        "temp_vars":["fd_temp_unweighted","fd_temp_unweighted_2","fd_temp_unweighted_3"],
        "precip_vars":["fd_precip_unweighted","fd_precip_unweighted_2","fd_precip_unweighted_3"],
        "humidity_vars":["fd_humidity_unweighted","fd_humidity_unweighted_2","fd_humidity_unweighted_3"]
    },
    # {
    #     "temp_vars":["fd_temp_daily_std_unweighted","fd_temp_daily_std_unweighted_2","fd_temp_daily_std_unweighted_3"],
    #     "precip_vars":["fd_precip_daily_std_unweighted","fd_precip_daily_std_unweighted_2","fd_precip_daily_std_unweighted_3"],
    #     "humidity_vars":["fd_humidity_daily_std_unweighted","fd_humidity_daily_std_unweighted_2","fd_humidity_daily_std_unweighted_3"]
    # },
    # {
    #     "temp_vars":["fd_temp_annual_std_unweighted","fd_temp_annual_std_unweighted_2","fd_temp_annual_std_unweighted_3"],
    #     "precip_vars":["fd_precip_annual_std_unweighted","fd_precip_annual_std_unweighted_2","fd_precip_annual_std_unweighted_3"],
    #     "humidity_vars":["fd_humidity_annual_std_unweighted","fd_humidity_annual_std_unweighted_2","fd_humidity_annual_std_unweighted_3"]
    # }
]

In [87]:
def run_regression(results, in_sample_data, out_sample_data, target_var, temp_var_list=None, precip_var_list=None, humidity_var_list=None, fe_string = "country + year"):
    assert any([temp_var_list, precip_var_list, humidity_var_list]) != None
    var_list = []
    if temp_var_list != None:
        for var in temp_var_list: var_list.append(var)
    if precip_var_list != None:
        for var in precip_var_list: var_list.append(var)
    if humidity_var_list != None:
        for var in humidity_var_list: var_list.append(var)
    var_string = " + ".join(var_list)
    for incremental_effects in [0,1,2,3]:
        for i in range(incremental_effects):
            for incremental_col in [col for col in regression_data_insample.columns if col.endswith(f"incremental_effect_{str(i+1)}")]:
                var_list.append(incremental_col)
            var_string += f"*incremental_effects_{str(i+i)}"
        covariate_string = " + ".join(var_list)
        regression = pf.feols(
            f"{target_var} ~ {covariate_string} | {fe_string}", 
            data=in_sample_data
        )
        # TODO: MSE should be a reduction from an intercept-only model
        in_sample_mse = np.mean(np.square(regression.predict()-in_sample_data[target_var]))
        out_sample_mse = np.mean(np.square(regression.predict(out_sample_data)-out_sample_data[target_var]))
        results["covariate_string"].append(covariate_string)
        results["in_sample_mse"].append(in_sample_mse)
        results["out_sample_mse"].append(out_sample_mse)
    return results

In [4]:
results = {"covariate_string":[],"out_sample_mse":[],"in_sample_mse":[]}
for group in model_groups:
    temp_vars, precip_vars, humidity_vars = [], [], []
    for var in group["temp_vars"]:
        temp_vars.append(var)
        results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", temp_vars)
    for var in group["precip_vars"]:
        precip_vars.append(var)
        results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", None, precip_vars)
    for var in group["humidity_vars"]:
        humidity_vars.append(var)
        results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", None, None, humidity_vars)
    results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", temp_vars, precip_vars)
    results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", None, precip_vars, humidity_vars)
    results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", temp_vars, None, humidity_vars)
    results = run_regression(results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", temp_vars, precip_vars, humidity_vars)
pd.DataFrame.from_dict(results).sort_values(["out_sample_mse","in_sample_mse"]).to_csv("test.csv")

In [48]:
headers = []
for var in var_list:
    headers.append(var)
headers.append("In-sample MSE")
with open("test_out.csv", "w") as file_output:
    writer = csv.writer(file_output)
    writer.writerow(headers)
    for permutation in permutation_list:
        vars = " + ".join([var for index, var in enumerate(var_list) if permutation[index] == 1])
        regression = pf.feols(f"fd_log_tfp ~ {vars} | country + year", data=data)
        yhat = regression.predict()
        error = np.mean(np.square(yhat-data.fd_log_tfp))
        res_row = []
        for i in permutation:
            res_row.append(i)
        res_row.append(error)
        writer.writerow(res_row)

In [27]:
regression.tidy()

Unnamed: 0_level_0,Estimate,Std. Error,t value,Pr(>|t|),2.5%,97.5%
Coefficient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
temp_unweighted,0.014584,0.006973,2.091417,0.038022,0.000816,0.028352
temp_unweighted_2,-0.000984,0.00064,-1.538625,0.125811,-0.002247,0.000279
temp_unweighted_3,2.1e-05,1.6e-05,1.354382,0.177466,-1e-05,5.3e-05


In [38]:
regression.summary()

###

Estimation:  OLS
Dep. var.: fd_log_tfp, Fixed effects: ISO3+year
Inference:  CRV1
Observations:  9255

| Coefficient   |   Estimate |   Std. Error |   t value |   Pr(>|t|) |   2.5% |   97.5% |
|:--------------|-----------:|-------------:|----------:|-----------:|-------:|--------:|
| fd_tmean      |     -0.006 |        0.009 |    -0.705 |      0.482 | -0.025 |   0.012 |
| fd_tmean_sq   |     -0.000 |        0.000 |    -0.211 |      0.834 | -0.000 |   0.000 |
| fd_prcp       |      0.000 |        0.000 |     3.181 |      0.002 |  0.000 |   0.000 |
| fd_prcp_sq    |     -0.000 |        0.000 |    -3.109 |      0.002 | -0.000 |  -0.000 |
---
RMSE: 0.082 R2: 0.04 R2 Within: 0.01 


# Cross-validation

In [124]:
enc = OrdinalEncoder()
ordered_country_list = list(dict.fromkeys(regression_data_full.country))
enc.fit(np.array(ordered_country_list).reshape(-1,1))
regression_data_full["encoded_country"] = [int(val) for val in enc.transform(np.array(regression_data_full.country).reshape(-1,1))]
columns_to_center = [
        "temp_unweighted",
        "temp_unweighted_2",
        "temp_unweighted_3",
        "fd_temp_unweighted",
        "fd_temp_unweighted_2",
        "fd_temp_unweighted_3",
        "precip_unweighted",
        "precip_unweighted_2",
        "precip_unweighted_3",
        "fd_precip_unweighted",
        "fd_precip_unweighted_2",
        "fd_precip_unweighted_3",
        "humidity_unweighted",
        "humidity_unweighted_2",
        "humidity_unweighted_3",
        "fd_humidity_unweighted",
        "fd_humidity_unweighted_2",
        "fd_humidity_unweighted_3"
    ]
centered_data = pf.estimation.demean(
    np.array(regression_data_full[columns_to_center]), 
    np.array(regression_data_full[["encoded_country","year"]]), 
    np.ones(len(regression_data_full))
)[0]

  regression_data_full["encoded_country"] = [int(val) for val in enc.transform(np.array(regression_data_full.country).reshape(-1,1))]


In [143]:
cv_folds = 10
sampled_years = np.array(random.sample(set(regression_data_full.year), k=len(set(regression_data_full.year))))
year_cut = OrdinalEncoder().fit_transform(np.array(list(pd.cut(range(1,63), bins=cv_folds))).reshape(-1,1)).flatten()
for fold in range(cv_folds):
    withheld_years = []
    for index, cut in enumerate(year_cut):
        if cut == fold:
            withheld_years.append(sampled_years[index])
    withheld_rows = regression_data_full.loc[(regression_data_full.year.isin(withheld_years))]
    training_rows = regression_data_full.loc[(regression_data_full.year.isin(withheld_years) == False)]
    training_data, withheld_data = [], []
    for index, row in enumerate(centered_data):
        if index in withheld_rows.index:
            withheld_data.append(row)
        else:
            training_data.append(row)
    training_data = np.array(training_data)
    withheld_data = np.array(withheld_data)
    training_rows = training_rows.reset_index()
    withheld_rows = withheld_rows.reset_index()
    withheld_data_dict, training_data_dict = {}, {}
    for index, column in enumerate(columns_to_center):
        withheld_data_dict[column] = withheld_data[:,index]
        training_data_dict[column] = training_data[:,index]
    training_data = pd.concat([pd.DataFrame.from_dict(training_data_dict), training_rows[["fd_ln_gdp","country","year"]], training_rows.loc[:, training_rows.columns.str.contains(('_incremental_effect'))]], axis=1)
    withheld_data = pd.concat([pd.DataFrame.from_dict(withheld_data_dict), withheld_rows[["fd_ln_gdp","country","year"]], withheld_rows.loc[:, withheld_rows.columns.str.contains(('_incremental_effect'))]], axis=1)
    training_data.to_csv(f"../data/regression/cross_validation/gdp_regression_data_train_cv_{fold}.csv")
    withheld_data.to_csv(f"../data/regression/cross_validation/gdp_regression_data_test_cv_{fold}.csv")

since Python 3.9 and will be removed in a subsequent version.
  sampled_years = np.array(random.sample(set(regression_data_full.year), k=len(set(regression_data_full.year))))


62


In [258]:
def calculate_prediction_interval_accuracy(x, y, predictions, cov_mat):
    results = []
    for index, row in enumerate(x.itertuples()):
        x_data = list(row[1:])
        y_real = y.iloc[index]
        se_pred = np.sqrt(np.linalg.multi_dot([x_data, cov_mat, np.transpose(x_data)]))
        prediction_interval = (predictions[index]-se_pred*1.9603795, predictions[index]+se_pred*1.9603795)
        if y_real >= prediction_interval[0] and y_real <= prediction_interval[1]:
            results.append(1)
        else:
            results.append(0)
    return np.mean(results)

In [263]:
def run_regression_with_cv(results, num_folds, target_var, temp_var_list=None, precip_var_list=None, humidity_var_list=None, fe_string = "0"):
    print(temp_var_list, precip_var_list, humidity_var_list)
    assert any([temp_var_list, precip_var_list, humidity_var_list]) != None
    # for incremental_effects in [0,1,2,3]:
    var_list = []
    if temp_var_list != None:
        for var in temp_var_list: var_list.append(var)
    if precip_var_list != None:
        for var in precip_var_list: var_list.append(var)
    if humidity_var_list != None:
        for var in humidity_var_list: var_list.append(var)
    for fold in range(num_folds):
        in_sample_mse, out_sample_mse, in_sample_pred_int_acc, out_sample_pred_int_acc = [], [], [], []
        train_data = pd.read_csv(f"../data/regression/cross_validation/gdp_regression_data_train_cv_{fold}.csv")
        test_data = pd.read_csv(f"../data/regression/cross_validation/gdp_regression_data_test_cv_{fold}.csv")
        x = train_data[var_list]
        x_test = test_data[var_list]
        x = sm.add_constant(x)
        x_test = sm.add_constant(x_test)
        model = sm.OLS(train_data[target_var],x)
        regression = model.fit()
        in_sample_predictions = regression.predict()
        out_sample_predictions = regression.predict(x_test)
        # calculate mse
        # TODO: MSE should be a reduction from an intercept-only model
        in_sample_mse.append(np.mean(np.square(in_sample_predictions-train_data[target_var])))
        out_sample_mse.append(np.mean(np.square(out_sample_predictions-test_data[target_var])))
        # calculate prediction interval accuracy
        in_sample_pred_int_acc.append(calculate_prediction_interval_accuracy(x, train_data[target_var], in_sample_predictions, regression.cov_params()))
        out_sample_pred_int_acc.append(calculate_prediction_interval_accuracy(x_test, test_data[target_var],out_sample_predictions, regression.cov_params()))
    results["covariate_string"].append(",".join(var_list))
    results["in_sample_mse"].append(np.mean(in_sample_mse))
    results["out_sample_mse"].append(np.mean(out_sample_mse))
    results["in_sample_pred_int_acc"].append(np.mean(in_sample_pred_int_acc))
    results["out_sample_pred_int_acc"].append(np.mean(out_sample_pred_int_acc))
    return results

In [264]:
results = {"covariate_string":[],"out_sample_mse":[],"in_sample_mse":[],"in_sample_pred_int_acc":[],"out_sample_pred_int_acc":[]}
results = run_regression_with_cv(
    results, 
    10, 
    "fd_ln_gdp", 
    ["fd_temp_unweighted","fd_temp_unweighted_2","fd_temp_unweighted_3"],
    ["fd_precip_unweighted","fd_precip_unweighted_2","fd_precip_unweighted_3"],
    ["fd_humidity_unweighted","fd_humidity_unweighted_2","fd_humidity_unweighted_3"]
)
results = run_regression_with_cv(
    results, 
    10, 
    "fd_ln_gdp", 
    ["temp_unweighted","temp_unweighted_2"]
)
pd.DataFrame.from_dict(results).sort_values(["out_sample_mse","in_sample_mse"]).to_csv("test_cv_pred_int.csv")

['fd_temp_unweighted', 'fd_temp_unweighted_2', 'fd_temp_unweighted_3'] ['fd_precip_unweighted', 'fd_precip_unweighted_2', 'fd_precip_unweighted_3'] ['fd_humidity_unweighted', 'fd_humidity_unweighted_2', 'fd_humidity_unweighted_3']
['temp_unweighted', 'temp_unweighted_2'] None None


In [168]:
train_data = pd.read_csv(f"../data/regression/cross_validation/gdp_regression_data_train_cv_0.csv")
regression = pf.feols(
                "fd_ln_gdp ~ temp_unweighted + temp_unweighted_2 | 0", 
                data=train_data
            )

In [180]:
dir(reg

AttributeError: 'Feols' object has no attribute 'cov'

In [226]:
train_data = pd.read_csv(f"../data/regression/cross_validation/gdp_regression_data_train_cv_0.csv")
x = np.array(train_data[["temp_unweighted","temp_unweighted_2"]])
x = sm.add_constant(x)
y = np.array(train_data.fd_ln_gdp)
model = sm.OLS(y,x)
results = model.fit()
yhat = results.predict()
prediction_intervals = []
for index, obs in enumerate(x):
    se_pred = np.linalg.multi_dot([obs, results.cov_params(), np.transpose(obs)])
    prediction_interval = (yhat[index] - se_pred * 1.9603795, yhat[index] + se_pred * 1.9603795)
    prediction_intervals.append(prediction_interval)

In [219]:
results.cov_params()

array([[1.95883511e-05]])

In [137]:
# predict with fixed effects
yhat = regression.predict()
error = np.square(yhat-data.fd_log_tfp)
np.mean(error)

0.006735429878711787

In [140]:
# predict without fixed effects
yhat_ = results.predict()
error = np.square(yhat_-data.fd_log_tfp)
np.mean(error)

0.006980271648325952