In [59]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
import pickle as pkl
import matplotlib.pyplot as plt
import random
import pyfixest as pf
import statsmodels.api as sm
import itertools as it
import warnings

# Suppress pandas' PerformanceWarning for every later cell in this kernel session.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [83]:
# Load the three regression tables (full sample, in-sample split, out-of-sample
# split) from CSV, in that order, and bind each to its own name.
regression_data_full, regression_data_insample, regression_data_outsample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/regression/gdp_regression_data.csv",
        "../data/regression/gdp_regression_data_insample.csv",
        "../data/regression/gdp_regression_data_outsample.csv",
    )
)

In [88]:
# Candidate covariate families for the model sweep.
#
# Each entry maps "temp_vars" / "precip_vars" / "humidity_vars" to three column
# names: the base column plus its "_2" and "_3" variants (presumably squared and
# cubed terms -- confirm against the data-preparation step).
# Column names follow the pattern
#     {transform}{climate_var}{statistic}_unweighted{power}
# and the groups are ordered: levels, daily std, annual std, then the same three
# statistics again with the "fd_" prefix.
model_groups = [
    {
        f"{climate_var}_vars": [
            f"{transform}{climate_var}{statistic}_unweighted{power}"
            for power in ("", "_2", "_3")
        ]
        for climate_var in ("temp", "precip", "humidity")
    }
    for transform in ("", "fd_")
    for statistic in ("", "_daily_std", "_annual_std")
]

In [87]:
def run_regression(results, in_sample_data, out_sample_data, target_var, temp_var_list=None, precip_var_list=None, humidity_var_list=None, fe_string = "country + year", max_incremental_effects = 3):
    """Fit a ladder of fixed-effects regressions and record their MSEs.

    The base covariates are the concatenation of the temperature,
    precipitation and humidity lists (in that order, skipping any that are
    None).  One model is fitted per level ``k = 0 .. max_incremental_effects``:
    level 0 uses only the base covariates, and level ``k`` additionally
    includes every column of ``in_sample_data`` whose name ends with
    ``incremental_effect_1`` ... ``incremental_effect_k``.

    Parameters
    ----------
    results : dict
        Accumulator with list values under the keys ``"covariate_string"``,
        ``"in_sample_mse"`` and ``"out_sample_mse"``.  It is appended to in
        place and also returned.
    in_sample_data : pandas.DataFrame
        Data the models are estimated on; also the source of the
        ``*incremental_effect_k`` column names.
    out_sample_data : pandas.DataFrame
        Data passed to ``regression.predict`` for the out-of-sample error.
    target_var : str
        Name of the dependent-variable column.
    temp_var_list, precip_var_list, humidity_var_list : list of str or None
        Covariate column names; at least one name must be supplied overall.
    fe_string : str
        Fixed-effects part of the formula (text after the ``|``).
    max_incremental_effects : int
        Highest incremental-effect order to add (default 3, giving 4 models).

    Returns
    -------
    dict
        The same ``results`` object, with one new entry per fitted model.

    Raises
    ------
    ValueError
        If no covariate name is supplied in any of the three lists.
    """
    base_vars = [
        var
        for var_group in (temp_var_list, precip_var_list, humidity_var_list)
        if var_group is not None
        for var in var_group
    ]
    # An empty right-hand side would yield a malformed formula, so fail early
    # with a clear message instead.
    if not base_vars:
        raise ValueError(
            "run_regression needs at least one covariate in temp_var_list, "
            "precip_var_list or humidity_var_list"
        )

    # `covariates` grows by exactly one incremental-effect order per level, so
    # each column is added once and no formula contains duplicate regressors.
    covariates = list(base_vars)
    for level in range(max_incremental_effects + 1):
        if level > 0:
            suffix = f"incremental_effect_{level}"
            # Use the frame that was passed in, not a notebook-level global.
            covariates.extend(col for col in in_sample_data.columns if col.endswith(suffix))
        covariate_string = " + ".join(covariates)
        regression = pf.feols(
            f"{target_var} ~ {covariate_string} | {fe_string}",
            data=in_sample_data
        )
        in_sample_mse = np.mean(np.square(regression.predict()-in_sample_data[target_var]))
        out_sample_mse = np.mean(np.square(regression.predict(out_sample_data)-out_sample_data[target_var]))
        results["covariate_string"].append(covariate_string)
        results["in_sample_mse"].append(in_sample_mse)
        results["out_sample_mse"].append(out_sample_mse)
    return results

In [89]:
# Model sweep: for every covariate family in `model_groups`, fit
#   1. each climate variable on its own, adding one term at a time, then
#   2. the pairwise and the full three-way combinations of the complete term lists,
# and finally write all recorded MSEs, best out-of-sample first, to "test.csv".
results = dict(covariate_string=[], out_sample_mse=[], in_sample_mse=[])
for group in model_groups:
    temp_vars, precip_vars, humidity_vars = [], [], []

    # Single-variable models: position `slot` selects which of the three
    # covariate arguments of run_regression receives the growing list.
    single_families = (
        (temp_vars, "temp_vars"),
        (precip_vars, "precip_vars"),
        (humidity_vars, "humidity_vars"),
    )
    for slot, (growing_terms, group_key) in enumerate(single_families):
        for term in group[group_key]:
            growing_terms.append(term)
            family_args = [None, None, None]
            family_args[slot] = growing_terms
            results = run_regression(
                results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", *family_args
            )

    # Combined models, using the complete term lists built above.
    combined_families = (
        (temp_vars, precip_vars, None),
        (None, precip_vars, humidity_vars),
        (temp_vars, None, humidity_vars),
        (temp_vars, precip_vars, humidity_vars),
    )
    for family_args in combined_families:
        results = run_regression(
            results, regression_data_insample, regression_data_outsample, "fd_ln_gdp", *family_args
        )

results_table = pd.DataFrame.from_dict(results)
results_table.sort_values(["out_sample_mse","in_sample_mse"]).to_csv("test.csv")

            The following variables are collinear: ['ABW_incremental_effect_1', 'AND_incremental_effect_1', 'ASM_incremental_effect_1', 'ATG_incremental_effect_1', 'BMU_incremental_effect_1', 'COM_incremental_effect_1', 'CPV_incremental_effect_1', 'CYM_incremental_effect_1', 'DMA_incremental_effect_1', 'FJI_incremental_effect_1', 'FRO_incremental_effect_1', 'FSM_incremental_effect_1', 'GIB_incremental_effect_1', 'GRD_incremental_effect_1', 'GRL_incremental_effect_1', 'GUM_incremental_effect_1', 'ISL_incremental_effect_1', 'KIR_incremental_effect_1', 'LCA_incremental_effect_1', 'LIE_incremental_effect_1', 'MCO_incremental_effect_1', 'MDV_incremental_effect_1', 'MHL_incremental_effect_1', 'MNP_incremental_effect_1', 'MUS_incremental_effect_1', 'NCL_incremental_effect_1', 'NRU_incremental_effect_1', 'PLW_incremental_effect_1', 'PRK_incremental_effect_1', 'PYF_incremental_effect_1', 'SMR_incremental_effect_1', 'STP_incremental_effect_1', 'SYC_incremental_effect_1', 'TCA_incremental_effect_

In [48]:
# Exhaustive covariate search: fit one fixed-effects regression per on/off
# permutation of the candidate covariates and log its in-sample MSE to
# "test_out.csv" (one 0/1 column per candidate, plus the error).
# NOTE(review): `var_list`, `permutation_list` and `data` are not defined in any
# visible cell -- presumably they come from earlier kernel state; confirm
# before relying on Restart & Run All.
headers = list(var_list) + ["In-sample MSE"]
# newline="" is required by the csv module; without it the writer emits an
# extra blank line after every row on platforms that translate "\n" to "\r\n".
with open("test_out.csv", "w", newline="") as file_output:
    writer = csv.writer(file_output)
    writer.writerow(headers)
    for permutation in permutation_list:
        # Keep only the covariates switched on (== 1) in this permutation.
        # Named `selected_vars` so the builtin `vars` is not shadowed.
        selected_vars = " + ".join(
            var for index, var in enumerate(var_list) if permutation[index] == 1
        )
        regression = pf.feols(f"fd_log_tfp ~ {selected_vars} | country + year", data=data)
        yhat = regression.predict()
        error = np.mean(np.square(yhat-data.fd_log_tfp))
        writer.writerow([*permutation, error])

In [27]:
# Display the coefficient table (estimate, std. error, t value, p-value,
# confidence bounds) of whatever model is currently bound to `regression`.
regression.tidy()

Unnamed: 0_level_0,Estimate,Std. Error,t value,Pr(>|t|),2.5%,97.5%
Coefficient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
temp_unweighted,0.014584,0.006973,2.091417,0.038022,0.000816,0.028352
temp_unweighted_2,-0.000984,0.00064,-1.538625,0.125811,-0.002247,0.000279
temp_unweighted_3,2.1e-05,1.6e-05,1.354382,0.177466,-1e-05,5.3e-05


In [38]:
# Print the estimation summary (dependent variable, fixed effects, inference
# type, coefficient table, fit statistics) of the model bound to `regression`.
regression.summary()

###

Estimation:  OLS
Dep. var.: fd_log_tfp, Fixed effects: ISO3+year
Inference:  CRV1
Observations:  9255

| Coefficient   |   Estimate |   Std. Error |   t value |   Pr(>|t|) |   2.5% |   97.5% |
|:--------------|-----------:|-------------:|----------:|-----------:|-------:|--------:|
| fd_tmean      |     -0.006 |        0.009 |    -0.705 |      0.482 | -0.025 |   0.012 |
| fd_tmean_sq   |     -0.000 |        0.000 |    -0.211 |      0.834 | -0.000 |   0.000 |
| fd_prcp       |      0.000 |        0.000 |     3.181 |      0.002 |  0.000 |   0.000 |
| fd_prcp_sq    |     -0.000 |        0.000 |    -3.109 |      0.002 | -0.000 |  -0.000 |
---
RMSE: 0.082 R2: 0.04 R2 Within: 0.01 


In [111]:
# Demean the four regressors with pyfixest's `demean` helper, passing the
# (encoded country id, year) columns as the grouping variables and unit
# weights for every row.
# NOTE(review): presumably this removes the country and year fixed effects from
# the regressors; the result is indexable and element 0 is used as the
# regressor matrix in the next cell -- confirm the meaning of the remaining elements.
centered_data = pf.estimation.demean(
    np.array(data[["fd_tmean", "fd_tmean_sq", "fd_prcp", "fd_prcp_sq"]]), 
    np.array(data[["encoded_iso_id","year"]]), 
    np.ones(len(data))
)

In [118]:
# Plain OLS (statsmodels, no intercept added) of the target on the demeaned
# regressors, to compare its coefficients with the pyfixest estimates.
x = centered_data[0]
# The target is used as-is here; only the regressors went through `demean`.
y = np.array(data.fd_log_tfp)
model = sm.OLS(y,x)
# NOTE(review): this rebinds `results`, which the model-sweep cell uses for its
# dict of MSE lists -- re-running cells out of order will mix the two up.
results = model.fit()
print(results.params)

[-6.47599918e-03 -4.75268083e-05  1.81684996e-04 -1.81898741e-07]


In [137]:
# In-sample MSE of the pyfixest model: predict with fixed effects, i.e. the
# fitted values of `regression` on its estimation data versus the observed target.
yhat = regression.predict()
error = np.square(yhat-data.fd_log_tfp)
np.mean(error)

0.006735429878711787

In [140]:
# In-sample MSE without fixed effects: fitted values of the statsmodels OLS on
# the demeaned regressors, compared against the same raw target as above.
yhat_ = results.predict()
error = np.square(yhat_-data.fd_log_tfp)
np.mean(error)

0.006980271648325952