In [1]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
import pickle as pkl
import matplotlib.pyplot as plt
import random
import pyfixest as pf
import statsmodels.api as sm

In [2]:
# Location of the regression input table, relative to the notebook's working directory.
REGDATA_CSV = "../data/ortiz-bobea/data2/regdata_preferred_case_encoded_iso_id.csv"
# Load the table into the `data` frame that every later cell reads from.
data = pd.read_csv(REGDATA_CSV)

In [4]:
# Display the column index of the loaded frame to see which variables are available.
data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'ISO3', 'year', 'FAO.N', 'WDI.Code',
       'Country', 'Region', 'Sub.Region', 'tfp', 'output', 'log_tfp',
       'log_output', 'fd_log_tfp', 'fd_log_output', 'cum_log_fd_log_tfp',
       'cum_log_fd_log_output', 'FAO', 'FAOregion', 'prcp', 'tmin', 'tmean',
       'tmax', 'tmin_sq', 'tmin_cu', 'tmean_sq', 'tmean_cu', 'tmax_sq',
       'tmax_cu', 'prcp_sq', 'prcp_cu', 'fd_prcp', 'fd_prcp_cu', 'fd_prcp_sq',
       'fd_tmax', 'fd_tmax_cu', 'fd_tmax_sq', 'fd_tmean', 'fd_tmean_cu',
       'fd_tmean_sq', 'fd_tmin', 'fd_tmin_cu', 'fd_tmin_sq', 'block', 'mean',
       'weights', 'encoded_iso_id'],
      dtype='object')

In [47]:
# Candidate regressors. Their order here defines the meaning of each position in the
# 0/1 masks below and the column order of the results CSV.
var_list = [
    "tmean", "tmean_sq", "tmean_cu",
    "fd_tmean", "fd_tmean_sq", "fd_tmean_cu",
    "prcp", "prcp_sq", "prcp_cu",
    "fd_prcp", "fd_prcp_sq", "fd_prcp_cu",
]

# One inclusion mask per specification: a 1 at position i keeps var_list[i] in the model.
permutation_list = [
    # All regressors. dtype=int keeps the flags as 1 rather than 1.0, so this row is
    # written to the CSV in the same 0/1 format as the hand-written masks.
    np.ones(len(var_list), dtype=int),
    [0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],  # fd_tmean, fd_tmean_sq, fd_prcp, fd_prcp_sq
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # tmean only
    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # fd_tmean only
]


In [48]:
# For every inclusion mask in `permutation_list`, fit a regression of fd_log_tfp on the
# selected regressors with ISO3 and year fixed effects, then write one CSV row holding
# the 0/1 mask followed by the in-sample mean squared error.
headers = [*var_list, "In-sample MSE"]

# newline="" hands line-ending control to the csv module; without it every row is
# followed by a blank line on Windows.
with open("test_out.csv", "w", newline="") as file_output:
    writer = csv.writer(file_output)
    writer.writerow(headers)
    for permutation in permutation_list:
        # A mask of the wrong length would either raise mid-run or silently write a
        # row that does not line up with the header, so reject it up front.
        if len(permutation) != len(var_list):
            raise ValueError(
                f"permutation has {len(permutation)} entries, expected {len(var_list)}"
            )
        selected_vars = [
            var for index, var in enumerate(var_list) if permutation[index] == 1
        ]
        rhs = " + ".join(selected_vars)
        regression = pf.feols(f"fd_log_tfp ~ {rhs} | ISO3 + year", data=data)
        yhat = regression.predict()
        # NOTE(review): assumes predict() returns one fitted value per row of `data`, in
        # row order; this would misalign if the estimator dropped rows - confirm.
        error = np.mean(np.square(yhat - data.fd_log_tfp))
        # int() writes each flag as 0/1 even when the mask holds floats.
        writer.writerow([*(int(flag) for flag in permutation), error])

In [37]:
# Fit the quadratic first-difference specification explicitly instead of reusing whatever
# model the permutation loop fitted last (its final mask selects fd_tmean only). This
# keeps the coefficient table below reproducible on a fresh Restart & Run All.
regression = pf.feols(
    "fd_log_tfp ~ fd_tmean + fd_tmean_sq + fd_prcp + fd_prcp_sq | ISO3 + year",
    data=data,
)
regression.tidy()

Unnamed: 0_level_0,Estimate,Std. Error,t value,Pr(>|t|),2.5%,97.5%
Coefficient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fd_tmean,-0.006475999,0.009191869,-0.704536,0.482058,-0.02462014,0.01166814
fd_tmean_sq,-4.752681e-05,0.0002257485,-0.21053,0.833505,-0.0004931395,0.0003980859
fd_prcp,0.000181685,5.712337e-05,3.180572,0.001746,6.892724e-05,0.0002944428
fd_prcp_sq,-1.818987e-07,5.85087e-08,-3.108918,0.0022,-2.97391e-07,-6.640643e-08


In [38]:
# Print the estimation summary for the model currently held in `regression`.
regression.summary()

###

Estimation:  OLS
Dep. var.: fd_log_tfp, Fixed effects: ISO3+year
Inference:  CRV1
Observations:  9255

| Coefficient   |   Estimate |   Std. Error |   t value |   Pr(>|t|) |   2.5% |   97.5% |
|:--------------|-----------:|-------------:|----------:|-----------:|-------:|--------:|
| fd_tmean      |     -0.006 |        0.009 |    -0.705 |      0.482 | -0.025 |   0.012 |
| fd_tmean_sq   |     -0.000 |        0.000 |    -0.211 |      0.834 | -0.000 |   0.000 |
| fd_prcp       |      0.000 |        0.000 |     3.181 |      0.002 |  0.000 |   0.000 |
| fd_prcp_sq    |     -0.000 |        0.000 |    -3.109 |      0.002 | -0.000 |  -0.000 |
---
RMSE: 0.082 R2: 0.04 R2 Within: 0.01 


In [111]:
# Demean the four first-difference regressors by the encoded country id and year groups,
# using unit weights. The result is kept as returned; the next cell reads element [0].
fd_regressors = data[["fd_tmean", "fd_tmean_sq", "fd_prcp", "fd_prcp_sq"]].to_numpy()
fixed_effect_ids = data[["encoded_iso_id", "year"]].to_numpy()
unit_weights = np.ones(len(data))
centered_data = pf.estimation.demean(fd_regressors, fixed_effect_ids, unit_weights)

In [118]:
# Regress the raw outcome on the demeaned regressors with plain OLS (no constant column
# is appended here) and print the coefficients for comparison with the pyfixest fit.
x = centered_data[0]
y = data["fd_log_tfp"].to_numpy()
model = sm.OLS(endog=y, exog=x)
results = model.fit()
print(results.params)

[-6.47599918e-03 -4.75268083e-05  1.81684996e-04 -1.81898741e-07]


In [137]:
# In-sample MSE of the pyfixest model, whose predictions include the fixed effects.
yhat = regression.predict()
gap_with_fe = yhat - data["fd_log_tfp"]
error = np.square(gap_with_fe)
np.mean(error)

0.006735429878711787

In [140]:
# In-sample MSE of the statsmodels fit on demeaned regressors, i.e. predictions that
# carry no fixed-effect component.
yhat_ = results.predict()
gap_without_fe = yhat_ - data["fd_log_tfp"]
error = np.square(gap_without_fe)
np.mean(error)

0.006980271648325952