In [1]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
import pickle as pkl
import matplotlib.pyplot as plt
import random
import pyfixest as pf
import statsmodels.api as sm

# Create Dataset

In [48]:
import csv
import pandas as pd
import numpy as np
from countrycode import countrycode as cc
from calendar import monthrange
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [30]:
gdp_data = pd.read_csv("../data/GDP_per_capita/worldbank_wdi_gdp_per_capita.csv")
tfp_data = pd.read_csv("../data/TFP/AgTFPInternational2021_AG_TFP.csv", header=2)
natural_disasters_data = pd.read_csv("../data/natural_disasters/emdat_1960-2024.csv")
disaster_types_to_extract = ["Wildfire", "Drought", "Extreme temperature"]
gdp_years = range(1961,2024)
tfp_years = range(1962,2022)

In [3]:
def format_target_data(data, years, year_column_format, country_column, output_var):
    formatted_outcome_var = {}
    for row in data.iterrows():
        row = row[1]
        country = row[country_column]
        formatted_outcome_var[country] = {}
        for year in years:
            year_data = float(row[year_column_format.replace("year",str(year))])
            last_year_data = float(row[year_column_format.replace("year",str(year-1))])
            formatted_outcome_var[country][year] = {}
            if np.isnan(year_data) or np.isnan(last_year_data):
                formatted_outcome_var[country][year][output_var] = np.NaN
            else:
                outcome = np.log(float(year_data)) - np.log(float(last_year_data))
                formatted_outcome_var[country][year][output_var] = outcome
    return formatted_outcome_var

In [4]:
def add_climate_vars_to_dataset(dataset, climate_val, prev_climate_val, country, year, climate_var, weights):
    dataset[country][year][f"{climate_var}_{weights}"] = climate_val
    dataset[country][year][f"{climate_var}_{weights}_2"] = np.square(climate_val)
    dataset[country][year][f"{climate_var}_{weights}_3"] = np.power(climate_val,3)
    if prev_climate_val != None:
        dataset[country][year][f"fd_{climate_var}_{weights}"] = climate_val - prev_climate_val
        dataset[country][year][f"fd_{climate_var}_{weights}_2"] = np.square(climate_val) - np.square(prev_climate_val)
        dataset[country][year][f"fd_{climate_var}_{weights}_3"] = np.power(climate_val,3) - np.power(prev_climate_val,3)
    return dataset

In [37]:
def add_natural_disasters_to_dataset(dataset, extracted_disasters):
    for country, data_by_year in dataset.items():
        for year, data in data_by_year.items():
            for disaster in disaster_types_to_extract:
                if country in extracted_disasters and year in extracted_disasters[country] and disaster in extracted_disasters[country][year]:
                    dataset[country][year][disaster] = 1
                else:
                    dataset[country][year][disaster] = 0
    return dataset

In [32]:
extracted_disasters["AFG"][1969]

{'Drought': 1}

In [6]:
gdp_data = gdp_data.replace("..", np.NaN)
formatted_gdp_data = format_target_data(gdp_data, gdp_years, "year [YRyear]", "Country Code", "fd_ln_gdp")
formatted_tfp_data = format_target_data(tfp_data, tfp_years, "year", "ISO3", "fd_ln_tfp")

In [38]:
extracted_disasters = {}
missing_countries = set()
extreme_temp_subtypes = set()
for row in natural_disasters_data.iterrows():
    row = row[1]
    disaster_type = row["Disaster Type"]
    if disaster_type in disaster_types_to_extract:
        if disaster_type == "Extreme temperature" and row["Disaster Subtype"] != "Heat wave":
            continue
        country = row.ISO
        year = int(row["DisNo."].split("-")[0])
        if disaster_type == "Extreme temperature":
            disaster_type = "Heat_wave"
        if country not in extracted_disasters:
            extracted_disasters[country] = {}
        if year not in extracted_disasters[country]:
            extracted_disasters[country][year] = {}
        extracted_disasters[country][year][disaster_type] = 1

formatted_gdp_data = add_natural_disasters_to_dataset(formatted_gdp_data, extracted_disasters)
formatted_tfp_data = add_natural_disasters_to_dataset(formatted_tfp_data, extracted_disasters)

In [8]:
for climate_var in ["temp","precip","humidity"]:
    for weights in ["unweighted", "pop_weighted","ag_weighted"]:
        aggregate_var = "mean"
        if weights != "unweighted":
            aggregate_var = "weighted_mean"
        weights_no_dash = weights.replace("_","")
        data = pd.read_csv(f"../data/{climate_var}/monthly/processed_by_country/{weights}/{climate_var}.monthly.bycountry.{weights_no_dash}.mean.csv")
        for row in data.iterrows():
            prev_climate_val = None
            row = row[1]
            country = cc(row.country, origin="fips", destination="iso3c")
            if country != None:
                for year in range(1960,2024):
                    monthly_climate_vals = []
                    for month in range(1,13):
                        if month < 10:
                            month = "0" + str(month)
                        monthly_climate_vals.append(row[f"{weights_no_dash}_by_country.{aggregate_var}.X{year}.{month}.01"])
                    annual_climate_mean = np.mean(monthly_climate_vals)
                    if climate_var == "temp":
                        # celsius to kelvin
                        annual_climate_mean = annual_climate_mean - 273.15
                    elif climate_var == "precip":
                        # precipitation rate per second to total monthly precipitation (X by approx. # of seconds in a month)
                        annual_climate_mean = annual_climate_mean * 2.628e+6
                    if year in gdp_years and country in formatted_gdp_data:
                        formatted_gdp_data = add_climate_vars_to_dataset(formatted_gdp_data, annual_climate_mean, prev_climate_val, country, year, climate_var, weights)
                    if year in tfp_years and country in formatted_tfp_data:
                        formatted_tfp_data = add_climate_vars_to_dataset(formatted_tfp_data, annual_climate_mean, prev_climate_val, country, year, climate_var, weights)
                    prev_climate_val = annual_climate_mean

In [58]:
prev_results = {"annual_climate_std":{},"mean_daily_climate_std":{}}
for climate_var in ["temp","precip","humidity"]:
    for weights in ["unweighted", "pop_weighted","ag_weighted"]:
        print(climate_var, weights)
        aggregate_var = "mean"
        if weights != "unweighted":
            aggregate_var = "weighted_mean"
        weights_no_dash = weights.replace("_","")
        for year in range(1960,2024):
            data = pd.read_csv(f"../data/{climate_var}/daily/processed_by_country/{weights}/{climate_var}.daily.bycountry.{weights_no_dash}.{year}.csv")
            data["ISO3"] = cc(data.country, origin="fips", destination="iso3c")
            climate_columns = data.loc[:, data.columns.str.startswith(f"{weights}_by_country.{aggregate_var}")]
            data["annual_std"] = np.std(climate_columns, axis=1)
            # TODO: ensure that this number is correct for all files
            for measurement in range(0,1464,4):
                data[f"daily_std_{int(measurement/4)}"] = np.std(climate_columns.iloc[:,measurement:measurement+4], axis=1)
            data["mean_daily_std"] = np.mean(data.loc[:, data.columns.str.startswith("daily_std")], axis=1)
            for row in data.iterrows():
                row = row[1]
                country = row.ISO3
                mean_daily_climate_std = row.mean_daily_std
                annual_climate_std = row.annual_std
                prev_daily_std, prev_annual_std = None, None
                if country in prev_results["mean_daily_climate_std"] and year-1 in prev_results["mean_daily_climate_std"][country]:
                    prev_daily_std = prev_results["mean_daily_climate_std"][country][year-1]
                if country in prev_results["annual_climate_std"] and year-1 in prev_results["annual_climate_std"][country]:
                    prev_annual_std = prev_results["annual_climate_std"][country][year-1]
                if year in gdp_years and country in formatted_gdp_data:
                    formatted_gdp_data = add_climate_vars_to_dataset(formatted_gdp_data, mean_daily_climate_std, prev_daily_std, country, year, climate_var + "_daily_std", weights)
                    formatted_gdp_data = add_climate_vars_to_dataset(formatted_gdp_data, annual_climate_std, prev_annual_std, country, year, climate_var + "_annual_std", weights)
                if year in tfp_years and country in formatted_tfp_data:
                    formatted_tfp_data = add_climate_vars_to_dataset(formatted_tfp_data, mean_daily_climate_std, prev_daily_std, country, year, climate_var + "_daily_std", weights)
                    formatted_tfp_data = add_climate_vars_to_dataset(formatted_tfp_data, annual_climate_std, prev_annual_std, country, year, climate_var + "_annual_std", weights)
                if country not in prev_results["mean_daily_climate_std"]:
                    prev_results["mean_daily_climate_std"][country] = {}
                prev_results["mean_daily_climate_std"][country][year] = mean_daily_climate_std
                if country not in prev_results["annual_climate_std"]:
                    prev_results["annual_climate_std"][country] = {}
                prev_results["annual_climate_std"][country][year] = annual_climate_std

temp unweighted


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x700248900f70>>
Traceback (most recent call last):
  File "/home/hayden_freedman/pymc_dev.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [54]:
def write_regression_data_to_file(file, data):
    writer = csv.writer(file)
    headers =["country","year"]
    for column in data["AFG"][1961]:
        headers.append(column)
    writer.writerow(headers)
    for country, data_by_year in data.items():
        for year, data in data_by_year.items():
            new_row = [country,year]
            for column in headers[2:]:
                try:
                    new_row.append(data[column])
                except KeyError:
                    print(country, year, column)
                    assert 1 == 2
            writer.writerow(new_row)

with open("../data/regression/gdp_regression_data.csv", "w") as gdp_file:
    write_regression_data_to_file(gdp_file, formatted_gdp_data)
# with open("../data/regression/tfp_regression_data.csv", "w") as tfp_file:
#     write_regression_data_to_file(tfp_file, formatted_tfp_data)

AFG 1992 temp_daily_std_ag_weighted


AssertionError: 

In [None]:
data = {
    "country":[],"year":[],
    # outcome variables
    "fd_ln_GDP":[],"fd_ln_TFP":[],
    # mean annual temp
    "temp_uw_mean_annual":[],"temp2_uw_mean_annual":[],"fd_temp_uw_mean_annual":[],"fd_temp2_uw_mean_annual":[],
    # mean annual precip
    "precip_uw_mean_annual":[],"precip2_uw_mean_annual:[]","fd_precip_uw_mean_annual":[],"fd_precip2_uw_mean_annual":[],
    # mean annual humidity
    
}

In [47]:
var_list = ["tmean","tmean_sq","tmean_cu","fd_tmean","fd_tmean_sq","fd_tmean_cu","prcp","prcp_sq","prcp_cu","fd_prcp","fd_prcp_sq","fd_prcp_cu"]
permutation_list = [
    np.ones(len(var_list)),
    [0,0,0,1,1,0,0,0,0,1,1,0],
    [1,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,1,0,0,0,0,0,0,0,0],
]


In [48]:
headers = []
for var in var_list:
    headers.append(var)
headers.append("In-sample MSE")
with open("test_out.csv", "w") as file_output:
    writer = csv.writer(file_output)
    writer.writerow(headers)
    for permutation in permutation_list:
        vars = " + ".join([var for index, var in enumerate(var_list) if permutation[index] == 1])
        regression = pf.feols(f"fd_log_tfp ~ {vars} | ISO3 + year", data=data)
        yhat = regression.predict()
        error = np.mean(np.square(yhat-data.fd_log_tfp))
        res_row = []
        for i in permutation:
            res_row.append(i)
        res_row.append(error)
        writer.writerow(res_row)

In [37]:
regression.tidy()

Unnamed: 0_level_0,Estimate,Std. Error,t value,Pr(>|t|),2.5%,97.5%
Coefficient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fd_tmean,-0.006475999,0.009191869,-0.704536,0.482058,-0.02462014,0.01166814
fd_tmean_sq,-4.752681e-05,0.0002257485,-0.21053,0.833505,-0.0004931395,0.0003980859
fd_prcp,0.000181685,5.712337e-05,3.180572,0.001746,6.892724e-05,0.0002944428
fd_prcp_sq,-1.818987e-07,5.85087e-08,-3.108918,0.0022,-2.97391e-07,-6.640643e-08


In [38]:
regression.summary()

###

Estimation:  OLS
Dep. var.: fd_log_tfp, Fixed effects: ISO3+year
Inference:  CRV1
Observations:  9255

| Coefficient   |   Estimate |   Std. Error |   t value |   Pr(>|t|) |   2.5% |   97.5% |
|:--------------|-----------:|-------------:|----------:|-----------:|-------:|--------:|
| fd_tmean      |     -0.006 |        0.009 |    -0.705 |      0.482 | -0.025 |   0.012 |
| fd_tmean_sq   |     -0.000 |        0.000 |    -0.211 |      0.834 | -0.000 |   0.000 |
| fd_prcp       |      0.000 |        0.000 |     3.181 |      0.002 |  0.000 |   0.000 |
| fd_prcp_sq    |     -0.000 |        0.000 |    -3.109 |      0.002 | -0.000 |  -0.000 |
---
RMSE: 0.082 R2: 0.04 R2 Within: 0.01 


In [111]:
centered_data = pf.estimation.demean(
    np.array(data[["fd_tmean", "fd_tmean_sq", "fd_prcp", "fd_prcp_sq"]]), 
    np.array(data[["encoded_iso_id","year"]]), 
    np.ones(len(data))
)

In [118]:
x = centered_data[0]
y = np.array(data.fd_log_tfp)
model = sm.OLS(y,x)
results = model.fit()
print(results.params)

[-6.47599918e-03 -4.75268083e-05  1.81684996e-04 -1.81898741e-07]


In [137]:
# predict with fixed effects
yhat = regression.predict()
error = np.square(yhat-data.fd_log_tfp)
np.mean(error)

0.006735429878711787

In [140]:
# predict without fixed effects
yhat_ = results.predict()
error = np.square(yhat_-data.fd_log_tfp)
np.mean(error)

0.006980271648325952