import csv
import os
import pymc as pm
import pandas as pd
import numpy as np
import arviz as az
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pytensor import tensor as pt
import pickle as pkl
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import country_converter as cc

In [43]:
data = pd.read_csv("../data/processed/drought-agtfp-gdp-data.csv")
print(len(data))

11387


# Drought - Ag. TfP

## remove missing rows

In [44]:
indices_to_drop = []
no_nan_cols = ["ln_TFP_change"]
for index, row in enumerate(data.itertuples()):
    if any(np.isnan(getattr(row,col)) for col in no_nan_cols):
        indices_to_drop.append(index)
data = data.drop(indices_to_drop)
data = data.reset_index()
print(len(data))

8715


## linear regression to get a sense of good priors

In [57]:
from sklearn.linear_model import LinearRegression
x = np.array(data.drought).reshape(-1,1)
y = np.array(data.ln_TFP_change).reshape(-1,1)
model = LinearRegression().fit(x,y)
print(model.coef_, model.intercept_)
print(np.std(data.ln_TFP_change))

[[-0.00878432]] [0.0082863]
0.0827685552697079


## create fixed effect matrices

In [82]:
min_year = min(data.year)
data_len = len(data.year)
year_fixed_effect_matrix = [np.zeros(data_len) for year in set(data.year)]
country_fixed_effect_matrix = [np.zeros(data_len) for country in set(data.country)]
country_index = -1
curr_country = ""
for row_index, row in enumerate(data.itertuples()):
    if row.country != curr_country:
        country_index += 1
        curr_country = row.country
    year_index = row.year - min_year
    country_fixed_effect_matrix[country_index][row_index] = 1
    year_fixed_effect_matrix[year_index][row_index] = 1

## create gradual effect matrices

In [83]:
data_len = len(data.year)
country_counters = {}
country_grad_effect_matrix = [np.zeros(data_len) for country in set(data.country)]
country_index = -1
curr_country = ""
for row_index, row in enumerate(data.itertuples()):
    if row.country != curr_country:
        country_index += 1
        curr_country = row.country
        if curr_country not in country_counters:
            country_counters[curr_country] = 1
    country_grad_effect_matrix[country_index][row_index] = country_counters[curr_country]
    country_counters[curr_country] += 1

## Build Model

In [93]:
with pm.Model() as model:

    drought_tfp_coef = pm.Normal("drought_tfp_coef", 0, .1)
    intercept = pm.Normal("intercept", 0, .1)

    year_fixed_effect_coefs = pt.expand_dims(pm.Normal("year_fixed_effect_coefs", 0, 10, shape=(len(set(data.year)))),axis=1)
    year_fixed_effects = pm.Deterministic("year_fixed_effects",pt.sum(year_fixed_effect_coefs*year_fixed_effect_matrix,axis=0))

    country_fixed_effect_coefs = pt.expand_dims(pm.Normal("country_fixed_effect_coefs", 0, 10, shape=(len(set(data.country)))),axis=1)
    country_fixed_effects = pm.Deterministic("country_fixed_effects",pt.sum(country_fixed_effect_coefs*country_fixed_effect_matrix,axis=0))

    country_grad_effect_coefs = pt.expand_dims(pm.Normal("country_grad_effect_coefs", 0, 10, shape=(len(set(data.country)))),axis=1)
    country_grad_effects = pm.Deterministic("grad_effects",pt.sum(country_grad_effect_coefs*country_grad_effect_matrix,axis=0))
    
    tfp_prior = pm.Normal(
        "tfp_prior", 
        (drought_tfp_coef * data.drought) + 
        country_fixed_effects +
        year_fixed_effects + 
        country_grad_effects +
        intercept
    )

    tfp_sd = pm.HalfNormal("tfp_sd", 1)
    tfp_posterior = pm.Normal("tfp_posterior", tfp_prior, tfp_sd, observed = data.ln_TFP_change)

    prior = pm.sample_prior_predictive()
    trace = pm.sample(target_accept=.99, cores=4)
    posterior = pm.sample_posterior_predictive(trace, extend_inferencedata=True)

Sampling: [country_fixed_effect_coefs, country_grad_effect_coefs, drought_tfp_coef, intercept, tfp_posterior, tfp_prior, tfp_sd, year_fixed_effect_coefs]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [drought_tfp_coef, intercept, year_fixed_effect_coefs, country_fixed_effect_coefs, country_grad_effect_coefs, tfp_prior, tfp_sd]



KeyboardInterrupt



In [64]:
az.summary(trace, var_names=["drought_tfp_coef", "intercept", "tfp_sd"])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
drought_tfp_coef,-0.007,0.039,-0.078,0.066,0.001,0.001,823.0,1321.0,1.0
intercept,0.008,0.011,-0.014,0.027,0.0,0.0,1013.0,1444.0,1.0
tfp_sd,0.034,0.012,0.019,0.056,0.005,0.004,6.0,16.0,1.95


# create dataset

In [41]:
integrated_data_dict = {
    "country":[],
    "year":[],
    "drought":[],
    "extreme_temperature":[],
    "flood":[],
    "ln_GDP_change":[],
    "ln_TFP_change":[]
}

burke_data = pd.read_csv("../data/burke/data/input/GrowthClimateDataset.csv")
ortizbobea_data = pd.read_csv("../data/ortiz-bobea/data2/regdata_preferred_case.csv")
extreme_weather_data = pd.read_csv("../data/global_data/extreme_weather_events.csv")

all_countries = sorted((set(ortizbobea_data["ISO3"]).union(set(burke_data["iso"]))).intersection(set(extreme_weather_data["ISO"])))
all_years = sorted(set(ortizbobea_data["year"]).union(set(burke_data["year"])).union(set([int(year.split("-")[0]) for year in extreme_weather_data["DisNo."]])))

for index, country in enumerate(list(all_countries)):
    for year in all_years:
        burke_row = burke_data.loc[(burke_data["iso"] == country) & (burke_data["year"] == year)]
        ob_row = ortizbobea_data.loc[(ortizbobea_data["ISO3"] == country) & (ortizbobea_data["year"] == year)]
        ew_rows = extreme_weather_data.loc[(extreme_weather_data["DisNo."].str.startswith(str(year), na=False)) & (extreme_weather_data["ISO"] == country)]
        integrated_data_dict["country"].append(country)
        integrated_data_dict["year"].append(year)
        try:
            integrated_data_dict["ln_TFP_change"].append(ob_row["fd_log_tfp"].item())
        except ValueError:
            integrated_data_dict["ln_TFP_change"].append(np.NaN)
        try:
            integrated_data_dict["ln_GDP_change"].append(burke_row["growthWDI"].item())
        except ValueError:
            integrated_data_dict["ln_GDP_change"].append(np.NaN)
        for disaster_type in ["Flood","Drought","Extreme temperature"]:
            if len(ew_rows) == 0:
                integrated_data_dict[disaster_type.lower().replace(" ","_")].append(0)
            else:
                if any(ew_row["Disaster Type"] == disaster_type for _, ew_row in ew_rows.iterrows()):
                    integrated_data_dict[disaster_type.lower().replace(" ","_")].append(1)
                else:
                    integrated_data_dict[disaster_type.lower().replace(" ","_")].append(0)
pd.DataFrame.from_dict(integrated_data_dict).to_csv("../data/processed/drought-agtfp-gdp-data.csv", index=False)    