In [2]:
import csv
import os
import pymc as pm
from pymc import do, observe
import pandas as pd
import numpy as np
import arviz as az
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from pytensor import tensor as pt
import pickle as pkl
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import itertools as it
import country_converter as cc
import math



# Import data

In [37]:
# Integrated country-year dataset; this is the same path that the
# dataset-construction cell in this notebook writes to.
integrated_dataset_path = "../data/processed/burke_ortizbobea_integrated_dataset_with_custom_temp.csv"
data = pd.read_csv(integrated_dataset_path)

# Remove all data for countries where one of the variables is entirely missing

In [38]:
def _countries_where(flags):
    """Return the country labels (index entries of ``flags``) whose flag is True."""
    return flags.index[flags.to_numpy()].tolist()


# The four variables the model needs; a country is unusable if any one of them
# has no observation at all for that country.
required_columns = ["unweighted_temp", "unweighted_precip", "ln_gdp_change", "ln_tfp_change"]

# One row per country, one boolean column per variable:
# True where *every* value of that variable is missing for the country.
entirely_missing = data[required_columns].isna().groupby(data["country"]).all()

countries_missing_temp = _countries_where(entirely_missing["unweighted_temp"])
countries_missing_precip = _countries_where(entirely_missing["unweighted_precip"])
countries_missing_gdp = _countries_where(entirely_missing["ln_gdp_change"])
countries_missing_tfp = _countries_where(entirely_missing["ln_tfp_change"])

countries_to_remove = set(countries_missing_temp + countries_missing_precip + countries_missing_gdp + countries_missing_tfp)
print(countries_to_remove)

data_len_before = len(data)
# Select rows with a boolean mask instead of collecting positional indices and
# passing them to the label-based DataFrame.drop: the mask is correct whatever
# the current index looks like.
# drop=True keeps the old index out of the columns, so re-running this cell
# does not accumulate "index" / "level_0" columns.
data = data.loc[~data["country"].isin(countries_to_remove)].reset_index(drop=True)
print(f"Removed {data_len_before - len(data)} rows for completely missing country data.")

[]
{'MLT', 'NOR', 'SYC', 'SGP', 'BHR', 'COD', 'JAM', 'ISR', 'ROU'}
Removed 504 rows for completely missing country data.


# Scale data

In [30]:
def _fit_standard_scaler(values):
    """Fit a fresh StandardScaler on one column.

    Returns the fitted scaler together with the scaled values flattened back
    to one dimension.
    """
    scaler = StandardScaler()
    as_column = np.array(values).reshape(-1, 1)
    return scaler, scaler.fit_transform(as_column).flatten()


# Each variable gets its own scaler so it can be inverse-transformed later.
precip_scaler, precip_scaled = _fit_standard_scaler(data.unweighted_precip)
gdp_scaler, gdp_scaled = _fit_standard_scaler(data.ln_gdp_change)
temp_scaler, temp_scaled = _fit_standard_scaler(data.unweighted_temp)
tfp_scaler, tfp_scaled = _fit_standard_scaler(data.ln_tfp_change)

# Year and country fixed effects

In [33]:
def _indicator_rows(codes, n_levels):
    """Build one 0/1 indicator vector per level.

    ``codes[i]`` is the integer level of data row ``i``. The result is a list
    of ``n_levels`` float arrays of length ``len(codes)``; entry ``[k][i]`` is
    1 when row ``i`` belongs to level ``k`` and 0 otherwise.
    """
    indicator = np.zeros((n_levels, len(codes)))
    indicator[codes, np.arange(len(codes))] = 1
    return list(indicator)


data_len = len(data.year)

# Countries are numbered in order of first appearance. For rows grouped by
# country this matches a running counter, but it does not break if a country's
# rows are not contiguous.
country_codes, country_levels = pd.factorize(data.country)

# Years are numbered by their rank among the years actually present, so no
# hard-coded start year (previously 1960) or gap-free range is required.
year_codes, year_levels = pd.factorize(data.year, sort=True)

# factorize marks missing values with -1, which would silently index the last
# indicator row; fail loudly instead.
if (country_codes < 0).any() or (year_codes < 0).any():
    raise ValueError("country and year must not contain missing values")

country_mult_mat = _indicator_rows(country_codes, len(country_levels))
year_mult_mat = _indicator_rows(year_codes, len(year_levels))

# Build integrated Burke-OrtizBobea model

In [36]:
# Joint model: per-country Normal models for scaled temperature and scaled
# precipitation, plus a GDP-growth regression that is quadratic in both and has
# year and country fixed effects. The TFP half of the model is commented out.
# Naming: the "*_prior" Deterministics are per-row means, and the "*_posterior"
# variables are the likelihood nodes (they are the ones given observed=...).
with pm.Model() as model:

    # One temperature mean per country. The coefficient vector is expanded to a
    # column, multiplied by the per-country indicator rows in country_mult_mat,
    # and summed over axis 0, which leaves each data row with its own country's
    # coefficient.
    country_coefs_temp_prior = pt.expand_dims(pm.Normal("country_coefs_temp_prior", 0, 1, shape=(len(set(data.country)))),axis=1)
    temp_prior = pm.Deterministic("temp_prior",pt.sum(country_coefs_temp_prior*country_mult_mat,axis=0))    
    temp_std = pm.HalfNormal("temp_std", 1)
    # NOTE(review): the recorded output lists *_observed / *_unobserved
    # variables, which suggests NaNs in the observed arrays are being imputed
    # by PyMC - confirm.
    temp_posterior = pm.Normal("temp_posterior", temp_prior, temp_std, observed=temp_scaled)
    
    # Linear and squared temperature coefficients of the GDP regression.
    temp_gdp_coef = pm.Normal('temp_gdp_coef',0,1)
    temp_gdp_coef2 = pm.Normal('temp_gdp_coef2',0,1)

    # NOTE(review): these TFP coefficients are only referenced by the
    # commented-out TFP block below, so they currently enter no likelihood.
    temp_tfp_coef = pm.Normal('temp_tfp_coef',0,1)
    temp_tfp_coef2 = pm.Normal('temp_tfp_coef2',0,1)
    temp_tfp_intercept = pm.Normal('temp_tfp_intercept',0,1)

    # Precipitation: same per-country-mean construction as temperature.
    country_coefs_precip_prior = pt.expand_dims(pm.Normal("country_coefs_precip_prior", 0, 1, shape=(len(set(data.country)))),axis=1)
    precip_prior = pm.Deterministic("precip_prior",pt.sum(country_coefs_precip_prior*country_mult_mat,axis=0))
    precip_std = pm.HalfNormal("precip_std", 1)
    precip_posterior = pm.Normal("precip_posterior", precip_prior, precip_std, observed=precip_scaled)

    # Linear and squared precipitation coefficients of the GDP regression.
    precip_gdp_coef = pm.Normal('precip_gdp_coef',0,1)
    precip_gdp_coef2 = pm.Normal('precip_gdp_coef2',0,1)

    # NOTE(review): like the temp_tfp_* coefficients, only used by the
    # commented-out TFP block.
    precip_tfp_coef = pm.Normal('precip_tfp_coef',0,1)
    precip_tfp_coef2 = pm.Normal('precip_tfp_coef2',0,1)

    # Year and country fixed effects for GDP: one coefficient per distinct
    # year / country, mapped onto data rows through the indicator rows in
    # year_mult_mat / country_mult_mat.
    gdp_year_coefs = pt.expand_dims(pm.Normal("gdp_year_coefs", 0, 10, shape=(len(set(data.year)))),axis=1)
    gdp_year_fixed_effects = pm.Deterministic("gdp_year_fixed_effects",pt.sum(gdp_year_coefs*year_mult_mat,axis=0))
    gdp_country_coefs = pt.expand_dims(pm.Normal("gdp_country_coefs", 0, 10, shape=(len(set(data.country)))),axis=1)
    gdp_country_fixed_effects = pm.Deterministic("gdp_country_fixed_effects",pt.sum(gdp_country_coefs*country_mult_mat,axis=0))

    gdp_intercept = pm.Normal("gdp_intercept", 0, 1)
    
    # Mean of GDP growth: intercept, linear and squared terms in the
    # temperature and precipitation nodes, and both fixed effects.
    gdp_prior = pm.Deterministic(
        "gdp_prior", 
        gdp_intercept + 
        (temp_gdp_coef * temp_posterior) + 
        (temp_gdp_coef2 * pt.sqr(temp_posterior)) +
        (precip_gdp_coef * precip_posterior) +
        (precip_gdp_coef2 * pt.sqr(precip_posterior)) +
        gdp_year_fixed_effects +
        gdp_country_fixed_effects
    )
    gdp_std = pm.HalfNormal('gdp_std', sigma=10)
    # NOTE(review): the likelihood is fitted to the raw data["ln_gdp_change"],
    # not to gdp_scaled, although gdp_scaler is pickled below - confirm which
    # scale is intended.
    gdp_posterior = pm.Normal('gdp_posterior', mu=gdp_prior, sigma=gdp_std, observed=data["ln_gdp_change"])

    # Disabled TFP sub-model.
    # NOTE(review): as written it references tfp_intercept and
    # precip_tfp_intercept, neither of which is defined in this cell (only
    # temp_tfp_intercept is), so it cannot simply be uncommented.
    # tfp_year_coefs = pt.expand_dims(pm.Normal("tfp_year_coefs", 0, 10, shape=(len(set(data.year)))),axis=1)
    # tfp_year_fixed_effects = pm.Deterministic("tfp_year_fixed_effects",pt.sum(tfp_year_coefs*year_mult_mat,axis=0))
    # tfp_country_coefs = pt.expand_dims(pm.Normal("tfp_country_coefs", 0, 10, shape=(len(set(data.country)))),axis=1)
    # tfp_country_fixed_effects = pm.Deterministic("tfp_country_fixed_effects",pt.sum(tfp_country_coefs*country_mult_mat,axis=0))
    
    # tfp_prior = pm.Deterministic(
    #     "tfp_prior", 
    #     tfp_intercept + 
    #     (temp_tfp_coef * temp_posterior) + 
    #     (temp_tfp_coef2 * pt.sqr(temp_posterior)) +
    #     precip_tfp_intercept +
    #     (precip_tfp_coef * precip_posterior) +
    #     (precip_tfp_coef2 * pt.sqr(precip_posterior)) +
    #     tfp_year_fixed_effects +
    #     tfp_country_fixed_effects
    # )
    
    # tfp_std = pm.HalfNormal('tfp_std', sigma=10)
    # tfp_posterior = pm.Normal('tfp_posterior', mu=tfp_prior, sigma=tfp_std, observed=data["ln_tfp_change"])
    
    # Prior predictive draws, the fitted trace, then posterior predictive
    # draws. No random_seed is passed to any of the three calls.
    prior = pm.sample_prior_predictive()
    trace = pm.sample()
    posterior = pm.sample_posterior_predictive(trace, extend_inferencedata=True)

# Persist the sampling results together with the fitted scalers.
# tfp_scaler is not included in the saved dictionary.
with open ('../models/burke_unweighted_temp.pkl', 'wb') as buff:
    pkl.dump ({
        "prior": prior, 
        "trace": trace, 
        "posterior": posterior,
        "temp_scaler": temp_scaler,
        "precip_scaler": precip_scaler,
        "gdp_scaler": gdp_scaler
    }, buff)

Sampling: [country_coefs_precip_prior, country_coefs_temp_prior, gdp_country_coefs, gdp_posterior_observed, gdp_posterior_unobserved, gdp_std, gdp_year_coefs, precip_gdp_coef, precip_gdp_coef2, precip_gdp_intercept, precip_posterior_observed, precip_posterior_unobserved, precip_std, precip_tfp_coef, precip_tfp_coef2, precip_tfp_intercept, temp_gdp_coef, temp_gdp_coef2, temp_gdp_intercept, temp_posterior, temp_std, temp_tfp_coef, temp_tfp_coef2, temp_tfp_intercept, tfp_country_coefs, tfp_posterior_observed, tfp_posterior_unobserved, tfp_std, tfp_year_coefs]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [country_coefs_temp_prior, temp_std, temp_gdp_coef, temp_gdp_coef2, temp_gdp_intercept, temp_tfp_coef, temp_tfp_coef2, temp_tfp_intercept, country_coefs_precip_prior, precip_std, precip_posterior_unobserved, precip_gdp_coef, precip_gdp_coef2, precip_gdp_intercept, precip_tfp_coef, precip_tfp_coef2, precip_tfp_i

ValueError: Not enough samples to build a trace.

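# Create integrated dataset with unweighted temperature for Burke and Ortiz-Bobea

This cell writes the CSV that the "Import data" cell reads, so it must be run before the rest of the notebook.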

In [24]:
def get_item(data):
    """Return the single scalar held by ``data``, or NaN if there is not exactly one.

    Parameters
    ----------
    data : object
        Typically a pandas Series produced by a filtered lookup. Anything
        without an ``.item()`` method is treated as "no value".

    Returns
    -------
    scalar or float
        ``data.item()`` when that succeeds; ``np.nan`` when ``data`` has no
        ``.item()`` method (AttributeError) or ``.item()`` raises ValueError,
        which it does for a selection that is empty or has several elements.
    """
    try:
        return data.item()
    except (AttributeError, ValueError):
        # np.nan rather than the np.NaN alias: the alias was removed in
        # NumPy 2.0, so using it here would raise inside the handler.
        return np.nan

def _lookup(frame, country_col, year_col, value_col, country, year):
    """Return ``value_col`` for the (country, year) row of ``frame``.

    Delegates to ``get_item``, so the result is NaN unless exactly one row
    matches.
    """
    is_match = (frame[country_col] == country) & (frame[year_col] == year)
    return get_item(frame.loc[is_match, value_col])


integrated_data = []

unweighted_temp_file = pd.read_csv("../data/burke/data/input/custom_monthly_unweighted_temp_by_country.csv")
unweighted_precip_file = pd.read_csv("../data/burke/data/input/custom_monthly_unweighted_precip_by_country.csv")
tfp_file = pd.read_csv("../data/ortiz-bobea/data2/regdata_preferred_case.csv")
gdp_file = pd.read_csv("../data/burke/data/input/GrowthClimateDataset.csv")

# Use the frames loaded above. The previous code referred to tfp_data /
# gdp_data, which are never defined in this notebook and therefore only worked
# with leftover kernel state.
all_countries = sorted(set(tfp_file["ISO3"]).intersection(set(gdp_file["iso"])))
all_years = set(tfp_file["year"]).union(set(gdp_file["year"]))

# One output row per (country, year). Years are iterated in sorted order so the
# file is written deterministically, grouped by country and ascending in year.
for country in all_countries:
    for year in sorted(all_years):
        unweighted_temp = _lookup(unweighted_temp_file, "Country", "Year", "Mean_Temp", country, year)
        unweighted_precip = _lookup(unweighted_precip_file, "Country", "Year", "Unweighted_Precipitation", country, year)
        gdp = _lookup(gdp_file, "iso", "year", "growthWDI", country, year)
        tfp = _lookup(tfp_file, "ISO3", "year", "fd_log_tfp", country, year)
        integrated_data.append([country, year, unweighted_temp, unweighted_precip, gdp, tfp])

# newline="" is what the csv module asks for when opening files for a writer;
# without it, platforms that translate line endings get blank rows.
with open("../data/processed/burke_ortizbobea_integrated_dataset_with_custom_temp.csv", "w", newline="") as write_file:
    writer = csv.writer(write_file)
    writer.writerow(["country","year","unweighted_temp","unweighted_precip","ln_gdp_change","ln_tfp_change"])
    writer.writerows(integrated_data)