In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import dotenv_values, find_dotenv

In [11]:
# this looks for your configuration file and then reads it as a dictionary
config = dotenv_values(find_dotenv())

In [12]:
# set path using the dictionary key for which one you want
correlationpath = os.path.abspath(config["CORRELATIONDATA"]) + '\\'
cleandatapath = os.path.abspath(config["CLEANDATA"]) + '\\'

In [14]:
# import the correlation data
gdpcorr = pd.read_excel(open(correlationpath+'correlation_data.xlsx','rb'),sheet_name='gdp_first_diff_corr')
conscorr = pd.read_excel(open(correlationpath+'correlation_data.xlsx','rb'),sheet_name='c_first_diff_corr')

# reset index in order to reshape
gdpcorr = gdpcorr.set_index(gdpcorr.columns[0])
gdpcorr.index.name = None
conscorr = conscorr.set_index(conscorr.columns[0])
conscorr.index.name = None

# reshape
gdp_reshape = gdpcorr.stack().reset_index()
cons_reshape = conscorr.stack().reset_index()

# rename the columns
gdp_reshape.columns = ['iso3_firstcountry', 'iso3_secondcountry', 'gdp_corr']
cons_reshape.columns = ['iso3_firstcountry', 'iso3_secondcountry', 'cons_corr']

allcorrelationdata = pd.merge(left=gdp_reshape, right=cons_reshape, how='inner', on=['iso3_firstcountry', 'iso3_secondcountry'])

# create mask that will remove duplicates
mask_duplicates = (allcorrelationdata[['iso3_firstcountry', 'iso3_secondcountry']].apply(frozenset, axis=1).duplicated()) | (allcorrelationdata['iso3_firstcountry']==allcorrelationdata['iso3_secondcountry'])
allcorrelationdata = allcorrelationdata[~mask_duplicates]

# countries that were included when we calculated the correlations: 
allcorrelationcountries = list(gdpcorr.index.unique())

In [16]:
# import the shares data (original version)
shares_original = pd.read_pickle(cleandatapath+'FullData_OriginalData_plusExports.pkl')[['iso3', 'year', 'Import USD', 'Import EUR', 'Export USD', 'Export EUR']]

# import the shares data (alternative version)
shares_alt = pd.read_pickle(cleandatapath+'FullData_OurData_NScode_TSimportShares_Taiwan.pkl')[['iso3', 'year', 'Import USD', 'Import EUR', 'Export USD', 'Export EUR']]

# import the shares data (edited version of original)
shares_altoriginal = pd.read_pickle(cleandatapath+'FullData_OurData_NScode_CorrectedOriginalVersion.pkl')[['iso3', 'year', 'Import USD', 'Import EUR', 'Export USD', 'Export EUR']]

# countries with enough invoicing data (can use any version, will have the same countries)
allsharescountries = list(shares_original.dropna()['iso3'].unique())

# organize data
dataversions = [shares_original, shares_alt, shares_altoriginal]
dataversions_names = ['original', 'alt', 'altoriginal']
sharesdata_dict = dict.fromkeys(dataversions_names)

for i, version in enumerate(dataversions):
    # taking the same year range as the correlation data and only using one obs per year since it is annual data
    # only consider countries that we calculated correlation coefficients for
    version = version[(version['iso3'].isin(allcorrelationcountries)) & ((version['year'] >= 1990) & (version['year'] <= 2021))].groupby(['iso3', 'year']).first().reset_index()
    # getting the averages
    version = version[['iso3', 'Import USD', 'Import EUR', 'Export USD', 'Export EUR']].groupby(['iso3'], as_index=False).mean()
    # some countries just don't have data for both shares (canada, mexico, philippines, and south africa)
    # this line drops these countries
    version = version.dropna()
    version.name = dataversions_names[i]
    
    sharesdata_dict[version.name] = version

In [17]:
# import consumption, centrality and nominal gdp shares
cons_ngdp_cent = pd.read_pickle(cleandatapath+'cons_ngdp_cent_for_regression.pkl')

worldcons = cons_ngdp_cent[['year', 'iso3', 'worldtotcons', 'totcons']]
cent = cons_ngdp_cent[['year', 'iso3', 'cent_exp']]
ngdpshares = cons_ngdp_cent[['year', 'iso3', 'gdpshare']]

# list of countries with consumption data
allconsumptioncountries = list(worldcons.dropna()['iso3'].unique())

# list of countries with centrality data
allcentcountries = list(cent.dropna()['iso3'].unique())

# list of countries with gdp share data
allngdpcountries = list(ngdpshares.dropna()['iso3'].unique())

# taking the same year range as the correlation data
# only consider countries that we calculated correlation coefficients for
worldcons = worldcons[(worldcons['iso3'].isin(allcorrelationcountries)) & ((worldcons['year'] >= 1990) & (worldcons['year'] <= 2021))]
cent = cent[(cent['iso3'].isin(allcorrelationcountries)) & ((cent['year'] >= 1990) & (cent['year'] <= 2021))]
ngdpshares = ngdpshares[(ngdpshares['iso3'].isin(allcorrelationcountries)) & ((ngdpshares['year'] >= 1990) & (ngdpshares['year'] <= 2021))]

In [18]:
# countries that have data for correlations, correlations with world consumption, nominal GDP shares, and invoice shares
countriestoinclude = list(set(allcorrelationcountries) & set(allsharescountries) & set(allconsumptioncountries) & set(allngdpcountries))

In [35]:
# create a class
class My_Class:
    def __init__(self, ydata, productsdata, othercovariates, countries):
        self.countries = countries
        # select which shares will be used to calculate the products
        self.productsdata = productsdata
        # select gdp correlations vs consumption correlations
        self.ydata = ydata
        # select whatever else you need
        self.othercovariates = othercovariates

    def tell_me_about_data(self):
        print("The dependent variable will be calculated from: ", list(self.ydata.columns))
        print("The products will be calculated from: ", list(self.productsdata.columns))
        print("The added covariate will be calculated from: ", list(self.othercovariates.columns))
        print("The included countries are: ", self.countries)



In [None]:
# class My_Class:
#     def __init__(self, correlationdata, sharesdata, covariates, countries, detrendingversion, sharesversion, regressiontype):
#

In [37]:
# create an object with attributes defined as: consumption correlation
myobject = My_Class(allcorrelationdata[['cons_corr']], sharesdata_dict['alt'][['Import USD', 'Import EUR']], cent[['cent_exp']], countries=countriestoinclude)

myobject.tell_me_about_data()

The dependent variable will be calculated from:  ['cons_corr']
The products will be calculated from:  ['Import USD', 'Import EUR']
The added covariate will be calculated from:  ['cent_exp']
The included countries are:  ['BEL', 'CZE', 'CHL', 'HUN', 'KOR', 'IDN', 'FIN', 'DNK', 'GBR', 'SWE', 'TUR', 'NOR', 'USA', 'DEU', 'IRL', 'TWN', 'LUX', 'IND', 'NLD', 'FRA', 'JPN', 'POL', 'ESP', 'ITA', 'AUS', 'BRA', 'AUT', 'CHE', 'ISL', 'THA', 'GRC', 'PRT']
