In [1]:
import pandas as pd
import numpy as np
import pickle
from ipywidgets import FloatProgress
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re
import statsmodels.tsa.x13 as X13
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.dates as pltdates
import matplotlib.patches as mpatches
import matplotlib as matplotlib
from IPython.display import display
import argparse
import os
import subprocess
from datetime import datetime
import scipy as scipy
import xlrd as xlrd
import country_converter as coco

In [2]:
# Parameters, folder locations and the base portfolio dataset

def _project_dir(*parts):
    """Absolute path, with a trailing '/', of a folder located next to the working directory."""
    return os.path.abspath(os.path.join(os.getcwd(), '..', *parts)) + '/'

path_rawdata = _project_dir('RawData')
path_cleandata = _project_dir('CleanData')
path_invoicing = _project_dir('CodeNick/Old Cleaning Code and Data')
path_ICIO = _project_dir('CodeNick/Old Cleaning Code and Data/ICIO_for_matlab')

# File-name suffixes: saveAppend tags the files written at the end of the notebook,
# loadAppend_FX selects which PortfolioData file (new FX data) is read below
saveAppend = '_currencyShareRaw'
loadAppend_FX = '_OurData_NScode'

# Options on constructing consumption currency shares and production currency shares
# (the commented-out switches are no longer read anywhere in this notebook)
#useTimeSeries =True
#matchTS_toICIO = False  # set to 1 to make the Import/C time series match the ICIO in 2014 (when ICIO was calculated for)
#FullHomeShare = 1  # Set to 1 to assume 100% of home consumption is in home currency

# When True, country-specific linear trends are estimated and used to fill the
# currency shares for the years before a country's first observation
interpolate_backward = True

#nonImportConsumptionTransformation = []
#useTimeSeriesCountryList = [] #'Poland','Thailand','India','Indonesia','Hungary','New Zealand'
#useCrossSectionalCountryList = ['Taiwan'] # Taiwan doesn't have any data in WDI. For now use other method. ['Switzerland','Czech Republic','Thailand','Sweden','Taiwan'] #

# Base dataset that the invoicing shares are merged into further down
portfoliodata = pd.read_pickle(f"{path_cleandata}PortfolioData{loadAppend_FX}.pkl")

In [3]:
# Load the annual invoicing-currency shares (imports_currency_shares_time_series.xlsx)

# Observed, non-interpolated shares. The path is handed to pandas directly so the
# workbook is opened and closed by read_excel instead of leaking an open file handle.
currency_TS = pd.read_excel(path_invoicing+'imports_currency_shares_time_series.xlsx',sheet_name='Annual')

# The full sample is 1970 to 2023, so build a balanced Country x year grid; the outer
# merge below adds the country-years that are absent from the spreadsheet
years = range(1970,2024)
countries = currency_TS['Country'].unique()
tomerge = pd.DataFrame([(c, y) for c in countries for y in years], columns=['Country','year'])

currency_TS = pd.merge(currency_TS,tomerge,on=['Country','year'],how='outer')

# Rows added by the grid carry no identifiers yet: copy each country's first
# non-missing ISO3 / IMF code onto all of its rows, then drop countries without an ISO3 code
for code_col in ['ISO3C','imfcode']:
    currency_TS[code_col] = currency_TS.groupby('Country')[code_col].transform('first')
currency_TS = currency_TS[currency_TS['ISO3C'].notnull()]

# Working copy whose share columns get an '_I' suffix; it will hold the
# interpolated / extrapolated values, while currency_TS keeps the observed data
ts_key_cols = ['Country', 'ISO3C', 'imfcode', 'year']
currency_TS_I = currency_TS.copy()
currency_TS_I = currency_TS_I.rename(columns={c: c+'_I' for c in currency_TS_I.columns if c not in ts_key_cols})



In [4]:
# Define a function to clean up the new import and export series
def first_last_nams(df,var):
    """Flat-extend each country's series beyond its first and last observation.

    Within every ``Country`` group, rows dated before the first non-missing value
    of ``var`` receive that first value, and rows dated after the last non-missing
    value receive that last value. Gaps between observations are left untouched.

    "First" and "last" follow row order inside each country, so the frame is
    expected to be sorted by ``year`` within ``Country``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Country'``, ``'year'`` and ``var``.
        The ``var`` column is filled in place.
    var : str
        Name of the column to extend.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``var`` extended. No helper columns are left behind.
    """
    # Years at which var is observed (NaN elsewhere); 'first'/'last' skip the NaNs
    observed_year = df['year'].where(df[var].notnull())
    first_year = observed_year.groupby(df['Country']).transform('first')
    last_year = observed_year.groupby(df['Country']).transform('last')

    # End-point values per country. The last value must use 'last': taking 'first'
    # here would extend the series forward with its opening value.
    first_value = df.groupby('Country')[var].transform('first')
    last_value = df.groupby('Country')[var].transform('last')

    # Comparisons against NaN are False, so countries without any data are untouched
    before_sample = df['year'] < first_year
    after_sample = df['year'] > last_year

    df.loc[before_sample, var] = first_value[before_sample]  # carry first value backward
    df.loc[after_sample, var] = last_value[after_sample]  # carry last value forward

    return df

# Run the above function (Turned off because we decided not to use this except for the backward stuff below)
#currency_TS_I = first_last_nams(currency_TS_I,'Export USD_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Export EUR_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Export Home_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Export Other_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Export Unclassified_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Export Other_Excl_USDEUR_I')

#currency_TS_I = first_last_nams(currency_TS_I,'Import USD_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Import EUR_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Import Home_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Import Other_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Import Unclassified_I')
#currency_TS_I = first_last_nams(currency_TS_I,'Import Other_Excl_USDEUR_I')


# Linearly interpolate values missing between two observations of the same country
# (limit_area='inside' leaves leading and trailing gaps alone).
# transform() is used instead of groupby.apply(): it keeps the original row index,
# whereas apply() prepends ISO3C as an index level on pandas >= 2.0, which makes the
# later merge on 'ISO3C' ambiguous. Only numeric non-key columns are interpolated,
# since identifier/text columns have nothing to interpolate.
interp_key_cols = ['Country', 'ISO3C', 'imfcode', 'year']
interp_cols = [col for col in currency_TS_I.select_dtypes(include='number').columns
               if col not in interp_key_cols]
if interp_cols:
    currency_TS_I[interp_cols] = currency_TS_I.groupby('ISO3C')[interp_cols].transform(
        lambda s: s.interpolate(limit_area='inside'))

# NS Now let's try interpolating backwards and forwards more intelligently. To do this I will estimate currency
# share slopes and interpolate these backwards
interpolateVarList = ['Export Home','Import Home','Export USD','Import USD','Export EUR',\
                      'Import EUR','Export Other','Import Other']
if interpolate_backward:
    clist  = currency_TS['Country'].unique()
    for c in clist:
        is_country = currency_TS_I['Country']==c
        for v in interpolateVarList:
            # Country-specific linear trend, fitted on the observed (non-interpolated) data
            temp = currency_TS[currency_TS['Country']==c][[v,'year']].dropna()
            if len(temp)>2:
                mod = sm.OLS(temp[v],sm.add_constant(temp['year'])).fit()
                # Positional access via .iloc: params is label-indexed ('const', 'year')
                # and plain integer keys on it are deprecated
                intercept, slope = mod.params.iloc[0], mod.params.iloc[1]

                # Fill the years before the first observation with the fitted trend
                firstYear = temp['year'].min()
                pre_sample = is_country & (currency_TS_I['year']<firstYear)
                currency_TS_I.loc[pre_sample,v+'_I'] = intercept+slope*currency_TS_I.loc[pre_sample,'year']

                # Shares are percentages: keep this country's series inside [0, 100]
                currency_TS_I.loc[is_country,v+'_I'] = currency_TS_I.loc[is_country,v+'_I'].clip(lower=0,upper=100)



In [5]:

# Put the observed shares and their interpolated (_I) counterparts side by side,
# one row per country-year
currency_TS_all = pd.merge(left=currency_TS, right=currency_TS_I, how='inner', on=['Country', 'ISO3C', 'imfcode', 'year'])

# For each flow (Export / Import) and each version (observed / interpolated):
# a row that reports at least one of the four currency shares has its missing
# shares set to 0; rows that report none of them stay missing.
for flow in ['Export', 'Import']:
    for suffix in ['', '_I']:
        share_cols = [flow+' '+cur+suffix for cur in ['USD', 'EUR', 'Home', 'Other']]
        # evaluated once per set, before any of its columns is filled
        notnullInd = currency_TS_all[share_cols].notnull().any(axis=1)
        for col in share_cols:
            currency_TS_all.loc[notnullInd & currency_TS_all[col].isnull(), col] = 0


In [6]:
# Country labels.
# The ICIO-based country list that used to be linked here is disabled; the code is kept for reference:
#cc = coco.CountryConverter().data
#cc.loc[cc['name_short']=='United Kingdom','ISO2']='GB'
#countryList = pd.read_csv(open(path_ICIO+'ICIO_cnty_index.csv','rb'))
#countryList = pd.merge(cc[['ISO2','ISO3','name_short']],countryList,left_on = 'ISO2',right_on='cnty',how='right')
#countryList.loc[51,'name_short']='Rest of the World'
#countryList['own_share_imports']=consumptionCountryShares
#countryList['own_share_exports']=exportCountryShares
#countryList.loc[countryList['ISO2'] == 'TR', 'name_short'] = 'Turkey'

# Country names treated as euro users by the static (name-based) euro indicator
Euro_list = [
    'Austria', 'Belgium', 'Cyprus', 'Germany', 'Estonia', 'Spain', 'Finland',
    'France', 'Greece', 'Ireland', 'Italy', 'Lithuania', 'Luxembourg', 'Latvia',
    'Monaco', 'Malta', 'Netherlands', 'Portugal', 'Slovenia', 'Slovakia',
    'Vatican City',
]

In [8]:
# Disabled: merge of the invoicing shares with the ICIO country list.
# former: right merge gets rid of any invoicing data for countries not in countryList
# current: left merge
#df_all = pd.merge(currency_TS_all,countryList,left_on='Country',right_on='name_short',how='left')

# remove rest of the world (for now)
#df_all = df_all[df_all['name_short']!='Rest of the World']

# Static euro indicator: True when the country name appears in Euro_list, regardless of year.
# NOTE(review): this column is not among the columns selected into df_all_par further down,
# and a date-based 'Euro Indicator' is assigned later, so this value looks unused in the
# saved output - confirm before relying on it.
currency_TS_all['Euro Indicator']=currency_TS_all['Country'].isin(Euro_list)



In [9]:
# Annual averages of the portfolio series, attached to the invoicing shares

# Add in a month variable and delete any entries where month does not exist
portfoliodata['month'] = portfoliodata['Date'].dt.month
portfoliodata = portfoliodata[portfoliodata['month'].notnull()]

# Average only the two ratios that are merged below. A blanket .mean() over all columns
# raises TypeError on pandas >= 2.0 as soon as the frame holds a non-numeric column
# (older versions silently dropped such columns), and averages columns nobody uses.
temp = portfoliodata.groupby(['iso3','year'])[['export_GDP','import_Cons']].mean().reset_index()

# Left merge: keep every invoicing country-year, add the averages where available
df_all = pd.merge(currency_TS_all,temp,left_on=['ISO3C','year'],right_on=['iso3','year'],how='left')
df_all = df_all.drop(columns=['iso3'])  # duplicate of ISO3C after the merge

In [11]:
# Align key names and dtypes with the portfolio data, then merge the two sources
df_all = df_all.rename(columns={"ISO3C": "iso3"})

# [NICK] year as float to match the merge key; month rebuilt from Date and rows
# without a month removed
portfoliodata = portfoliodata.assign(
    year=portfoliodata['year'].astype(float),
    month=portfoliodata['Date'].dt.month,
)
portfoliodata = portfoliodata[portfoliodata['month'].notnull()]

# Parsimonious set of invoicing columns (for Julien): identifiers, then import
# shares, then export shares
# former selection: ['Country','year','ISO2','iso3','imfcode','Import USD_I','Import EUR_I','Import Home_I','Import USD','Import EUR','Import Home','SumShares','SumShares_I','SumShares_X','SumShares_X_I']
par_id_cols = ['Country', 'year', 'iso3', 'imfcode']
par_import_cols = ['Import USD_I', 'Import EUR_I', 'Import Home_I',
                   'Import USD', 'Import EUR', 'Import Home', 'Import Other']
par_export_cols = ['Export USD_I', 'Export EUR_I', 'Export Home_I',
                   'Export USD', 'Export EUR', 'Export Home', 'Export Other']
df_all_par = df_all[par_id_cols + par_import_cols + par_export_cols]

# Outer merge keeps country-years present in only one of the two sources
df_all_par = pd.merge(left=df_all_par, right=portfoliodata, how='outer', on=['iso3','year'])
df_all_par = df_all_par.sort_values(by=['iso3', 'year'])

In [12]:
# Note from an earlier version of this notebook (the columns referenced in the
# disabled code below are no longer built here):
# these countries don't have any invoicing data but do have shares:
# Brunei Darussalam, China, Hong Kong, Mexico, Philippines, Singapore, Slovakia, Vietnam
# keeping a list of these countries:
# shares_noinvoicing = df_all.loc[df_all['year'].isna()][['ISO2', 'Country', 'cnty_id', 'own_share_imports', 'own_share_exports']]
# shares_noinvoicing_countries = list(shares_noinvoicing['Country'].unique())

# From here on df_all refers to the merged invoicing + portfolio frame
df_all=df_all_par

# TODO: out of invoicing, portfoliodata, and ICIO:
# make list of countries in both portfoliodata and invoicing
# make list of countries in invoicing, portfoliodata, and ICIO

In [13]:
# Date-based euro indicator: True from a country's euro entry year onwards
cc = coco.CountryConverter()

# Entry dates come from Euro_Yield_Dates.csv: 'Date' is day/month/year and the
# first two characters of 'Code' are used as the ISO2 country code
euro = pd.read_csv(path_rawdata+'Euro_Yield_Dates.csv', encoding='latin-1')
euro['EntryDate'] = pd.to_datetime(euro['Date'], format='%d/%m/%Y').dt.year
iso2_codes = euro['Code'].str[:2].rename('iso2')
euro['iso3'] = cc.pandas_convert(series=iso2_codes, to='ISO3')
euro = euro.drop(columns=['Currency','Code','Date'])

df_all = pd.merge(df_all,euro,on='iso3',how='left')

# Countries without an entry date have EntryDate = NaN, so the comparison is False for them
df_all['Euro Indicator'] = df_all['year']>=df_all['EntryDate']


In [14]:
# Euro-area currency shares, step 1: GDP weights and weighted import shares per country

# Current-dollar GDP from the cleaned WDI file, non-missing observations only
wdiin = pd.read_pickle(path_cleandata+'WDI.pkl')
wdiin = wdiin.loc[wdiin['gdp_cur_dol'].notnull(), ['iso3','year','gdp_cur_dol']]

df_all['year'] = df_all['year'].astype('int')
df_all = pd.merge(left=df_all,right=wdiin,on=['iso3','year'],how='left')

# After 2019 each country keeps the indicator value of its first post-2019 row
post_2019 = df_all['year']>2019
df_all.loc[post_2019,'Euro Indicator'] = df_all[post_2019].groupby(['iso3'])['Euro Indicator'].transform('first')

def _share_of_group_total(x):
    """Each value divided by the sum of its group."""
    return x/x.sum()

# The SumShares-based variants of the steps below (notNanSS*, EU_GDP_SS*,
# importShares_sum*, exportShares_sum*) are retired; only the four import
# currency shares are aggregated.
import_currencies = ['USD', 'EUR', 'Home', 'Other']

# Availability flag for each import share
for cur in import_currencies:
    df_all['notNan'+cur] = df_all['Import '+cur].notnull()

# GDP share of each row within its (year, month, Euro Indicator, availability) group,
# so that weights are normalised over the countries that report the share
for cur in import_currencies:
    weight_keys = ['year','month','Euro Indicator','notNan'+cur]
    df_all.loc[:,'EU_GDP_'+cur] = df_all.groupby(weight_keys)['gdp_cur_dol'].transform(_share_of_group_total)

# GDP-weighted import shares; summed over euro members in the next cell
for cur in import_currencies:
    df_all['Import'+cur+'_sum'] = df_all['Import '+cur]*df_all['EU_GDP_'+cur]


In [16]:
#### N.B. Estonia is not populated with EU_GDP because its data is not monthly!!!! I am ok with that!
# Diagnostic: euro-member rows that have GDP but did not receive a USD GDP weight

euro_member = df_all['Euro Indicator']==True
has_gdp = df_all['gdp_cur_dol'].notnull()
no_usd_weight = df_all['EU_GDP_USD'].isnull()

df_all[euro_member & has_gdp & no_usd_weight]

Unnamed: 0,Country,year,iso3,imfcode,Import USD_I,Import EUR_I,Import Home_I,Import USD,Import EUR,Import Home,...,notNanHome,notNanOther,EU_GDP_USD,EU_GDP_EUR,EU_GDP_Home,EU_GDP_Other,ImportUSD_sum,ImportEUR_sum,ImportHome_sum,ImportOther_sum
15965,Estonia,2011,EST,939.0,21.459103,56.843098,0.0,21.459103,56.843098,0.0,...,True,True,,,,,,,,
15966,Estonia,2012,EST,939.0,19.372803,58.925003,0.0,19.372803,58.925003,0.0,...,True,True,,,,,,,,
15967,Estonia,2013,EST,939.0,15.905174,61.39603,0.0,15.905174,61.39603,0.0,...,True,True,,,,,,,,
15968,Estonia,2014,EST,939.0,17.901501,59.705952,0.0,17.901501,59.705952,0.0,...,True,True,,,,,,,,
15969,Estonia,2015,EST,939.0,17.182262,60.257442,0.0,,,,...,False,False,,,,,,,,
15970,Estonia,2016,EST,939.0,16.463024,60.808933,0.0,16.463024,60.808933,0.0,...,True,True,,,,,,,,
15971,Estonia,2017,EST,939.0,16.364794,61.11272,0.0,16.364794,61.11272,0.0,...,True,True,,,,,,,,
15972,Estonia,2018,EST,939.0,19.542723,58.792297,0.0,19.542723,58.792297,0.0,...,True,True,,,,,,,,
15973,Estonia,2019,EST,939.0,17.977434,59.836975,0.0,17.977434,59.836975,0.0,...,True,True,,,,,,,,
15974,Estonia,2020,EST,939.0,,,,,,,...,False,False,,,,,,,,


In [17]:

# Euro-area currency shares, step 2: sum the GDP-weighted shares across euro members
# and write them onto the rows of the euro-area aggregate (iso3 == 'EUR')

EUR = df_all[df_all['Euro Indicator']==True]

varslist = ['ImportUSD_sum','ImportEUR_sum','ImportHome_sum','ImportOther_sum','gdp_cur_dol']
# min_count=1: a year-month in which no member reports a share stays missing.
# A plain sum() returns 0 for an all-missing group, which would be stored as a
# genuine 0% share for the euro area.
EUR = EUR.groupby(['year','month'])[varslist].sum(min_count=1).reset_index()
EUR = EUR.rename(columns={'gdp_cur_dol':'EU_GDP_temp'})
df_all = df_all.drop(columns=varslist)

# Label the aggregate so the merge only matches rows of the 'EUR' entity
EUR['iso3']='EUR'

df_all = pd.merge(df_all,EUR,on=['iso3','year','month'],how='left')

# Copy the aggregated values into the regular share columns of the EUR rows
is_euro_area = df_all['iso3']=='EUR'
for share_col, sum_col in [('Import USD','ImportUSD_sum'),
                           ('Import EUR','ImportEUR_sum'),
                           ('Import Home','ImportHome_sum'),
                           ('Import Other','ImportOther_sum')]:
    df_all.loc[is_euro_area,share_col] = df_all.loc[is_euro_area,sum_col]

# Euro-area GDP is only written where a positive total is available
df_all.loc[is_euro_area&(df_all['EU_GDP_temp']>0),'gdp'] = df_all.loc[is_euro_area,'EU_GDP_temp']

# Remove the temporary aggregation columns
varslist = ['ImportUSD_sum','ImportEUR_sum','ImportHome_sum','ImportOther_sum','EU_GDP_temp']
df_all = df_all.drop(columns = varslist)




In [18]:
# Identifiers for the euro-area aggregate rows (iso3 == 'EUR')
euro_area_rows = df_all['iso3'] == 'EUR'
for id_col, id_value in [('ISO2', 'EU'), ('Country', 'EU'), ('imfcode', 160)]:
    df_all.loc[euro_area_rows, id_col] = id_value

In [21]:
# Rows without a month come from the outer merge and cannot be used: drop them before saving
df_all = df_all[df_all['month'].notnull()]

# Disabled: dropping of helper variables from the SumShares-based version
#df_all = df_all.drop(columns=['EntryDate', 'nanSumShares',
#       'notNanSS', 'notNanSS_I', 'notNanSS_X', 'notNanSS_X_I', 'EU_GDP_SS',
#       'EU_GDP_SS_I', 'EU_GDP_SS_X', 'EU_GDP_SS_X_I'])

# Write the final dataset as both pickle and csv under the same file stem
output_stem = path_cleandata+'FullData'+saveAppend
df_all.to_pickle(output_stem+'.pkl')
df_all.to_csv(output_stem+'.csv')

In [23]:
# Display the suffix that was used for the files saved above
saveAppend

'_currencyShareRaw'