In [1]:
# import DOTS data from csv file
import pandas as pd
from datetime import datetime
import country_converter as coco
import os
from dotenv import dotenv_values, find_dotenv

# Locate the project's .env configuration file and read it as a dictionary.
config = dotenv_values(find_dotenv())

# Directory prefixes for the cleaned and raw data, taken from the CLEANDATA and
# RAWDATA configuration keys. Each ends with a path separator so file names
# can be appended directly. os.sep is used instead of a literal backslash so
# the prefixes are also valid on non-Windows systems (on Windows it is '\\').
path_cleandata = os.path.abspath(config["CLEANDATA"]) + os.sep
path_rawdata = os.path.abspath(config["RAWDATA"]) + os.sep

# Shared converter instance, used to translate country codes.
cc = coco.CountryConverter()

# Load the raw DOTS extract from csv.
dots = pd.read_csv(path_rawdata+'IMF_DOTS.csv',low_memory=False)

# Work on an independent copy restricted to the columns used downstream, so
# the column assignments below never write into `dots`.
alldots = dots[['Country Code', 'Counterpart Country Code',
                'Indicator Code', 'Time Period', 'Value']].copy()

# Short column names used throughout the rest of the script.
alldots.columns = ['CountryCode','PartnerCode','type','year','value']

# Store the indicator as a categorical and give the indicator codes listed
# here readable labels; any other code keeps its original label.
alldots['type'] = alldots['type'].astype('category').cat.rename_categories(
    {'TMG_CIF_USD': 'imports_cif', 'TBG_USD': 'tradebalance',
     'TXG_FOB_USD': 'exports_fob', 'TMG_FOB_USD': 'imports_fob'})

# Drop rows with any missing field. dropna() returns a new frame, so the
# result has to be assigned back for the rows to actually be removed.
alldots = alldots.dropna()

# Load the lookup table holding the IMF country code in the 'IMF' column and
# the ISO3 country code in the 'iso3' column; only those two columns are kept.
country_code_mapping = pd.read_csv(path_rawdata+'Country_iso_codes.csv')
country_code_mapping = country_code_mapping[['iso3','IMF']]

# IMF code -> ISO3 code lookup. Deliberately not bound to the name `dict`,
# which would shadow the builtin for the rest of the module.
imf_to_iso3 = country_code_mapping.set_index('IMF')['iso3'].to_dict()

# Replace IMF country codes in the CountryCode and PartnerCode columns by the
# corresponding ISO3 codes. Codes missing from the lookup become NaN.
alldots['CountryCode'] = alldots['CountryCode'].map(imf_to_iso3)
alldots['PartnerCode'] = alldots['PartnerCode'].map(imf_to_iso3)

# Rename the country code columns: CountryCode is the origin (o) side and
# PartnerCode the destination (d) side of each flow.
alldots = alldots.rename(columns={'CountryCode': 'iso3_o', 'PartnerCode': 'iso3_d'})

# Drop rows with missing values, including those whose country codes could
# not be mapped to ISO3 above.
dotssub = alldots[['iso3_o','iso3_d','type','year','value']].dropna()

# Create euro area aggregate for DOTS corresponding to WDI's EMU.
# Step one: work out the year in which each country adopted the euro.
euro = pd.read_csv(path_rawdata+'Euro_Yield_Dates.csv', encoding='latin-1')

# 'Date' is parsed as day/month/year; only the year component is kept.
entry_dates = pd.to_datetime(euro['Date'], format='%d/%m/%Y')
euro['EntryDate'] = pd.DatetimeIndex(entry_dates).year

# The first two characters of 'Code' are passed to the converter as the
# two-letter country code and translated to ISO3.
euro['iso3'] = cc.pandas_convert(series=euro['Code'].str[:2], to='ISO3')

# Only the country code and the entry year are needed from here on.
euro = euro[['iso3','EntryDate']]

# Flag, for every flow, whether its origin (o) and its destination (d) had
# adopted the euro by the flow's year.

# Attach the euro entry year of the origin country; countries that are not in
# the euro table get NaN. No sorting is needed beforehand: merge matches rows
# on key values, not on row order.
# NOTE(review): this assumes `euro` holds one row per country. Duplicate
# countries would duplicate flows here and inflate the sums computed from
# this frame -- confirm against the csv.
tmpeuro = euro.rename(columns={'iso3': 'iso3_o', 'EntryDate': 'EntryDate_o'})
tmpdots = pd.merge(left=dotssub, right=tmpeuro, on='iso3_o', how='left')

# Same for the destination country.
tmpeuro = euro.rename(columns={'iso3': 'iso3_d', 'EntryDate': 'EntryDate_d'})
tmpdots = pd.merge(left=tmpdots, right=tmpeuro, on='iso3_d', how='left')

# Keep only flows with both country codes present. The explicit copy makes the
# column assignments below operate on an independent frame.
has_both_codes = tmpdots['iso3_d'].notna() & tmpdots['iso3_o'].notna()
tmpdots = tmpdots[has_both_codes].copy()

# True when the country had entered the euro in or before the flow's year.
# A comparison against a missing entry year evaluates to False, so countries
# without an entry year are flagged as outside the euro with no further fill.
tmpdots['ineuro_o'] = tmpdots['EntryDate_o'] <= tmpdots['year']
tmpdots['ineuro_d'] = tmpdots['EntryDate_d'] <= tmpdots['year']

# Make tmpdots the full dataframe with in euro dummy for o and d countries
tmpdots = tmpdots[['iso3_o', 'iso3_d', 'year', 'ineuro_o', 'ineuro_d', 'type', 'value']]

# Build aggregate flows between the euro area (coded 'EUR') and countries
# outside it. Only flows with exactly one side inside the euro area in the
# given year are summed, so trade between two euro members is not part of
# the EUR totals.

# Flows whose destination is NOT in the euro in that year ...
noeuro_d = tmpdots.loc[~tmpdots['ineuro_d']]
# ... summed over all origins that ARE in the euro, giving one EUR-origin row
# per (destination, type, year).
# observed=True matters because 'type' is a categorical column: on pandas
# versions where observed defaults to False, groupby emits a row for every
# category in every group and reports a sum of 0 where no data exists at all,
# which would show up as fabricated zero trade flows.
euro_o = noeuro_d.loc[noeuro_d['ineuro_o']]\
        .groupby(['iso3_d','type','year'], as_index=False, observed=True)['value'].sum()
euro_o['iso3_o']='EUR'
euro_o = euro_o[['iso3_o','iso3_d','type','year','value']]

# Mirror image: flows whose origin is NOT in the euro in that year, summed
# over all destinations that ARE in the euro, giving one EUR-destination row
# per (origin, type, year).
noeuro_o = tmpdots.loc[~tmpdots['ineuro_o']]
euro_d = noeuro_o.loc[noeuro_o['ineuro_d']]\
        .groupby(['iso3_o','type','year'], as_index=False, observed=True)['value'].sum()
euro_d['iso3_d']='EUR'
euro_d = euro_d[['iso3_o','iso3_d','type','year','value']]

# Combine two dataframes to create new dataframe that has all trade flows between
# euro zone and non-euro zone countries
euro = pd.concat([euro_o, euro_d])

# Append the EUR aggregate rows to the country-level flows. No sorting is done
# here: sort_values() returns a new frame rather than sorting in place, and
# concat does not depend on row order.
alldots = pd.concat([dotssub, euro])

# NOTE on sorting: DataFrame.sort_values() returns a sorted copy and leaves
# the frame it is called on untouched, so it only has an effect when its
# result is assigned. The pivot and merge below match on values, not on row
# order, so the only sort that matters is the final one on `tmp`.
# TODO(review): confirm with Nick that nothing else relied on the R setkey()
# calls beyond row ordering.

# Drop trade balance data
alldots = alldots[alldots['type'].isin(['imports_cif','imports_fob','exports_fob'])]
# Reshape from long to wide format: one row per (origin, destination, year)
# and one column per remaining flow type. pivot_table averages duplicate
# entries (its default aggregation). observed=True stops the categorical
# 'type' column from expanding the grouping to category combinations that
# have no data.
dotswide = alldots.pivot_table(index=['iso3_o','iso3_d','year'],columns='type',values='value',
                               observed=True).reset_index()\
            .rename_axis(None, axis=1)

# Make clean bilateral dataset
flow_columns = ['iso3_o','iso3_d','year','imports_cif','exports_fob']
# tmp1: the flows as recorded for the origin country o
# (presumably o is the reporting country in DOTS -- confirm).
tmp1 = dotswide[flow_columns].rename(
    columns={'imports_cif': 'imports_o', 'exports_fob': 'exports_o'})
# tmp2: the same rows with the two country roles swapped, so that after the
# merge each (o, d, year) row also carries the flows recorded for d.
tmp2 = dotswide[flow_columns].rename(
    columns={'iso3_o': 'iso3_d','iso3_d': 'iso3_o','imports_cif': 'imports_d', 'exports_fob': 'exports_d'})

# Outer merge keeps pairs for which only one of the two sides has data.
tmp = pd.merge(left=tmp1, right=tmp2, on=['iso3_o','iso3_d','year'], how='outer')

# Assign the sorted frame back so the ordering actually takes effect.
tmp = tmp.sort_values(by=['iso3_o','iso3_d','year'])

# Replace missing values for exports_o with imports_d (etc) and vice versa.
# fillna() returns a new Series and does not modify the column it is called
# on, so each result is assigned back; otherwise nothing would be filled.
# The fills run in this order on purpose: the import columns are filled from
# the export columns after those have themselves been filled.
tmp['exports_o'] = tmp['exports_o'].fillna(tmp['imports_d'])
tmp['exports_d'] = tmp['exports_d'].fillna(tmp['imports_o'])
tmp['imports_o'] = tmp['imports_o'].fillna(tmp['exports_d'])
tmp['imports_d'] = tmp['imports_d'].fillna(tmp['exports_o'])

# Save
tmp.to_pickle(path_cleandata+'IMF_DOTS.pkl')