In [4]:
from warnings import filterwarnings

filterwarnings('ignore')

In [18]:
from copy import deepcopy
import pandas as pd
import numpy as np

In [19]:
pd.set_option('display.max_columns', None)

In [20]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [21]:
trade_df = pd.read_csv('data/dyadic_trade_3.0.csv', encoding = 'utf8')

trade_df.rename({'flow1': 'money_flow_1', 'flow2': 'money_flow_2',
         'ccode1': 'c_code_1', 'ccode2': 'c_code_2',
         'importer1': 'state_name_1', 'importer2': 'state_name_2'},
        axis = 1, inplace = True)

trade_df['c_code_1'] = trade_df['c_code_1'].astype(int)
trade_df['c_code_2'] = trade_df['c_code_2'].astype(int)

trade_df['money_flow_1'] = ([s * 1000000 for s in trade_df['money_flow_1']])
trade_df['money_flow_2'] = ([s * 1000000 for s in trade_df['money_flow_2']])

trade_df.loc[trade_df['money_flow_1'] == -9000000, 'money_flow_1'] = None
trade_df.loc[trade_df['money_flow_2'] == -9000000, 'money_flow_2'] = None

trade_df = trade_df.sort_values(by = 'year', ascending = True).reset_index()

trade_df.drop(['source1', 'source2', 'bel_lux_alt_flow1', 'bel_lux_alt_flow2', 'china_alt_flow1', 'china_alt_flow2', 'version', 'index'], axis = 1, inplace = True)

In [22]:
trade_df_copy = deepcopy(trade_df)

trade_df_copy.rename({'c_code_1': 'c_code_1_copy', 'c_code_2':'c_code_2_copy',
         'state_name_1': 'state_name_1_copy', 'state_name_2': 'state_name_2_copy',
         'money_flow_1': 'money_flow_1_copy', 'money_flow_2': 'money_flow_2_copy'},
        axis = 1, inplace = True)

trade_df_copy.rename({'c_code_1_copy': 'c_code_2', 'c_code_2_copy':'c_code_1',
         'state_name_1_copy': 'state_name_2', 'state_name_2_copy': 'state_name_1',
         'money_flow_1_copy': 'money_flow_2', 'money_flow_2_copy': 'money_flow_1'},
        axis = 1, inplace = True)

In [23]:
trade_df = pd.concat([trade_df, trade_df_copy], axis = 0)

trade_df = trade_df.sort_values(by = 'year', ascending = True).reset_index()

trade_df.drop(['index'], axis = 1, inplace = True)

In [24]:
trade_df.head()

Unnamed: 0,c_code_1,c_code_2,money_flow_1,money_flow_2,state_name_1,state_name_2,year
0,210,390,,,Netherlands,Denmark,1870
1,255,160,190000.0,,Germany,Argentina,1870
2,200,70,1450000.0,5110000.0,United Kingdom,Mexico,1870
3,600,255,,,Morocco,Germany,1870
4,350,135,,,Greece,Peru,1870


In [25]:
trade_df.to_pickle('pickle/trade_df.pkl')
trade_df.to_csv('dataframe_exports/trade_df.csv')

In [26]:
trade_combo_df = deepcopy(trade_df[trade_df['year'] >= 1946][['c_code_1', 'c_code_2']])

duplicate_list = ['c_code_1', 'c_code_2']

trade_combo_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)

In [27]:
trade_combo_df = trade_combo_df.reset_index()
trade_combo_df.drop(['index'], axis = 1, inplace = True)

In [28]:
len(trade_combo_df)

39630

In [29]:
for i, code in enumerate(trade_combo_df['c_code_1']):
    trade_combo_df.loc[i, 'trade_combo'] = str(trade_combo_df['c_code_1'][i]) + ' ' + str(trade_combo_df['c_code_2'][i])

KeyboardInterrupt: 

In [None]:
trade_combo_df.head()

In [28]:
column_list = ['money_flow_1', 'money_flow_2']
cubic_list = ['money_flow_1', 'money_flow_2']

In [29]:
trade_net_df = deepcopy(trade_df[trade_df['year'] >= 1946])

for c_code in trade_combo_df['trade_combo'].unique():
    for column in column_list:
        data_list = list(trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].values)
        if str(data_list).count('nan') == len(data_list):
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = 0
        elif column not in cubic_list and str(data_list).count('nan') <= len(data_list) - 2:
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].interpolate()
        elif column in cubic_list and str(data_list).count('nan') <= len(data_list) - 2:
            try:
                trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].interpolate(method = 'cubic')
            except:
                trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].bfill()
                trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].ffill()
        data_list = list(trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].values)
        if str(data_list).count('nan') <= len(data_list) - 1:
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].bfill()            
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].ffill()            

NameError: name 'trade_net_df' is not defined

In [30]:
trade_net_df.head()

Unnamed: 0,c_code_1,c_code_2,money_flow_1,money_flow_2,state_name_1,state_name_2,year
176748,230,355,0.0,0.0,Spain,Bulgaria,1946
176749,645,678,0.0,0.0,Iraq,Yemen Arab Republic,1946
176750,70,920,0.0,0.0,Mexico,New Zealand,1946
176751,160,385,5800000.0,12800000.0,Argentina,Norway,1946
176752,2,145,59000000.0,37070000.0,United States of America,Bolivia,1946


In [31]:
trade_net_df.to_pickle('pickle/trade_net_df.pkl')
trade_net_df.to_csv('dataframe_exports/trade_net_df.csv')

In [30]:
trade_net_df = pd.read_pickle('pickle/trade_net_df.pkl')

In [31]:
trade_net_df.loc[(trade_net_df['money_flow_1'] < 0) | (trade_net_df['money_flow_2'] < 0), ['money_flow_1', 'money_flow_2']] = None

In [32]:
column_list = ['money_flow_1', 'money_flow_2']
cubic_list = ['money_flow_1', 'money_flow_2']

In [34]:
# trade_net_df = deepcopy(trade_df[trade_df['year'] >= 1946])

for c_code in trade_combo_df['trade_combo'].unique():
    for column in column_list:
        data_list = list(trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].values)
        if str(data_list).count('nan') == len(data_list):
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = 0
        elif column not in cubic_list and str(data_list).count('nan') <= len(data_list) - 2:
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].interpolate()
        elif column in cubic_list and str(data_list).count('nan') <= len(data_list) - 2:
            try:
                trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].interpolate(method = 'cubic')
            except:
                trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].bfill()
                trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].ffill()
        data_list = list(trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].values)
        if str(data_list).count('nan') <= len(data_list) - 1:
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].bfill()            
            trade_net_df.loc[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1])), column] = trade_net_df[(trade_net_df['c_code_1'] == int(c_code.split()[0])) & (trade_net_df['c_code_2'] == int(c_code.split()[1]))][column].ffill()            

AttributeError: 'float' object has no attribute 'split'

In [None]:
trade_net_df.loc[(trade_net_df['money_flow_1'] < 0) | (trade_net_df['money_flow_2'] < 0), ['money_flow_1', 'money_flow_2']]

In [None]:
aggregations = {
  'c_code_2': 'count',
  'money_flow_1': 'sum',
  'money_flow_2': 'sum'
  }

trade_df_group = trade_df.groupby(['c_code_1', 'state_name_1', 'year']).agg(aggregations).reset_index()

trade_df_group.rename({'c_code_2': 'num_trade_states', 'money_flow_1': 'export_dollars', 'money_flow_2': 'import_dollars'}, axis = 1, inplace = True)

In [None]:
trade_df_group.to_pickle('pickle/trade_df_group.pkl')
trade_df_group.to_csv('dataframe_exports/trade_df_group.csv')

In [None]:
trade_df_group.head()