In [6]:
import pandas as pd
import numpy as np
#save and load as parquet (for big files)
import pyarrow as pa
import pyarrow.parquet as pq

In [7]:
# Load the parquet file from the data directory one level above the notebook.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash-separated relative path only works on Windows.
trade_mx = pd.read_parquet('../data/trade_matrix_carbon_footprint_v4.parquet')

In [8]:
# Load the country-pair CSV from the data directory one level above the notebook.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash-separated relative path only works on Windows.
codes = pd.read_csv("../data/country_matrix_v4_final.csv")

In [9]:
# Preview the first five rows of trade_mx
trade_mx.head(n=5)

Unnamed: 0,Reporter Country Code,Partner Country Code,Item Code,Year,Value_tons,Flag,distance_in_km,same_continent,share_border,any_island_or_missing,transportation_method,food_miles,kgCO2eq_tkm
0,4,2,231,2005,3.0,A,5855.362822,0,0.0,0,water,17566.09,175.660885
1,3,3,56,2005,52.0,A,108.698297,1,1.0,0,land,5652.311,1130.462293
2,7,2,176,2022,25.06,A,7596.96399,0,0.0,0,water,190379.9,1903.799176
3,4,2,711,1996,171.0,A,5855.362822,0,0.0,0,water,1001267.0,10012.670425
4,4,2,711,1997,160.0,A,5855.362822,0,0.0,0,water,936858.1,9368.580515


In [10]:
codes.head()

Unnamed: 0,country_from,continent_from,country_to,Continent_to,distance_in_km,same_continent,share_border,any_island_or_missing,transportation_method,country_from_code,country_to_code,internal_distance
0,afghanistan,asia,afghanistan,asia,515.733111,1,1.0,0,land,2,2,515.733111
1,afghanistan,asia,albania,europe,4335.926901,0,0.0,0,water,2,3,
2,afghanistan,asia,algeria,africa,5855.362822,0,0.0,0,water,2,4,
3,afghanistan,asia,angola,africa,7596.96399,0,0.0,0,water,2,7,
4,afghanistan,asia,argentina,south america,15273.883835,0,0.0,0,water,2,9,


## Create key for merging

In [11]:
# Build a "<from code>-<to code>" string key on both frames so they can be joined on a single column
codes['key'] = codes['country_from_code'].astype(str).str.cat(
    codes['country_to_code'].astype(str), sep='-'
)
trade_mx['key'] = trade_mx['Reporter Country Code'].astype(str).str.cat(
    trade_mx['Partner Country Code'].astype(str), sep='-'
)

In [12]:
# Count missing values per column of trade_mx
trade_mx.isna().sum()

Reporter Country Code    0
Partner Country Code     0
Item Code                0
Year                     0
Value_tons               0
Flag                     0
distance_in_km           0
same_continent           0
share_border             0
any_island_or_missing    0
transportation_method    0
food_miles               0
kgCO2eq_tkm              0
key                      0
dtype: int64

In [13]:
# Attach the two continent columns from `codes` to every row of trade_mx via the shared `key`.
# how='left' keeps every trade_mx row; rows whose key is absent from `codes` get NaN continents.
# validate='many_to_one' makes pandas raise a MergeError if `key` is duplicated in `codes`,
# instead of silently duplicating trade rows (which would inflate any sums computed afterwards).
full_trade_mx = trade_mx.merge(
    codes[["continent_from", "Continent_to", "key"]],
    on='key',
    how='left',
    validate='many_to_one',
)

In [14]:
# Preview the first five rows of the merged frame
full_trade_mx.head(n=5)

Unnamed: 0,Reporter Country Code,Partner Country Code,Item Code,Year,Value_tons,Flag,distance_in_km,same_continent,share_border,any_island_or_missing,transportation_method,food_miles,kgCO2eq_tkm,key,continent_from,Continent_to
0,4,2,231,2005,3.0,A,5855.362822,0,0.0,0,water,17566.09,175.660885,4-2,africa,asia
1,3,3,56,2005,52.0,A,108.698297,1,1.0,0,land,5652.311,1130.462293,3-3,europe,europe
2,7,2,176,2022,25.06,A,7596.96399,0,0.0,0,water,190379.9,1903.799176,7-2,africa,asia
3,4,2,711,1996,171.0,A,5855.362822,0,0.0,0,water,1001267.0,10012.670425,4-2,africa,asia
4,4,2,711,1997,160.0,A,5855.362822,0,0.0,0,water,936858.1,9368.580515,4-2,africa,asia


In [20]:
# Keep only the columns needed for the continent-level aggregation
full_trade_mx = full_trade_mx.loc[:, ['continent_from', 'Continent_to', 'Year', 'kgCO2eq_tkm', 'Value_tons']]

# Sum kgCO2eq_tkm for every (continent_from, Continent_to, Year) combination;
# as_index=False keeps the grouping keys as ordinary columns
continent_mx = (
    full_trade_mx
    .groupby(['continent_from', 'Continent_to', 'Year'], as_index=False)['kgCO2eq_tkm']
    .sum()
)

In [21]:
# Rename the columns to the year/source/target/value names used by the sankey code.
# NOTE(review): continent_from is mapped to 'target' and Continent_to to 'source' —
# confirm this direction is the intended one.
continent_mx = continent_mx.rename(
    columns=dict(
        continent_from='target',
        Continent_to='source',
        Year='year',
        kgCO2eq_tkm='value',
    )
)

In [22]:
# Display the aggregated frame (one row per target/source/year combination)
continent_mx

Unnamed: 0,target,source,year,value
0,africa,africa,1986,1.534729e+08
1,africa,africa,1987,8.264991e+07
2,africa,africa,1988,1.004438e+08
3,africa,africa,1989,1.144096e+08
4,africa,africa,1990,1.249245e+08
...,...,...,...,...
1327,south america,south america,2018,1.766014e+10
1328,south america,south america,2019,2.060040e+10
1329,south america,south america,2020,1.940492e+10
1330,south america,south america,2021,2.097569e+10


In [23]:
# Put the columns in year, source, target, value order
continent_mx = continent_mx.loc[:, ['year', 'source', 'target', 'value']]

In [27]:
# Write the aggregated frame to the data directory without the row index.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash-separated relative path only works on Windows.
continent_mx.to_csv("../data/continent_trade_matrix_CO2eq.csv", index=False)

In [28]:
# Count how many rows each source value contributes
continent_mx.loc[:, 'source'].value_counts()

source
africa           222
asia             222
europe           222
north america    222
oceania          222
south america    222
Name: count, dtype: int64