# Japan e-Stat exports data

Source: [e-Stat](https://www.e-stat.go.jp/en)

In [1]:
import os

import numpy as np
import pandas as pd

## 1. Import and join data

In [2]:
# Directory holding the raw e-Stat CSV downloads.
path = 'data/raw/jp/'

# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted before slicing; otherwise the positional slice below could
# select a different set of files from one machine to the next.
# NOTE(review): the [1:17] slice presumably skips a non-trade CSV that sorts
# first and keeps the 16 trade files - confirm against the directory contents.
files = [
    f'{path}{filename}'
    for filename in sorted(os.listdir(path))
    if filename.endswith('.csv')
][1:17]

files

['data/raw/jp/ik-101h2017e001.csv',
 'data/raw/jp/ik-101h2017e002.csv',
 'data/raw/jp/ik-101h2017e003.csv',
 'data/raw/jp/ik-101h2017e004.csv',
 'data/raw/jp/ik-101h2017e005.csv',
 'data/raw/jp/ik-101h2017e006.csv',
 'data/raw/jp/ik-101h2017e007.csv',
 'data/raw/jp/ik-101h2017e008.csv',
 'data/raw/jp/ik-101h2018e001.csv',
 'data/raw/jp/ik-101h2018e002.csv',
 'data/raw/jp/ik-101h2018e003.csv',
 'data/raw/jp/ik-101h2018e004.csv',
 'data/raw/jp/ik-101h2018e005.csv',
 'data/raw/jp/ik-101h2018e006.csv',
 'data/raw/jp/ik-101h2018e007.csv',
 'data/raw/jp/ik-101h2018e008.csv']

In [3]:
# Lookup table of country codes and country names used by the trade files.
df_country_names = pd.read_csv(
    filepath_or_buffer='data/raw/jp/estat-jp-country-codes.csv'
)

print(df_country_names.shape[0])

df_country_names.head()

232


Unnamed: 0,Code,CountryName
0,103,Republic of Korea
1,104,North Korea
2,105,People's Republic of China
3,106,Taiwan
4,107,Mongolia


In [4]:
# Reference list of country names and their two-letter ISO codes.
df_isos = (
    pd.read_excel(
        'data/raw/Comtrade Country Code and ISO list.xlsx',
        usecols=[1, 4],
        # Keep the literal string 'NA' (Namibia's ISO code) instead of
        # letting pandas read it as a missing value.
        keep_default_na=False,
    )
    .rename(columns={
        'Country Name, Full ': 'country_name',
        'ISO2-digit Alpha': 'iso2',
    })
    .drop_duplicates(subset='iso2')
    .reset_index(drop=True)
)

print(df_isos.shape[0])

df_isos.head()

251


Unnamed: 0,country_name,iso2
0,Afghanistan,AF
1,"Africa CAMEU region, not elsewhere specified",
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [5]:
# Stack every trade CSV, then attach the country name (via the numeric country
# code) and the ISO code (via the country name).
df = (
    pd.concat(
        [
            pd.read_csv(csv_path, usecols=[1, 2, 3, 5, *range(9, 45)])
            for csv_path in files
        ],
        ignore_index=True,
    )
    .merge(df_country_names, how='left', left_on='Country', right_on='Code')
    .rename(columns={'CountryName': 'country_name'})
    .merge(df_isos, how='left', on='country_name')
    .drop(columns=['Country', 'Code'])
)

print(df.shape[0])

df.head()

265678


Unnamed: 0,Year,HS,Unit2,Quantity1-Jan,Quantity2-Jan,Value-Jan,Quantity1-Feb,Quantity2-Feb,Value-Feb,Quantity1-Mar,...,Quantity2-Oct,Value-Oct,Quantity1-Nov,Quantity2-Nov,Value-Nov,Quantity1-Dec,Quantity2-Dec,Value-Dec,country_name,iso2
0,2017,'000000190',KG,0,46399374,20539899,0,8847025,18768662,0,...,29760607,21598642,0,64994657,22480562,0,8410100,23084883,Republic of Korea,KR
1,2017,'010121000',NO,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Republic of Korea,KR
2,2017,'010129000',NO,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Republic of Korea,KR
3,2017,'010611000',NO,0,0,0,0,11,9510,0,...,0,0,0,0,0,0,0,0,Republic of Korea,KR
4,2017,'010612000',NO,0,0,0,0,2,9090,0,...,0,0,0,0,0,0,1,954,Republic of Korea,KR


## 2. Reshape data to long format and fill missing ISO codes

In [6]:
# The twelve monthly 'Quantity2' column labels, January through December.
value_cols = [
    f'Quantity2-{month}'
    for month in (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    )
]

In [7]:
# Keep only rows whose HS code starts with 3915 or 4707 (the codes in the
# file carry a leading apostrophe), then unpivot the monthly quantity columns
# into one row per month.
df_melted = (
    df
    .loc[lambda x: (
        x['HS'].str.startswith("'3915")
        | x['HS'].str.startswith("'4707")
    )]
    .melt(
        id_vars=['Year', 'country_name', 'iso2', 'HS'],
        value_vars=value_cols,
    )
)

print(df_melted.shape[0])

df_melted.head()

3996


Unnamed: 0,Year,country_name,iso2,HS,variable,value
0,2017,Republic of Korea,KR,'391510000',Quantity2-Jan,53550
1,2017,Republic of Korea,KR,'391520000',Quantity2-Jan,24800
2,2017,Republic of Korea,KR,'391530000',Quantity2-Jan,1016985
3,2017,Republic of Korea,KR,'391590110',Quantity2-Jan,263821
4,2017,Republic of Korea,KR,'391590190',Quantity2-Jan,401333


In [8]:
# ISO codes assigned by hand, keyed by the country name used in the data.
iso2_by_country = {
    "People's Republic of China": 'CN',
    'Taiwan': 'TW',
    'Hong Kong': 'HK',
    'Viet Nam': 'VN',
    'Laos': 'LA',
    'Pakistan': 'PK',
    'Macao': 'MO',
    'Germany': 'DE',
    'Switzerland': 'CH',
    'Russia': 'RU',
    'Czech Republic': 'CZ',
    'United States of America': 'US',
    'Tanzania': 'TZ',
    'South Africa': 'ZA',
    'France': 'FR',
    'Republic of Congo': 'CG',
}

# One boolean mask per country, in the same order as the codes below.
conditions = [
    df_melted['country_name'] == name
    for name in iso2_by_country
]

missing_isos = list(iso2_by_country.values())

# Rows matching one of the names get the hand-assigned code; every other row
# keeps its existing iso2 value.
df_melted = df_melted.assign(
    iso2=lambda x: np.select(conditions, missing_isos, default=x['iso2'])
)

In [9]:
# Inspect the column dtypes of the melted frame.
df_melted.dtypes

Year             int64
country_name    object
iso2            object
HS              object
variable        object
value            int64
dtype: object

In [10]:
# Per column: does it contain any missing value?
df_melted.isna().any()

Year            False
country_name    False
iso2            False
HS              False
variable        False
value           False
dtype: bool

## 3. Filter by plastics/paper and assign time period

In [11]:
def set_time(row):
    """Build the period timestamp for one melted row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'variable' (a label whose last three characters are a
        month abbreviation, e.g. 'Quantity2-Jan') and 'Year'.

    Returns
    -------
    pandas.Timestamp
        The parsed '<year>-<month abbreviation>' date; with no day given it
        falls on the first of the month.
    """
    month = str(row['variable'])[-3:]
    year = str(row['Year'])
    time_str = f'{year}-{month}'

    # Pass the format explicitly so parsing does not depend on pandas having
    # to guess the layout of a year + month-name string.
    return pd.to_datetime(time_str, format='%Y-%b')

In [12]:
# Monthly totals per destination for HS codes starting with 3915; the
# melted 'value' column is carried through as 'kg'.
df_plastics = (
    df_melted
    .loc[lambda x: x['HS'].str.startswith("'3915")]
    .assign(period=lambda x: x.apply(set_time, axis=1))
    .rename(columns={'value': 'kg'})
    .drop(columns=['Year', 'HS', 'variable'])
    .groupby(['country_name', 'iso2', 'period'], as_index=False)
    .sum()
    .sort_values(['country_name', 'period'])
    .assign(exporter='JP')
)

print(df_plastics.shape[0])

df_plastics.head()

888


Unnamed: 0,country_name,iso2,period,kg,exporter
0,Australia,AU,2017-01-01,0,JP
1,Australia,AU,2017-02-01,0,JP
2,Australia,AU,2017-03-01,0,JP
3,Australia,AU,2017-04-01,0,JP
4,Australia,AU,2017-05-01,200,JP


In [13]:
# Monthly totals per destination for HS codes starting with 4707.
df_paper = (
    df_melted
    .loc[lambda x: x['HS'].str.startswith("'4707")]
    .assign(
        period=lambda x: x.apply(set_time, axis=1),
        # The reported quantity is multiplied by 1000 to give 'kg'
        # (presumably the source unit is metric tons - confirm).
        kg=lambda x: x['value'] * 1000,
    )
    .drop(columns=['value', 'Year', 'HS', 'variable'])
    .groupby(['country_name', 'iso2', 'period'], as_index=False)
    .sum()
    .sort_values(['country_name', 'period'])
    .assign(exporter='JP')
)

print(df_paper.shape[0])

df_paper.head()

408


Unnamed: 0,country_name,iso2,period,kg,exporter
0,Bangladesh,BD,2017-01-01,0,JP
1,Bangladesh,BD,2017-02-01,0,JP
2,Bangladesh,BD,2017-03-01,0,JP
3,Bangladesh,BD,2017-04-01,0,JP
4,Bangladesh,BD,2017-05-01,0,JP


## 4. Write monthly data to CSV

In [14]:
# Write both monthly tables with the same column order and no index column.
monthly_outputs = {
    'data/processed/jp-estat/jp-plastics-exports-monthly.csv': df_plastics,
    'data/processed/jp-estat/jp-paper-exports-monthly.csv': df_paper,
}

for csv_path, frame in monthly_outputs.items():
    frame[['period', 'iso2', 'country_name', 'exporter', 'kg']].to_csv(
        csv_path,
        index=False
    )

## 5. Compare H1 2017 to H1 2018

In [15]:
# First-half (Jan-Jun) plastics totals per destination for 2017 vs 2018,
# with the relative change and each destination's share of the half-year total.
df_plastics_h1 = (
    df_plastics
    .loc[lambda x: (
        x['period'].dt.year.isin([2017, 2018])
        & x['period'].dt.month.isin([1, 2, 3, 4, 5, 6])
    )]
    # Pivot on the year of the filtered rows themselves.
    .pipe(lambda x: x.pivot_table(
        values='kg',
        index='country_name',
        columns=x['period'].dt.year,
        aggfunc='sum',
    ))
    .assign(
        pct_change=lambda x: (x[2018] - x[2017]) / x[2017],
        h1_2017_proportion=lambda x: x[2017] / x[2017].sum(),
        h1_2018_proportion=lambda x: x[2018] / x[2018].sum(),
    )
    # Difference between the 2018 and 2017 shares.
    .assign(
        pct_change_proportional=lambda x:
            x['h1_2018_proportion'] - x['h1_2017_proportion']
    )
    .sort_values('pct_change_proportional')
)

df_plastics_h1.head()

period,2017,2018,pct_change,h1_2017_proportion,h1_2018_proportion,pct_change_proportional
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
People's Republic of China,429530897.0,16927768.0,-0.96059,0.565924,0.031641,-0.534283
Hong Kong,196987093.0,21682627.0,-0.889929,0.259538,0.040528,-0.21901
Macao,425454.0,0.0,-1.0,0.000561,0.0,-0.000561
Sri Lanka,110919.0,33685.0,-0.69631,0.000146,6.3e-05,-8.3e-05
Czech Republic,76518.0,60228.0,-0.212891,0.000101,0.000113,1.2e-05


In [16]:
# First-half (Jan-Jun) paper totals per destination for 2017 vs 2018,
# with the relative change and each destination's share of the half-year total.
df_paper_h1 = (
    df_paper
    .loc[lambda x: (
        x['period'].dt.year.isin([2017, 2018])
        & x['period'].dt.month.isin([1, 2, 3, 4, 5, 6])
    )]
    # Pivot on the year of the filtered rows themselves.
    .pipe(lambda x: x.pivot_table(
        values='kg',
        index='country_name',
        columns=x['period'].dt.year,
        aggfunc='sum',
    ))
    .assign(
        pct_change=lambda x: (x[2018] - x[2017]) / x[2017],
        h1_2017_proportion=lambda x: x[2017] / x[2017].sum(),
        h1_2018_proportion=lambda x: x[2018] / x[2018].sum(),
    )
    # Difference between the 2018 and 2017 shares.
    .assign(
        pct_change_proportional=lambda x:
            x['h1_2018_proportion'] - x['h1_2017_proportion']
    )
    .sort_values('pct_change_proportional')
)

df_paper_h1.head()

period,2017,2018,pct_change,h1_2017_proportion,h1_2018_proportion,pct_change_proportional
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
People's Republic of China,1576347000.0,1193055000.0,-0.243152,0.731361,0.634262,-0.097099
Taiwan,173785000.0,139577000.0,-0.196841,0.080629,0.074203,-0.006426
Bangladesh,0.0,82000.0,inf,0.0,4.4e-05,4.4e-05
Pakistan,104000.0,255000.0,1.451923,4.8e-05,0.000136,8.7e-05
Ghana,0.0,445000.0,inf,0.0,0.000237,0.000237


## 6. Write H1 comparison data to CSV

In [17]:
# df_plastics_h1.to_csv('data/processed/jp-estat/jp-plastics-exports-h1.csv')

# df_paper_h1.to_csv('data/processed/jp-estat/jp-paper-exports-h1.csv')