# Competitiveness Analysis (by Country and Product)

## Part 1: Loading Data

In [1]:
import os
import pandas as pd

path = '//172.20.23.190/ds/Raw Data/2016大數爬蟲案/data/ITC HS4/all/'
# os.listdir returns entries in arbitrary order; sorting keeps the order in which
# the files are loaded (and therefore the row order of the combined table) the
# same from one run to the next.
files = pd.Series(sorted(os.listdir(path)))
# Filter for import data: keep only file names containing '_I'
files = files[files.str.contains('_I')]

In [2]:
from functools import reduce

# Explicit dtypes: the three identifier columns are read as text, and every
# yearly value column ('Value in 2001' ... 'Value in 2015') as float.
csv_dtypes = {name: 'object' for name in ('Country', 'Product Code', 'Partner')}
csv_dtypes.update({'Value in %d' % year: 'float' for year in range(2001, 2016)})

# Read every selected file and concatenate them in a single call.  Concatenating
# pairwise (accumulator + next file) would copy the growing table once per file.
df = pd.concat([pd.read_csv(path + f, index_col=0, dtype=csv_dtypes) for f in files],
               axis=0, ignore_index=True)

In [3]:
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly rather than wrapped in print() (which would also print "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62410872 entries, 0 to 62410871
Data columns (total 18 columns):
Country          object
Product Code     object
Partner          object
Value in 2001    float64
Value in 2002    float64
Value in 2003    float64
Value in 2004    float64
Value in 2005    float64
Value in 2006    float64
Value in 2007    float64
Value in 2008    float64
Value in 2009    float64
Value in 2010    float64
Value in 2011    float64
Value in 2012    float64
Value in 2013    float64
Value in 2014    float64
Value in 2015    float64
dtypes: float64(15), object(3)
memory usage: 8.4+ GB
None


## Part 2: Preprocessing

In [4]:
# Remove the leading single quote (') in product code column.  The .str slice
# drops the first character of every code and leaves missing values as NaN
# instead of raising on them.
df['Product Code'] = df['Product Code'].str[1:]
# Rows to keep, evaluated in one pass:
#   - not the commodities sum ('TOTAL')
#   - Taiwan ('Taipei, Chinese') is not the importer
#   - partner is not the aggregate 'All'
keep = ((df['Product Code'] != 'TOTAL')
        & (df['Country'] != 'Taipei, Chinese')
        & (df['Partner'] != 'All'))
# Select only the identifier columns and the values for 2012 to 2015, by name so
# the selection does not depend on the position of the columns in the file.
wanted = ['Country', 'Product Code', 'Partner'] + \
         ['Value in %d' % year for year in range(2012, 2016)]
df = df.loc[keep, wanted]
df.columns = ['country', 'product', 'partner', 'val12', 'val13', 'val14', 'val15']
# Compute growth rates
def growthRate(data, start_year, end_year):
    """Return the percentage change from ``val<start_year>`` to ``val<end_year>``.

    Parameters
    ----------
    data : mapping of column name to values
        Must provide the keys ``'val' + str(start_year)`` and
        ``'val' + str(end_year)``; the values must support ``-``, ``/`` and ``*``.
    start_year, end_year : int or str
        Suffixes of the base column and of the compared column.

    Returns
    -------
    ``(end - start) / start * 100``, evaluated element-wise when the columns are
    array-like.  A zero base value is not special-cased.
    """
    base = data['val' + str(start_year)]
    latest = data['val' + str(end_year)]
    return (latest - base) / base * 100
# Year-over-year growth (in %) for 2013, 2014 and 2015, stored as g13/g14/g15
for prev_year, year in ((12, 13), (13, 14), (14, 15)):
    df['g' + str(year)] = growthRate(df, prev_year, year)

In [5]:
# Remove HS6 rows: keep only product codes that are exactly 4 characters long.
# .str.len() is vectorised and yields NaN for a missing code (so such a row is
# dropped) instead of raising like len() would.
df = df[df['product'].str.len() == 4]

In [6]:
# Compute total imports for all (country, product) pairs: sum the yearly values
# over every partner, then derive the growth rates of those totals.
value_cols = ['val12', 'val13', 'val14', 'val15']
total = df.groupby(['country', 'product'])[value_cols].sum()
for prev_year, year in ((12, 13), (13, 14), (14, 15)):
    total['g' + str(year)] = growthRate(total, prev_year, year)
# Keep the 2015 total and the three growth rates; the group keys go back to columns
total = total[['val15', 'g13', 'g14', 'g15']].reset_index()

In [7]:
import numpy as np

# 2015 import values grouped by market, i.e. by (importing country, product)
by_market = df.groupby(['country', 'product'])['val15']

# Compute commodity-wise market share (in %) for each partner country
df['share'] = df['val15'] / by_market.transform('sum') * 100
# Compute commodity-wise rank for each partner country (1 = largest value;
# tied partners share the smallest rank of the tie)
df['rank'] = by_market.rank(ascending=False, method='min')

# Compute no. of non-zero partners for each importing country by commodity
has_trade = (df['val15'] != 0) & df['val15'].notnull()
n_partner = df[has_trade].groupby(['country', 'product'])['partner'].count().to_frame('n_partner')

# Compute Pearson's median skewness coefficient for each country by commodity:
#     3 * (mean - median) / std
# Built from vectorised group aggregates instead of one Python call per group.
# Markets whose standard deviation is exactly 0 are set to NaN.
market_mean = by_market.mean()
market_median = by_market.median()
market_std = by_market.std()
skewness = (3 * (market_mean - market_median) / market_std).where(
    market_std != 0).to_frame('skew')

In [8]:
# Extract data for Taiwan: rows where the partner is 'Taipei, Chinese', with the
# measure columns prefixed by 'tw_'
tw_measures = ['val15', 'g13', 'g14', 'g15', 'share', 'rank']
tw = df.loc[df['partner'] == 'Taipei, Chinese', ['country', 'product'] + tw_measures].rename(
    columns={name: 'tw_' + name for name in tw_measures})

# Extract data for top 3: order the rows by rank inside each (country, product)
# market and keep the first three of every market.  Sorting once and taking
# groupby.head(3) avoids one Python call per market and keeps a plain row index,
# so 'country' and 'product' exist only as columns in the result.
# Rows with a missing rank sort last, so they are only picked when a market has
# fewer than three ranked partners.
top3_cols = ['country', 'product', 'partner', 'val15', 'g13', 'g14', 'g15', 'share']
top3 = df[top3_cols + ['rank']].sort_values(['country', 'product', 'rank']).groupby(
    ['country', 'product']).head(3)[top3_cols]

def getCountryByRank(data, rank):
    """Return, for every (country, product) group, the row at position ``rank``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'partner', 'val15', 'g13', 'g14', 'g15' and
        'share'.  'country' and 'product' may be columns, index levels, or both.
        Rows are expected to be ordered so that position 0 within a group is the
        top partner.
    rank : int
        Zero-based position inside each group; a negative value counts from the
        end of the group.

    Returns
    -------
    pandas.DataFrame
        One row per group that has a row at that position, sorted by
        (country, product) with a fresh 0..n-1 index.  Columns are 'country',
        'product', 'partner' and the five measures prefixed with
        ``str(rank + 1) + '_'`` (e.g. '1_val15' for ``rank=0``).
    """
    keys = ['country', 'product']
    measures = ['val15', 'g13', 'g14', 'g15', 'share']

    # Grouping by a name that is both a column and an index level is ambiguous,
    # so flatten the index first: discard it when the keys already exist as
    # columns, otherwise turn its levels into columns.
    if all(key in data.columns for key in keys):
        flat = data.reset_index(drop=True)
    else:
        flat = data.reset_index()
    # Rows with a missing key belong to no group
    flat = flat.dropna(subset=keys)

    # Select by position inside each group
    grouped = flat.groupby(keys)
    if rank >= 0:
        selected = grouped.cumcount() == rank
    else:
        selected = grouped.cumcount(ascending=False) == -rank - 1

    rs = flat.loc[selected, keys + ['partner'] + measures].sort_values(keys).reset_index(drop=True)
    rs.columns = keys + ['partner'] + [str(rank + 1) + '_' + x for x in measures]
    return rs

# Largest, second-largest and third-largest partner of every (country, product)
first, second, third = (getCountryByRank(top3, position) for position in range(3))

In [9]:
import re
from io import StringIO

# Load product description file
path_desc = ('C:/Users/2093/Desktop/Data Center/03. Data/01. HS_code/customs/'
             '稅則貨名檔(八碼)_最後更新時間 2017-02-07/note_8_C.txt')
with open(path_desc, encoding='utf-8') as f:
    txt = f.read()
# Handle some parsing issues so the text can be read as a space-separated table
# 1. collapse any run of spaces that follows a digit into a single space
txt = re.sub(r'(\d)[ ]+', r'\1 ', txt)
# 2. replace ASCII commas with full-width commas
txt = txt.replace(',', '，')
# 3. remove every run of spaces that follows a non-digit character
txt = re.sub(r'(\D)[ ]+', r'\1', txt)
# 4. restore the space in 'HS_NO NOTE' (presumably the header line, whose space
#    step 3 removes -- TODO confirm against the file)
txt = txt.replace('HS_NONOTE', 'HS_NO NOTE')
# The first line is consumed as the header and replaced by `names`.  The product
# code is forced to text so that dtype inference cannot turn codes into integers
# and drop their leading zeros, which would break the merge on 'product'.
desc = pd.read_csv(StringIO(txt), sep=' ', header=0, names=['product', 'desc'],
                   dtype={'product': 'object'})

In [10]:
# Merge all tables onto the per-(country, product) totals
merge_keys = ['country', 'product']
rs = total.merge(n_partner, how='left', left_on=merge_keys, right_index=True).merge(
    skewness, how='left', left_on=merge_keys, right_index=True).merge(
    tw, how='left', on=merge_keys)
# Attach the 1st/2nd/3rd partner tables; each 'partner' column is renamed to
# '<position>_name' before merging so the three name columns never collide.
for position, leader in enumerate((first, second, third), start=1):
    rs = rs.merge(leader.rename(columns={'partner': str(position) + '_name'}),
                  how='left', on=merge_keys)
rs = rs.merge(desc, how='left', on='product')
# Move the description right after the two key columns.  The order is built from
# column names, so it does not depend on how many columns the merges produced.
front = merge_keys + ['desc']
rs = rs[front + [name for name in rs.columns if name not in front]]
# Output results
rs.to_csv('comp_aggregate.csv', sep=',', index=False)

## Part 3: Statistical Exploratory Data Analysis

#### Some outdated product codes that are not found in MOF's product list:

In [11]:
# Product codes that did not get a description from the merge
codes_without_desc = rs.loc[rs['desc'].isnull(), 'product']
print(sorted(codes_without_desc.unique()))

['0503', '0509', '1402', '1403', '2527', '2838', '2848', '2851', '4108', '4109', '4110', '4111', '4204', '4815', '5304', '6503', '6908', '7012', '7414', '7416', '7417', '7803', '7805', '7906', '8004', '8005', '8006', '8469', '8485', '8520', '8524', '9009', '9203', '9204', '9501', '9502', '9999']


#### Products with the highest average degree of competition (largest mean number of non-zero partners):

In [12]:
# Mean partner count and mean skewness per product description, largest partner count first
competition_by_product = rs.groupby('desc')[['n_partner', 'skew']].mean()
competition_by_product.sort_values(by='n_partner', ascending=False).head(5)

Unnamed: 0_level_0,n_partner,skew
desc,Unnamed: 1_level_1,Unnamed: 2_level_1
其他塑膠製品及第３９０１至３９１４節之材料製成品,38.025641,0.511484
電話機，包括蜂巢式網路或其他無線網路電話；其他傳輸或接收聲音、圖像或其他資料之器具，包括有線或無線網路（如區域或廣域網路）之通訊器具，但不包括第8443，8525，8527或8528節之傳輸或接收器具,37.60515,0.468829
電路開關、保護電路或連接電路用之電氣用具（例如：開關、繼電器、熔絲裝置、突波遏止器、插頭、插座、燈頭及其他連接器、接線盒），其電壓未超過１０００伏特者；光纖、光纖束、光纖電纜或光纖傳輸纜用之連接器,35.910638,0.497108
自動資料處理機及其附屬單元；磁性或光學閱讀機，以符號方式將資料轉錄於資料媒體之機器及處理此類資料之未列名機器,35.683761,0.461736
第８７０１至８７０５節機動車輛所用之零件及附件,35.470339,0.488684


#### Products with the lowest average degree of competition (smallest mean number of non-zero partners):

In [13]:
# Mean partner count and mean skewness per product description, smallest partner count first
competition_by_product = rs.groupby('desc')[['n_partner', 'skew']].mean()
competition_by_product.sort_values(by='n_partner', ascending=True).head(5)

Unnamed: 0_level_0,n_partner,skew
desc,Unnamed: 1_level_1,Unnamed: 2_level_1
鈾或釷礦石及其精砂,1.454545,0.403716
經拉鬆處理之回收羊毛或動物粗細毛,1.5,0.324049
鈷礦石及其精砂,1.684211,0.305355
乾椰子肉,1.734694,0.421047
石棉,1.864198,0.320217


#### Countries with the highest "import diversity" (largest mean number of non-zero partners per product):

In [14]:
# Mean number of non-zero partners per importing country, largest first
partners_by_country = rs.groupby('country')[['n_partner']].mean()
partners_by_country.sort_values(by='n_partner', ascending=False).head(5)

Unnamed: 0_level_0,n_partner
country,Unnamed: 1_level_1
Netherlands,50.7621
Germany,44.414333
France,39.927347
United Kingdom,37.481451
Belgium,36.840099


#### Countries with the lowest "import diversity" (smallest mean number of non-zero partners per product):

In [15]:
# Mean number of non-zero partners per importing country, smallest first
partners_by_country = rs.groupby('country')[['n_partner']].mean()
partners_by_country.sort_values(by='n_partner', ascending=True).head(5)

Unnamed: 0_level_0,n_partner
country,Unnamed: 1_level_1
Guinea-Bissau,1.0
Mali,1.0
Caribbean Nes,1.0
Mauritania,1.0
Serbia and Montenegro,1.0
