# Competitiveness Analysis (by Country and Product)

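MapReduce-style version that saves memory: each import file is aggregated on its own, and only the per-file summaries are combined at the end, so the raw data of all countries never has to be held in memory at once.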

In [1]:
import os
import pandas as pd

# HS code level; selects the input folder 'ITC HS<level>'
aggr_level = '6'
path = '//172.20.23.190/ds/Raw Data/2016大數爬蟲案/data/ITC HS{}/all/'.format(aggr_level)
# os.listdir returns entries in arbitrary order, so sort them to make the processing
# order (and therefore the row order of the output) reproducible.
# dtype=object keeps the .str accessor usable even when the folder is empty.
files = pd.Series(sorted(os.listdir(path)), dtype=object)
# Filter for import data
files = files[files.str.contains('_I')]
# Exclude Taiwan from importing countries.
# A boolean mask removes every matching file and is a no-op when none matches,
# whereas dropping `index.values[0]` raised IndexError in that case.
files = files[~files.str.contains('Taipei')]

In [2]:
# Load product description file
desc = pd.read_csv('//172.26.1.102/dstore/Projects/mof-crawler/full_hscode11.tsv', sep='\t',
                   dtype='str', usecols=['hs2cn', 'hs4cn', 'hs6', 'hs6cn'])
# read_csv ignores the element order of `usecols` and returns the columns in file order,
# so rename by source name instead of assigning a positional list of new names.
desc = desc.rename(columns={'hs2cn': 'desc2', 'hs4cn': 'desc4', 'hs6': 'product', 'hs6cn': 'desc6'})
# Pin the column order so that it does not depend on the layout of the file
desc = desc[['desc2', 'desc4', 'product', 'desc6']]
# Because each row corresponds to an HS11 code in the original table, need to remove duplicates
desc = desc.drop_duplicates(subset='product')

In [None]:
# Country name mapping table
ctry_map = pd.read_csv('//172.20.23.190/ds/Raw Data/2016大數爬蟲案/data/ITC HS6/itc_df_complete.csv',
                       usecols=['itc_name', 'countryName'])
# Convert to en -> zh dictionary.
# The columns are addressed by their source names: read_csv returns `usecols` columns in
# file order, so a positional rename could silently swap keys and values.
ctry_map = ctry_map.set_index('itc_name')['countryName'].to_dict()

In [3]:
import numpy as np
from functools import reduce

def aggr_data(file):
    """Summarise one import file into one row per (country, product) pair.

    The function reads ``path + file`` and uses the notebook-level objects ``path``
    (input folder), ``desc`` (product descriptions keyed by ``product``) and
    ``ctry_map`` (dictionary used to replace country names).

    Parameters
    ----------
    file : str
        Name of a CSV file inside ``path``. It must contain the columns
        'Product Code', 'Partner' and 'Value in 2012' .. 'Value in 2015'; the
        columns up to and including 'Partner' are taken as country, product, partner.

    Returns
    -------
    pandas.DataFrame
        One row per (country, product) with, in this order:
        ``country, product, desc2, desc4, desc6`` (identifiers and descriptions),
        ``val15, g13, g14, g15`` (summed 2015 value and year-on-year growth in %),
        ``n_partner`` (partners with a non-zero, non-missing 2015 value),
        ``skew`` (Pearson's median skewness of the 2015 values),
        ``tw_val15, tw_g13, tw_g14, tw_g15, tw_share, tw_rank`` (figures of the partner
        'Taipei, Chinese'; ``tw_rank`` is NaN when its 2015 value is zero), and
        ``<n>_name, <n>_val15, <n>_g13, <n>_g14, <n>_g15, <n>_share`` for the partners
        ranked n = 1, 2, 3 by 2015 value.

    Notes
    -----
    Growth rates divide by the start-year value, so a zero start value yields
    ``inf`` or ``NaN``; this is passed through unchanged.
    """
    keys = ['country', 'product']
    detail_cols = ['val15', 'g13', 'g14', 'g15', 'share']
    value_cols = ['Value in 20{}'.format(year) for year in (12, 13, 14, 15)]

    dtypes = {'Country': 'object', 'Product Code': 'object', 'Partner': 'object'}
    dtypes.update({'Value in {}'.format(year): 'float' for year in range(2001, 2016)})
    df = pd.read_csv(path + file, index_col=0, dtype=dtypes).reset_index(drop=True)

    # Remove the leading single quote (') in product code column. The .str accessor
    # keeps missing codes as NaN (slicing inside apply raised on them) and never
    # strips a character that is not a quote.
    df['Product Code'] = df['Product Code'].str.lstrip("'")
    # Remove rows for commodities sum, and rows where partner is 'All'
    # (it seems that HS6 tables don't have this code)
    df = df[(df['Product Code'] != 'TOTAL') & (df['Partner'] != 'All')]
    # Select only columns for 2012 to 2015. The value columns are named explicitly so
    # that an additional later year in the file cannot break the renaming below.
    df = pd.concat((df.loc[:, :'Partner'], df[value_cols]), axis=1)
    df.columns = ['country', 'product', 'partner', 'val12', 'val13', 'val14', 'val15']

    # Compute growth rates (percentage change from start_year to end_year)
    def growthRate(data, start_year, end_year):
        start = data['val' + str(start_year)]
        return (data['val' + str(end_year)] - start) / start * 100

    for frame_year in (13, 14, 15):
        df['g' + str(frame_year)] = growthRate(df, frame_year - 1, frame_year)

    # Compute total imports for all (country, product) pairs
    total = df.groupby(keys)[['val12', 'val13', 'val14', 'val15']].sum()
    for frame_year in (13, 14, 15):
        total['g' + str(frame_year)] = growthRate(total, frame_year - 1, frame_year)
    total = total[['val15', 'g13', 'g14', 'g15']].reset_index()

    # Compute commodity-wise market share for each partner country
    df['share'] = df['val15'] / df.groupby(keys)['val15'].transform('sum') * 100
    # Compute commodity-wise rank for each partner country
    df['rank'] = df.groupby(keys)['val15'].rank(ascending=False, method='min')

    # Compute no. of non-zero partners for each importing country by commodity
    has_value = df['val15'].notnull() & (df['val15'] != 0)
    n_partner = df[has_value].groupby(keys)['partner'].count().to_frame('n_partner').reset_index()

    # Compute Pearson's median skewness coefficient for each country by commodity:
    # 3 * (mean - median) / std, left as NaN where the standard deviation is zero.
    # Built from vectorised group statistics instead of a Python lambda per group.
    stats = df.groupby(keys)['val15'].agg(['mean', 'median', 'std'])
    nonzero_std = stats['std'].where(stats['std'] != 0)
    skewness = (3 * (stats['mean'] - stats['median']) / nonzero_std).to_frame('skew').reset_index()

    # Extract data for Taiwan (explicit copy, because the frame is modified below)
    tw = df.loc[df['partner'] == 'Taipei, Chinese', keys + detail_cols + ['rank']].copy()
    tw.columns = keys + ['tw_' + col for col in detail_cols] + ['tw_rank']
    # When import value from Taiwan is zero, manually overwrite corresponding rank of Taiwan with NaN
    tw.loc[tw['tw_val15'] == 0, 'tw_rank'] = np.nan

    # Extract data for top 3.
    # A stable sort by rank (NaN ranks last, ties in original row order) followed by a
    # running count inside each group gives every row its 0-based position within its
    # (country, product) group. This replaces groupby.apply(nsmallest) + nth, which
    # left the keys both in the index and in the columns and is far slower.
    ordered = df.sort_values('rank', kind='mergesort')
    position = ordered.groupby(keys).cumcount()

    def getCountryByRank(rank):
        # Rows of the partner at 0-based position `rank`, with '<rank+1>_' prefixed columns
        prefix = str(rank + 1) + '_'
        renamed = {'partner': prefix + 'name'}
        renamed.update({col: prefix + col for col in detail_cols})
        picked = ordered.loc[position == rank, keys + ['partner'] + detail_cols]
        return picked.rename(columns=renamed)

    top_partners = [getCountryByRank(rank) for rank in range(3)]

    # Merge all tables
    rs = total
    for extra in [n_partner, skewness, tw] + top_partners:
        rs = rs.merge(extra, how='left', on=keys)
    rs = rs.merge(desc, how='left', on='product')

    # Order the output columns by name rather than by position
    out_cols = keys + ['desc2', 'desc4', 'desc6', 'val15', 'g13', 'g14', 'g15', 'n_partner', 'skew']
    out_cols += ['tw_' + col for col in detail_cols] + ['tw_rank']
    for prefix in ('1_', '2_', '3_'):
        out_cols += [prefix + 'name'] + [prefix + col for col in detail_cols]
    rs = rs[out_cols].copy()

    # Replace en country names with zh names. The result is assigned back, because an
    # in-place replace on a selected column may act on a temporary copy.
    for col in ['country', '1_name', '2_name', '3_name']:
        rs[col] = rs[col].replace(ctry_map)

    return rs

In [4]:
%%time

# Map step: aggregate each import file on its own (lazily, one file at a time)
df_map = map(aggr_data, files)
# Reduce step: combine all per-file summaries in a single concat. Concatenating pairwise
# inside reduce() copied the growing accumulated frame once per file.
df = pd.concat(df_map, axis=0, ignore_index=True)

# Output results
df.to_csv('comp_aggregate_{}.csv'.format(aggr_level), sep=',', index=False)

Wall time: 1h 12min 9s
