# Data Preprocessing

#### Catalogue Data

In [1]:
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

path = 'C:/Users/2093/Desktop/Data Center/03. Data/05. TAITRA/TT/'

codes = pd.read_csv(path + 'ebs_view_CodeSets_20170522.csv',
                    usecols=['new_CodeSetsId', 'new_CodeType', 'new_CodeValue', 'new_CodeValueLength',
                             'new_NameCht', 'new_NameEng'])
codes.columns = ['code_id', 'code_type', 'code_val', 'code_len', 'ch_name', 'en_name']
code_prod_map = pd.read_csv(path + 'ebs_view_new_CodeSets_new_Products_20170522.csv',
                            usecols=['new_codesetsid', 'new_productsid'])
code_prod_map.columns = ['code_id', 'prod_id']
ctlg = pd.read_csv(path + 'ebs_view_product_20170522.csv',
                   usecols=['new_BAN', 'new_ProductsId', 'new_name', 'new_DescTextEng',
                             'new_KeywordEng', 'new_modified', 'new_TTImage'],
                   dtype={'new_BAN': str},
                   parse_dates=['new_modified'])
ctlg.columns = ['ban', 'prod_id', 'prod_name', 'prod_desc', 'keyword', 'mod_date', 'image']

In [2]:
print('{} items have two TAITRA codes attached.'
      .format((code_prod_map['prod_id'].value_counts() >= 2).sum()))

code_prod_map.drop_duplicates(['prod_id'], inplace=True)
assert (code_prod_map['prod_id'].value_counts() >= 2).sum() == 0

111 items have two TAITRA codes attached.


In [3]:
# Join three DataFrames
ctlg = (ctlg.merge(code_prod_map, how='left', on='prod_id').merge(codes, how='left', on='code_id')
        .drop(['prod_id', 'code_id', 'code_type', 'code_len', 'ch_name', 'en_name'], axis=1))

In [4]:
no_code = ctlg['code_val'].isnull()

print(('{:,} items do not have TAITRA code attached.\nThese belong to {:,} unique suppliers, \
of which {} do not have any other items with TAITRA code ({}% of all suppliers).'
       .format(no_code.sum(),
               ctlg[no_code]['ban'].nunique(),
               len(set(ctlg[no_code]['ban'].unique()) - set(ctlg[~no_code]['ban'].unique())),
               round(len(set(ctlg[no_code]['ban'].unique()) - set(ctlg[~no_code]['ban'].unique()))
                     / ctlg['ban'].nunique() * 100, 2))))

ctlg = ctlg[ctlg['code_val'].notnull()]
assert ctlg['code_val'].isnull().sum() == 0
ctlg['code_val'] = ctlg['code_val'].astype(int).astype(str).str.zfill(6).astype('category')

9,924 items do not have TAITRA code attached.
These belong to 2,450 unique suppliers, of which 665 do not have any other items with TAITRA code (1.87% of all suppliers).


In [5]:
print('Fraction of non-missing values before removal:')
print(ctlg.notnull().sum() / len(ctlg))

n_supp_lost = ctlg['ban'].nunique() - ctlg[ctlg['prod_name'].notnull()]['ban'].nunique()

print('\nIf we remove all items without product name, {:,} suppliers will be lost ({}% of total number \
of suppliers).\n'.format(n_supp_lost, round(n_supp_lost / ctlg['ban'].nunique() * 100, 2)))

ctlg = ctlg[ctlg['prod_name'].notnull()]
assert ctlg['prod_name'].isnull().sum() == 0

print('Fraction of non-missing values after removal:')
print(ctlg.notnull().sum() / len(ctlg))

Fraction of non-missing values before removal:
ban          0.999625
prod_name    0.599947
prod_desc    0.586136
keyword      0.368742
mod_date     0.997704
image        0.849785
code_val     1.000000
dtype: float64

If we remove all items without product name, 1,249 suppliers will be lost (3.58% of total number of suppliers).

Fraction of non-missing values after removal:
ban          0.999380
prod_name    1.000000
prod_desc    0.976939
keyword      0.401397
mod_date     0.996252
image        0.780910
code_val     1.000000
dtype: float64


In [6]:
sno = SnowballStemmer('english')
wnl = WordNetLemmatizer()

def process_string(s):
    processed = (s.str.strip()
                  .str.lower()
                  .str.replace(r'[\t\n\r\f\v]', r'')
                  .str.replace(r'\d+', r'')
                  # capture commas followed by any number of whitespaces
                  .str.replace(r', *', r' ')
                  .apply(lambda s: s.translate(str.maketrans({x: None for x in string.punctuation}))
                         if type(s) == str else '')
                  # apply SnowballStemmer then WordNetLemmatizer to singularize missed words
                  .apply(lambda s: ' '.join(set([wnl.lemmatize(sno.stem(x)) for x in re.split(r' +', s)
                                                 if x not in stopwords.words('english')]))
                         if type(s) == str else ''))
    return processed

In [7]:
%%time

ctlg['prod_name'] = process_string(ctlg['prod_name'])
ctlg['prod_desc'] = process_string(ctlg['prod_desc'])
ctlg['keyword'] = process_string(ctlg['keyword'])

ctlg.to_csv(path + 'processed_ctlg.csv', index=False, encoding='utf-8')

Wall time: 1h 22min 16s


#### Export Data

In [8]:
%%time

ex = pd.read_csv('C:/Users/2093/Desktop/Data Center/03. Data/06. companies/財政部廠商進出口資料/KMG_HS6COUNTRY.csv',
                 engine='python')

# code_type == 1 for HS code
codes = codes.loc[(codes['code_type'] == 1) & (codes['code_len'] == 6), ['code_val', 'en_name']]
codes['en_name'] = process_string(codes['en_name'])

ex = (ex.loc[ex['MONTH'].isnull(), ['BAN_REAL', 'HSCODE', 'COUNTRY', 'EXPORT']]
      .merge(codes, left_on='HSCODE', right_on='code_val')
      [['BAN_REAL', 'HSCODE', 'COUNTRY', 'EXPORT', 'en_name']])
ex.to_csv(path + 'export_compressed.csv', index=False, encoding='utf-8')

Wall time: 2min 45s
