In [1]:
import pandas as pd
import sys
import configparser
import os
import numpy as np
# Load the project settings. The file is opened in a `with` block so the handle is
# closed as soon as it has been parsed; a missing settings.ini still raises, as before.
config = configparser.ConfigParser()
with open('../../settings.ini') as settings_file:
    config.read_file(settings_file)

# Make the project-local modules (postgres.py, commons.py) importable; this has to run
# before the `import postgres` / `import commons` lines that follow.
sys.path.insert(0, config.get('PATHS','libs_path'))
# Connection string handed to create_engine() further down.
engine_path = config.get('DATABASE','engine_path')

import postgres #from local file postgres.py
import commons
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py
from importlib import reload

import json
from sqlalchemy import create_engine, text

# SQLAlchemy engine built from the connection string read from settings.ini
# ([DATABASE] engine_path).
engine = create_engine(engine_path)
# Project-local driver from postgres.py wrapping the engine; it is used further down
# (db.to_sql) to load the finished table.
db = postgres.PostgresDriver(engine)

The HS 2012 nomenclature was downloaded as an MDB file from [europa.eu](http://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=LST_CLS_DLD&StrNom=HS_2012&StrLanguageCode=EN&StrLayoutCode=HIERARCHIC). The MDB file was then converted to CSV (`hs2012.csv`, which the next cell reads) with the following command:

```
mdb-export HS_2012_MDB.mdb "HS 2012 - STRUCTURE BG CS DA DE EL EN ES FR IT PT SV" > hs2012.csv
```

In [9]:
# Read only the columns this notebook uses. Everything is loaded as text so the codes
# keep their exact CSV spelling (including leading zeros, e.g. '0101...'); only LEVEL
# is parsed as an integer.
cols = ['CNKEY', 'CN', 'PURE_HS_CODE', 'LEVEL', 'EN', 'ES']
dtype = dict.fromkeys(cols, str)
dtype.update(LEVEL=int)

# Rows without a PURE_HS_CODE are discarded straight away.
hs = (
    pd.read_csv('hs2012.csv', usecols=cols, dtype=dtype)
      .dropna(subset=['PURE_HS_CODE'])
)

In [16]:
# LEVEL 5 entries of the nomenclature, with the code and description columns renamed so
# they can sit next to the level-2 / level-3 columns once the frames are merged.
# NOTE(review): the recorded outputs show six-digit entries such as "0101 30" and
# "8705 10" at LEVEL 4; they are not selected here -- confirm that leaving them out of
# the final table is intended.
level5 = (
    hs.loc[hs.LEVEL == 5]
      .rename(columns={'PURE_HS_CODE': 'level5', 'EN': 'level5_en', 'ES': 'level5_es'})
)

In [25]:
# Spot check: show every entry whose CNKEY begins with '0101'.
hs.loc[hs['CNKEY'].str.startswith('0101')]

Unnamed: 0,CNKEY,CN,PURE_HS_CODE,LEVEL,EN,ES
3,10100000080,0101,1.01,3,"Live horses, asses, mules and hinnies","Caballos, asnos, mulos y burdéganos, vivos"
5,10121000080,0101 21,101.21,5,Pure-bred breeding animals,Reproductores de raza pura
6,10129000080,0101 29,101.29,5,Other,Los demás
7,10130000080,0101 30,101.3,4,Asses,Asnos
8,10190000080,0101 90,101.9,4,Other,Los demás


In [10]:
# Normalise the level-5 code by removing the dots, then derive the join key for the
# level-3 frame from its first four characters.
# The vectorised .str accessor replaces the per-row lambdas; `regex=False` makes sure
# '.' is treated as a literal dot. `assign` returns a new frame instead of writing
# through .loc, and re-running the cell gives the same result.
level5 = level5.assign(level5=level5['level5'].str.replace('.', '', regex=False))
level5 = level5.assign(level3=level5['level5'].str[:4])

CNKEY           object
CN              object
PURE_HS_CODE    object
LEVEL            int64
EN              object
ES              object
dtype: object

In [8]:
# Disabled ad-hoc inspection of the entries whose PURE_HS_CODE starts with '8705'.
# The table recorded as this cell's output shows 8705 rows, so it presumably dates from
# a run in which the line below was still enabled.
#hs[hs.PURE_HS_CODE.notnull() & hs.PURE_HS_CODE.str.startswith('8705')]

Unnamed: 0,CNKEY,CN,PURE_HS_CODE,LEVEL,EN,ES
6626,870510000080,8705 10,8705.1,4,Crane lorries,Camiones grúa
6627,870520000080,8705 20,8705.2,4,Mobile drilling derricks,Camiones automóviles para sondeo o perforación
6628,870530000080,8705 30,8705.3,4,Fire fighting vehicles,Camiones de bomberos
6629,870540000080,8705 40,8705.4,4,Concrete-mixer lorries,Camiones hormigonera
6630,870590000080,8705 90,8705.9,4,Other,Los demás


In [67]:
# Build the level-3 frame and attach the level-5 rows to it:
#   1. keep the LEVEL 3 entries and rename their code / description columns,
#   2. strip the dots from the code and keep its first four characters (the join key),
#   3. inner-join the level-5 rows on that key,
#   4. derive the level-2 key from the first two characters of the level-3 code.
# NOTE(review): because the join is an inner join, a level-3 entry with no LEVEL 5
# rows underneath it drops out here -- confirm this is wanted.
level3 = (
    hs.loc[hs.LEVEL == 3]
      .rename(columns={'PURE_HS_CODE': 'level3', 'EN': 'level3_en', 'ES': 'level3_es'})
      .assign(level3=lambda df: df['level3'].astype(str).str.replace('.', '', regex=False).str[:4])
      .merge(level5, on='level3')
      .assign(level2=lambda df: df['level3'].astype(str).str[:2])
)

In [73]:
def _label_after_separator(labels):
    """Return the title-cased part of each label that follows the first ' - '.

    The split is limited to the first separator, so a title that itself contains
    ' - ' is kept whole, and a label without any separator is title-cased as is
    instead of raising IndexError.
    """
    return labels.str.split(' - ', n=1).str[-1].str.title()


# LEVEL 2 entries joined with the level-3 / level-5 rows built above, on the level-2 code.
level2 = (
    hs.loc[hs.LEVEL == 2]
      .rename(columns={'PURE_HS_CODE': 'level2', 'EN': 'level2_en', 'ES': 'level2_es'})
      .merge(level3, on='level2')
)
# Keep only the descriptive part of the level-2 labels (the text after ' - ').
level2 = level2.assign(
    level2_en=_label_after_separator(level2['level2_en']),
    level2_es=_label_after_separator(level2['level2_es']),
)


In [81]:
# Final layout of the dimension table: code plus English and Spanish description for
# each of the three levels, ordered from the coarsest level to the finest.
hs2012_columns = [
    'level2', 'level2_en', 'level2_es',
    'level3', 'level3_en', 'level3_es',
    'level5', 'level5_en', 'level5_es',
]
hs2012 = level2.loc[:, hs2012_columns]

In [82]:
# Write the flattened hierarchy to the database through the project-local driver
# (arguments as used here: frame, schema name, table name).
# NOTE(review): the SQL recorded in this notebook's output suggests the helper drops and
# recreates economy.dim_hs2012 and then bulk-loads it with COPY -- confirm in postgres.py.
db.to_sql(hs2012, 'economy', 'dim_hs2012')

# Index the level-3 code column of the freshly loaded dimension table.
# - Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection obtained
#   from engine.begin() works on old and new releases and commits when the block exits.
# - IF NOT EXISTS lets this cell be re-run on its own without failing because the
#   index is already there (the clause skips creation whenever the name is taken).
with engine.begin() as connection:
    connection.execute(text("""
    CREATE INDEX IF NOT EXISTS level3_idx
    ON economy.dim_hs2012 (level3)
    """))

DROP TABLE IF EXISTS economy.dim_hs2012;
CREATE TABLE "economy"."dim_hs2012" (
"level2" TEXT,
  "level2_en" TEXT,
  "level2_es" TEXT,
  "level3" TEXT,
  "level3_en" TEXT,
  "level3_es" TEXT,
  "level5" TEXT,
  "level5_en" TEXT,
  "level5_es" TEXT
)
COPY "economy"."dim_hs2012" ("level2","level2_en","level2_es","level3","level3_en","level3_es","level5","level5_en","level5_es") FROM STDIN WITH CSV HEADER DELIMITER ',';
