In [None]:
# --- Standard library ---
import configparser
import json
import os
import sys
from importlib import reload

# --- Third-party ---
import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine

# --- Project settings ---
# Parse settings.ini through a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open for the kernel's life.
config = configparser.ConfigParser()
with open('../../settings.ini') as settings_file:
    config.read_file(settings_file)

# The directory holding the local helper modules comes from settings.ini and
# has to be on sys.path before those modules are imported below.
sys.path.insert(0, config.get('PATHS','libs_path'))
engine_path = config.get('DATABASE','engine_path')

# --- Local modules (found through libs_path) ---
import postgres #from local file postgres.py
import commons
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py

# --- Database handles used by the rest of the notebook ---
# `engine_path` is handed to create_engine as-is; `db` wraps the engine with
# the project's PostgresDriver.
engine = create_engine(engine_path)
db = postgres.PostgresDriver(engine)

# Read Pacha's HS12

In [None]:
# Load Pacha's HS 2012 table. The code column is read as text explicitly:
# later cells measure its string length and slice prefixes from it, so it must
# never be type-inferred as a number (which would also drop leading zeros).
hs = pd.read_csv('pacha_hs12.csv', dtype={'hs12': str})


In [None]:
# Rows whose code is six characters long form the most detailed level
# ("level5"). The first four characters of each code are stored as "level3"
# so these rows can later be joined to their four-character parent.
level5 = (
    hs[hs['hs12'].str.len() == 6]
    .rename(columns={'hs12': 'level5', 'description': 'level5_en'})
    .assign(level3=lambda frame: frame['level5'].str[:4])
)

In [None]:
# Rows with four-character codes are the "level3" entries. Each one is joined
# (inner join on the four-character code) to its six-character children, and
# the first two characters of the code become the "level2" key.
level3 = (
    hs[hs['hs12'].str.len() == 4]
    .rename(columns={'hs12': 'level3', 'description': 'level3_en'})
    .merge(level5, on='level3')
    .assign(level2=lambda frame: frame['level3'].str[:2])
)

In [None]:
# Rows with two-character codes are the "level2" entries. Joining them
# (inner join) to the level3 frame yields one wide row per six-character code
# carrying the codes and English descriptions of all three levels.
level2 = (
    hs[hs['hs12'].str.len() == 2]
    .rename(columns={'hs12': 'level2', 'description': 'level2_en'})
    .merge(level3, on='level2')
)

# HS12 from `europa.eu`

In [None]:
# Columns taken from the europa.eu export. Everything is read as text so that
# codes are not parsed as numbers; only LEVEL is read as an integer.
cols = ['CNKEY', 'CN', 'PURE_HS_CODE', 'LEVEL', 'EN', 'ES']
dtype = {c: str for c in cols}
dtype['LEVEL'] = int
hseu = pd.read_csv('hs2012.csv', usecols=cols, dtype=dtype)

# Keep only rows that carry an HS code. The explicit .copy() makes the
# filtered frame independent of the raw one, so adding a column below is a
# plain assignment rather than a write into a slice (SettingWithCopyWarning).
hseu = hseu[hseu.PURE_HS_CODE.notnull()].copy()

# Drop the dots from the HS code so it can be matched against the undotted
# codes of the level2/level3/level5 frame. regex=False: '.' is a literal dot.
hseu['FIXED_HS'] = hseu['PURE_HS_CODE'].str.replace('.', '', regex=False)

# Lookup table: undotted code -> Spanish description.
es_hs = hseu[['FIXED_HS', 'ES']]

In [None]:
# Spanish names for the two-character codes. This is an inner join, so rows
# whose level2 code has no match in the europa.eu lookup are dropped.
# The lookup is renamed before merging so that no FIXED_HS helper columns are
# left behind in the result.
level2 = level2.merge(
    es_hs.rename(columns={'FIXED_HS': 'level2', 'ES': 'level2_es'}),
    on='level2',
)

# The label is expected to look like "<prefix> - <name>": keep the name part
# in title case. Splitting only on the FIRST " - " keeps names that themselves
# contain " - " intact instead of truncating them.
level2['level2_es'] = level2['level2_es'].apply(
    lambda label: label.split(' - ', 1)[1].title()
)

# Spanish names for the four- and six-character codes. Left joins: rows
# without a Spanish match are kept, with a missing value in the *_es column.
for code_col in ('level3', 'level5'):
    names_es = es_hs.rename(columns={'FIXED_HS': code_col, 'ES': code_col + '_es'})
    level2 = level2.merge(names_es, on=code_col, how='left')

### Add section names

In [None]:
import roman

# LEVEL == 1 rows are treated as sections and LEVEL == 2 rows as chapters.
# `chapters` is not read by the visible cells; it is kept for inspection.
sections = list(hseu[hseu.LEVEL==1].iterrows())
chapters = hseu[hseu.LEVEL == 2]

# The numeric chapter code does not change inside the loop, so convert it once
# instead of once per assignment.
chapter_numbers = level2.level2.astype(int)

# Walk consecutive section pairs: a row belongs to a section when its chapter
# number lies in [first two characters of this section's CNKEY,
#                 first two characters of the next section's CNKEY).
# The last section has no successor, so its rows are left unset by this loop.
for (_idx, section), (_next_idx, next_section) in zip(sections, sections[1:]):
    lower_bound = int(section.CNKEY[0:2])
    upper_bound = int(next_section.CNKEY[0:2])
    in_section = (chapter_numbers >= lower_bound) & (chapter_numbers < upper_bound)

    level2.loc[in_section, 'level0_es'] = section.ES
    level2.loc[in_section, 'level0_en'] = section.EN
    # CN holds the section number as a Roman numeral; store it as a
    # zero-padded two-digit string.
    level2.loc[in_section, 'level0'] = format(roman.fromRoman(section.CN), '02d')

In [None]:
# Keep only the code/name columns of the four levels. The explicit .copy()
# detaches the result from `level2`, so the assignments below are not writes
# into a slice of another frame (SettingWithCopyWarning).
flattened_hs2012 = level2[['level0', 'level0_en', 'level0_es', 'level2', 'level2_en', 'level2_es', 'level3', 'level3_en', 'level3_es', 'level5', 'level5_en', 'level5_es']].copy()

# Rows that received no section id are assigned to Section XXI. The mask is
# computed once, before any column is filled. The id is written as the string
# '21' so the column stays consistent with the zero-padded string ids
# produced for the other sections.
without_section = flattened_hs2012.level0.isnull()
flattened_hs2012.loc[without_section, 'level0_en'] = "SECTION XXI - WORKS OF ART, COLLECTORS' PIECES AND ANTIQUES"
flattened_hs2012.loc[without_section, 'level0_es'] = "SECCIÓN XXI - OBJETOS DE ARTE O COLECCIÓN Y ANTIGÜEDADES"
flattened_hs2012.loc[without_section, 'level0'] = '21'


In [None]:
# Section labels look like "SECTION XXI - WORKS OF ART, ...": drop the
# "<prefix> - " part and title-case the remaining name. Splitting only on the
# FIRST " - " keeps a name that itself contains " - " whole instead of cutting
# it at the second separator.
for name_col in ('level0_es', 'level0_en'):
    flattened_hs2012.loc[:, name_col] = flattened_hs2012[name_col].apply(
        lambda label: label.split(' - ', 1)[1].title()
    )


### Prepend section id to all levels

In [None]:
# Put the section id (as text) in front of the code of every level, so that
# e.g. level3 becomes "<section id><four-character code>".
section_ids = [str(section) for section in flattened_hs2012['level0']]
for code_col in ('level2', 'level3', 'level5'):
    flattened_hs2012.loc[:, code_col] = [
        section + code
        for section, code in zip(section_ids, flattened_hs2012[code_col])
    ]

# Update names from OEC

In [None]:
def _fetch_oec_attrs(url, timeout=60):
    """Download an OEC attribute listing and return its ``data`` records as a DataFrame.

    Args:
        url: Endpoint returning a JSON object with a ``data`` list.
        timeout: Seconds to wait for the server before giving up, so a dead
            endpoint cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status. This
            fails here with a clear message instead of later, while decoding
            an error page as JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return pd.DataFrame(response.json()['data'])


# HS92 names as published by the OEC, in Spanish and in English.
oec_xlations_es = _fetch_oec_attrs("https://atlas.media.mit.edu/attr/hs92/es/")
oec_xlations_en = _fetch_oec_attrs("https://atlas.media.mit.edu/attr/hs92/en/")

# Local HS92 table; the next cell joins it to the OEC names on its Code column.
original_hs92 = pd.read_csv('hs92.csv')

In [None]:
# Build a lookup: original HS92 description -> OEC short name (es / en).
# The filter previously referenced `oec_xlations`, a name that is never
# defined in this notebook (NameError on a fresh kernel); it has to be the
# Spanish frame that is being filtered.
short_names = (
    # OEC entries whose id is six characters long ...
    oec_xlations_es[oec_xlations_es.id.str.len() == 6]
    # ... matched to the Level-4 rows of the local HS92 table by code
    .merge(original_hs92[original_hs92.Level == 4], left_on='display_id', right_on='Code')
    [['display_id', 'id', 'name', 'Description']]
    .rename(columns={'name': 'short_name_es', 'Description': 'original_name'})
    # add the English short name for the same display_id; both sides carry
    # an `id` column, so the left one comes out of the merge as `id_x`
    .merge(oec_xlations_en, on='display_id')
    [['display_id', 'short_name_es', 'original_name', 'name', 'id_x']]
    .rename(columns={'name': 'short_name_en', 'id_x': 'id'})
)

In [None]:
# Look up an OEC short name for each row by matching the English level3
# description against the original HS92 description. Left join: rows without
# a match are kept and get missing short names.
columns_after_merge = [
    'level0', 'level0_en', 'level0_es',
    'level2', 'level2_en', 'level2_es',
    'level3', 'level3_en', 'level3_es',
    'short_name_en', 'short_name_es',
    'level5', 'level5_en', 'level5_es',
]
flattened_hs2012 = (
    flattened_hs2012
    .merge(short_names, left_on='level3_en', right_on='original_name', how='left')
    [columns_after_merge]
    .rename(columns={'short_name_en': 'level3_short_en', 'short_name_es': 'level3_short_es'})
)

In [None]:
# Prefer the OEC short name wherever one was found; otherwise keep the
# existing level3 name. Done per language, English first.
for lang in ('en', 'es'):
    name_col = 'level3_' + lang
    short_name = flattened_hs2012['level3_short_' + lang]
    flattened_hs2012.loc[:, name_col] = short_name.where(
        short_name.notnull(), flattened_hs2012[name_col]
    )

# Apply manually shortened names

Source of the manually shortened names: https://docs.google.com/spreadsheets/d/1fDJRP2t8BK5y59XRu1gvz7I_7L6imT6gy2e-YJQCDak/edit#gid=1687148987


In [None]:
# NOTE(review): this reloads the dimension from the database, replacing the
# frame assembled by the earlier cells, and it needs economy.dim_hs2012 to
# exist already (the table is written by the save cell at the end of this
# notebook). Confirm whether the reload is intended or whether the in-memory
# frame should be used instead.
flattened_hs2012 = pd.read_sql("SELECT * FROM economy.dim_hs2012", engine)

# Hand-edited Spanish short names keyed by level3 code. Codes are read as
# text and left-padded with zeros to six characters before joining.
shortened = pd.read_csv(
    'hs2012-manually-shortened-names.csv',
    usecols=['level3', 'level3_short'],
    dtype={'level3': str},
)
shortened.loc[:, 'level3'] = shortened['level3'].str.rjust(6, fillchar='0')

# Left join keeps every dimension row; where a manual short name exists it
# overrides the Spanish level3 name, otherwise the current name stays.
tmp = flattened_hs2012.merge(shortened, on='level3', how='left')
manual_name = tmp['level3_short']
tmp.loc[:, 'level3_es'] = manual_name.where(manual_name.notnull(), tmp['level3_es'])
flattened_hs2012 = tmp

# Save to DB

In [None]:
# Imported here because this cell is the only user of `text`.
from sqlalchemy import text

# Write the twelve dimension columns through the project's driver
# (arguments: frame, 'economy', 'dim_hs2012').
dim_columns = [
    'level0', 'level0_en', 'level0_es',
    'level2', 'level2_en', 'level2_es',
    'level3', 'level3_en', 'level3_es',
    'level5', 'level5_en', 'level5_es',
]
db.to_sql(flattened_hs2012[dim_columns],
          'economy',
          'dim_hs2012')

# Index the two columns named below.
# - IF NOT EXISTS makes the cell safe to re-run when the indexes are already
#   there (a plain CREATE INDEX raises on the second run).
# - engine.begin() + text() replaces Engine.execute(), which was removed in
#   SQLAlchemy 2.0; this form also works on 1.x. The block commits on exit.
with engine.begin() as connection:
    connection.execute(text("""
CREATE INDEX IF NOT EXISTS level3_idx
ON economy.dim_hs2012 (level3)
"""))

    connection.execute(text("""
CREATE INDEX IF NOT EXISTS level5_idx
ON economy.dim_hs2012 (level5)
"""))

