In [76]:
import pandas as pd
import sys
import configparser
import os
import numpy as np
config = configparser.ConfigParser()
config.read_file(open('../../settings.ini'))

sys.path.insert(0, config.get('PATHS','libs_path'))
engine_path = config.get('DATABASE','engine_path')

import postgres #from local file postgres.py
import commons
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py
from importlib import reload

import json
from sqlalchemy import create_engine
import requests

engine = create_engine(engine_path)
db = postgres.PostgresDriver(engine)

# Read Pacha's HS12

In [77]:
hs = pd.read_csv('pacha_hs12.csv')


In [78]:
level5 = hs[hs.hs12.str.len()==6]
level5 = level5.rename(columns={'hs12': 'level5', 'description': 'level5_en'})
level5.loc[:, 'level3'] = level5.level5.apply(lambda h: h[:4])

In [79]:
level3 = hs[hs.hs12.str.len()==4]
level3 = level3.rename(columns={'hs12': 'level3', 'description': 'level3_en'})
level3 = level3.merge(level5, on='level3')
level3.loc[:, 'level2'] = level3.level3.apply(lambda h: h[:2])

In [80]:
level2 = hs[hs.hs12.str.len() == 2]
level2 = level2.rename(columns={'hs12': 'level2', 'description': 'level2_en'})
level2 = level2.merge(level3, on='level2')

# HS12 from `europa.eu`

In [81]:
cols = ['CNKEY', 'CN', 'PURE_HS_CODE', 'LEVEL', 'EN', 'ES']
dtype = {c: str for c in cols}
dtype['LEVEL'] = int
hseu = pd.read_csv('hs2012.csv', usecols=cols, dtype=dtype)
hseu = hseu[hseu.PURE_HS_CODE.notnull()]
hseu.loc[:, 'FIXED_HS'] = hseu.PURE_HS_CODE.apply(lambda l: l.replace('.', ''))
es_hs = hseu[['FIXED_HS', 'ES']]

In [82]:
level2 = level2.merge(es_hs, left_on='level2', right_on='FIXED_HS')

level2 = level2.rename(columns={'ES': 'level2_es'})
level2.loc[:, 'level2_es'] = level2.level2_es.apply(lambda h: h.split(' - ')[1].title())

level2 = level2.merge(es_hs, left_on='level3', right_on='FIXED_HS', how='left')
level2 = level2.rename(columns={'ES': 'level3_es'})

level2 = level2.merge(es_hs, left_on='level5', right_on='FIXED_HS', how='left')
level2 = level2.rename(columns={'ES': 'level5_es'})

### Add section names

In [83]:
import roman

sections = list(hseu[hseu.LEVEL==1].iterrows())
chapters = hseu[hseu.LEVEL == 2]

for (i,s1), (j,s2) in zip(sections, sections[1:]):
    level2.loc[(level2.level2.astype(int) >= int(s1.CNKEY[0:2])) & (level2.level2.astype(int) < int(s2.CNKEY[0:2])), 'level0_es'] = s1.ES
    level2.loc[(level2.level2.astype(int) >= int(s1.CNKEY[0:2])) & (level2.level2.astype(int) < int(s2.CNKEY[0:2])), 'level0_en'] = s1.EN    
    level2.loc[(level2.level2.astype(int) >= int(s1.CNKEY[0:2])) & (level2.level2.astype(int) < int(s2.CNKEY[0:2])), 'level0'] = format(roman.fromRoman(s1.CN), '02d')

In [84]:
flattened_hs2012 = level2[['level0', 'level0_en', 'level0_es', 'level2', 'level2_en', 'level2_es', 'level3', 'level3_en', 'level3_es', 'level5', 'level5_en', 'level5_es']]

flattened_hs2012.loc[flattened_hs2012.level0.isnull(), 'level0_en'] = "SECTION XXI - WORKS OF ART, COLLECTORS' PIECES AND ANTIQUES"
flattened_hs2012.loc[flattened_hs2012.level0.isnull(), 'level0_es'] = "SECCIÓN XXI - OBJETOS DE ARTE O COLECCIÓN Y ANTIGÜEDADES"
flattened_hs2012.loc[flattened_hs2012.level0.isnull(), 'level0'] = 21


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [85]:
flattened_hs2012.loc[:, 'level0_es'] = flattened_hs2012.level0_es.apply(lambda h: h.split(' - ')[1].title())
flattened_hs2012.loc[:, 'level0_en'] = flattened_hs2012.level0_en.apply(lambda h: h.split(' - ')[1].title())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


### Append section id to all levels

In [86]:
flattened_hs2012.loc[:, 'level2'] = flattened_hs2012.apply(lambda r: str(r['level0']) + r['level2'], axis=1)
flattened_hs2012.loc[:, 'level3'] = flattened_hs2012.apply(lambda r: str(r['level0']) + r['level3'], axis=1)
flattened_hs2012.loc[:, 'level5'] = flattened_hs2012.apply(lambda r: str(r['level0']) + r['level5'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


# Update names from OEC

In [87]:
oec_xlations_es = pd.DataFrame(requests.get("https://atlas.media.mit.edu/attr/hs92/es/").json()['data'])
oec_xlations_en = pd.DataFrame(requests.get("https://atlas.media.mit.edu/attr/hs92/en/").json()['data'])
original_hs92 = pd.read_csv('hs92.csv')

In [88]:
short_names = oec_xlations_es[(oec_xlations.id.str.len() == 6)] \
   .merge(original_hs92[original_hs92.Level==4], left_on='display_id', right_on='Code') \
   [['display_id', 'id', 'name', 'Description']] \
   .rename(columns={'name': 'short_name_es', 'Description': 'original_name'}) \
   .merge(oec_xlations_en, on='display_id') \
   [['display_id', 'short_name_es', 'original_name', 'name', 'id_x']] \
   .rename(columns={'name': 'short_name_en', 'id_x': 'id'})

In [90]:
flattened_hs2012 = flattened_hs2012.merge(short_names, left_on='level3_en', right_on='original_name', how='left') \
    [['level0', 'level0_en', 'level0_es', 'level2', 'level2_en', 'level2_es', 'level3', 'level3_en', 'level3_es', 'short_name_en', 'short_name_es', 'level5', 'level5_en', 'level5_es']] \
    .rename(columns={'short_name_en': 'level3_short_en', 'short_name_es': 'level3_short_es'})

In [94]:
flattened_hs2012.loc[:, 'level3_en'] = flattened_hs2012.apply(lambda h: h['level3_short_en'] if pd.notnull(h['level3_short_en']) else h['level3_en'], axis=1)
flattened_hs2012.loc[:, 'level3_es'] = flattened_hs2012.apply(lambda h: h['level3_short_es'] if pd.notnull(h['level3_short_es']) else h['level3_es'], axis=1)

In [95]:
flattened_hs2012

Unnamed: 0,level0,level0_en,level0_es,level2,level2_en,level2_es,level3,level3_en,level3_es,level3_short_en,level3_short_es,level5,level5_en,level5_es,culo
0,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010101,Horses,Caballos,Horses,Caballos,01010121,"Horses; live, pure-bred breeding animals",Reproductores de raza pura,Horses
1,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010101,Horses,Caballos,Horses,Caballos,01010129,"Horses; live, other than pure-bred breeding an...",Los demás,Horses
2,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010101,Horses,Caballos,Horses,Caballos,01010130,Asses; live,Asnos,Horses
3,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010101,Horses,Caballos,Horses,Caballos,01010190,Mules and hinnies; live,Los demás,Horses
4,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010102,Bovine,Bovino,Bovine,Bovino,01010221,"Cattle; live, pure-bred breeding animals",Reproductores de raza pura,Bovine
5,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010102,Bovine,Bovino,Bovine,Bovino,01010229,"Cattle; live, other than pure-bred breeding an...",Los demás,Bovine
6,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010102,Bovine,Bovino,Bovine,Bovino,01010231,"Buffalo; live, pure-bred breeding animals",Reproductores de raza pura,Bovine
7,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010102,Bovine,Bovino,Bovine,Bovino,01010239,"Buffalo; live, other than pure-bred breeding a...",Los demás,Bovine
8,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010102,Bovine,Bovino,Bovine,Bovino,01010290,"Bovine animals; live, other than cattle and bu...",Los demás,Bovine
9,01,Live Animals; Animal Products,Animales Vivos Y Productos Del Reino Animal,0101,Animals; live,Animales Vivos,010103,Pigs,Cerdos,Pigs,Cerdos,01010310,"Swine; live, pure-bred breeding animals",Reproductores de raza pura,Pigs


# Save to DB

In [96]:
db.to_sql(flattened_hs2012[['level0', 'level0_en', 'level0_es', 'level2', 'level2_en', 'level2_es', 'level3', 'level3_en', 'level3_es', 'level5', 'level5_en', 'level5_es']],
          'economy', 
          'dim_hs2012')

engine.execute("""
CREATE INDEX level3_idx 
ON economy.dim_hs2012 (level3)
""")

engine.execute("""
CREATE INDEX level5_idx 
ON economy.dim_hs2012 (level5)
""")



DROP TABLE IF EXISTS economy.dim_hs2012;
CREATE TABLE "economy"."dim_hs2012" (
"level0" TEXT,
  "level0_en" TEXT,
  "level0_es" TEXT,
  "level2" TEXT,
  "level2_en" TEXT,
  "level2_es" TEXT,
  "level3" TEXT,
  "level3_en" TEXT,
  "level3_es" TEXT,
  "level5" TEXT,
  "level5_en" TEXT,
  "level5_es" TEXT
)
COPY "economy"."dim_hs2012" ("level0","level0_en","level0_es","level2","level2_en","level2_es","level3","level3_en","level3_es","level5","level5_en","level5_es") FROM STDIN WITH CSV HEADER DELIMITER ',';


<sqlalchemy.engine.result.ResultProxy at 0x112eae7b8>