# New Trade Data

In [37]:
import sys
import configparser
import os
import numpy as np
# Load the project settings. The context manager closes settings.ini as soon
# as it has been parsed instead of leaving the file handle open.
config = configparser.ConfigParser()
with open('../../settings.ini') as settings_file:
    config.read_file(settings_file)

# Put the configured library directory first on sys.path. This has to run
# before the local-module imports that follow (postgres.py, commons.py).
sys.path.insert(0, config.get('PATHS','libs_path'))
engine_path = config.get('DATABASE','engine_path')  # handed to create_engine() further down
local_path = './data/'  # working directory for the downloaded/extracted files

import postgres #from local file postgres.py
import commons
from commons import inline_table_xml, download_file, download_zip_file, extract_zip_file #from local file commons.py
from importlib import reload

import json
import pandas as pd
from sqlalchemy import create_engine

# Build the SQLAlchemy engine from the engine_path value read from settings.ini.
engine = create_engine(engine_path)
# Wrap the engine in the project's PostgresDriver (local postgres.py module);
# `db` is what the "Save to DB" cells call to_sql() on.
db = postgres.PostgresDriver(engine)

# Exports

In [38]:
# Fetch the HS2012 exports archive into local_path.
download_zip_file(
    "http://pacha.datawheel.us/datachile/economy/aduanas_open_data/4_clean_data/",
    local_path,
    "exports_data_all_in_hs12.zip",
)

# Unpack the archive. "temp.zip" is presumably the name download_zip_file
# stores the download under -- confirm in commons.py.
extract_zip_file(local_path, "temp.zip")

Downloading... http://pacha.datawheel.us/datachile/economy/aduanas_open_data/4_clean_data/exports_data_all_in_hs12.zip
Unzipping... ./data/temp.zip


True

### Read and clean data

In [39]:
# Load the cleaned exports extract, keeping hs12 as text (numeric parsing
# would presumably strip leading zeros from the product codes).
#
# Rows missing any dimension (comuna, country, product) or the measure
# (FOB value) are dropped. dropna() works whatever dtype the columns were
# inferred as, whereas np.isnan() raises TypeError on an object column.
facts_exports = pd.read_csv(
    os.path.join(local_path, "4_clean_data", "exports_data_all_in_hs12.csv"),
    dtype={'hs12': str},
    low_memory=True,
).dropna(subset=['comuna_datachile_id', 'country_code', 'hs12', 'fob_value_usd'])


### Group data by all dimensions

In [41]:
# Sum the FOB value over every (year, country, comuna, product) combination.
grouped_fact_exports = (
    facts_exports[['year', 'country_code', 'comuna_datachile_id', 'hs12', 'fob_value_usd']]
    .groupby(['year', 'country_code', 'comuna_datachile_id', 'hs12'])
    .sum()
    .reset_index()
)

# Cast the id columns to int by replacing the columns outright.
# `df.loc[:, col] = ...` writes into the existing column on pandas >= 2.0 and
# therefore keeps its float dtype, which silently undoes the cast.
grouped_fact_exports['country_code'] = grouped_fact_exports['country_code'].astype(int)
grouped_fact_exports['comuna_datachile_id'] = grouped_fact_exports['comuna_datachile_id'].astype(int)

In [42]:
# Product dimension lookup: short_level5 is level5 from its third character
# onward, which is the form the hs12 column is matched against.
hs2012_lookup = pd.read_sql(
    "SELECT level5, substring(level5 from 3) as short_level5 FROM economy.dim_hs2012",
    engine,
)

# Inner join: rows whose hs12 has no counterpart in dim_hs2012 are dropped.
grouped_fact_exports = grouped_fact_exports.merge(
    hs2012_lookup,
    how='inner',
    left_on='hs12',
    right_on='short_level5',
)

In [43]:
# Replace the short product code with the full level5 key brought in by the merge.
grouped_fact_exports['hs12'] = grouped_fact_exports['level5']

In [45]:
# Remove the two columns that only existed to perform the dimension join.
del grouped_fact_exports['level5'], grouped_fact_exports['short_level5']

### Save to DB

In [46]:
# Load the aggregated exports into economy.fact_exports. Judging by the
# recorded cell output, to_sql drops and recreates the table before the bulk
# COPY, so the indexes below have to be created again after every load.
db.to_sql(grouped_fact_exports, 'economy', 'fact_exports')

# Index names carry the "exports_" prefix, matching exports_comuna_idx and the
# imports_* indexes. An unprefixed name such as year_idx would clash with any
# other index of that name in the economy schema.
engine.execute("""
CREATE INDEX exports_year_idx
ON economy.fact_exports ("year")
""")

engine.execute("""
CREATE INDEX exports_hs12_idx
ON economy.fact_exports (hs12)
""")

engine.execute("""
CREATE INDEX exports_comuna_idx
ON economy.fact_exports (comuna_datachile_id)
""")

DROP TABLE IF EXISTS economy.fact_exports;
CREATE TABLE "economy"."fact_exports" (
"year" INTEGER,
  "country_code" INTEGER,
  "comuna_datachile_id" INTEGER,
  "hs12" TEXT,
  "fob_value_usd" REAL
)
COPY "economy"."fact_exports" ("year","country_code","comuna_datachile_id","hs12","fob_value_usd") FROM STDIN WITH CSV HEADER DELIMITER ',';


<sqlalchemy.engine.result.ResultProxy at 0x7f4efcbd5080>

# Imports

In [57]:
# Fetch the HS2012 imports archive into local_path.
download_zip_file(
    "http://pacha.datawheel.us/datachile/economy/aduanas_open_data/4_clean_data/",
    local_path,
    "imports_data_all_in_hs12.zip",
)

# Unpack the archive. "temp.zip" is presumably the name download_zip_file
# stores the download under -- confirm in commons.py.
extract_zip_file(local_path, "temp.zip")

Downloading... http://pacha.datawheel.us/datachile/economy/aduanas_open_data/4_clean_data/imports_data_all_in_hs12.zip
Unzipping... ./data/temp.zip


True

### Read and clean data

In [58]:
# Load the cleaned imports extract, keeping hs12 as text (numeric parsing
# would presumably strip leading zeros from the product codes).
#
# Rows missing any dimension (comuna, country, product) or the measure
# (CIF value) are dropped. dropna() works whatever dtype the columns were
# inferred as, whereas np.isnan() raises TypeError on an object column.
facts_imports = pd.read_csv(
    os.path.join(local_path, "4_clean_data", "imports_data_all_in_hs12.csv"),
    dtype={'hs12': str},
    low_memory=True,
).dropna(subset=['comuna_datachile_id', 'country_code', 'hs12', 'cif_value_usd'])


### Group data by all dimensions

In [59]:
# Sum the CIF value over every (year, country, comuna, product) combination.
grouped_fact_imports = (
    facts_imports[['year', 'country_code', 'comuna_datachile_id', 'hs12', 'cif_value_usd']]
    .groupby(['year', 'country_code', 'comuna_datachile_id', 'hs12'])
    .sum()
    .reset_index()
)

# Cast the id columns to int by replacing the columns outright.
# `df.loc[:, col] = ...` writes into the existing column on pandas >= 2.0 and
# therefore keeps its float dtype, which silently undoes the cast.
grouped_fact_imports['country_code'] = grouped_fact_imports['country_code'].astype(int)
grouped_fact_imports['comuna_datachile_id'] = grouped_fact_imports['comuna_datachile_id'].astype(int)

In [60]:
# Product dimension lookup: short_level5 is level5 from its third character
# onward, which is the form the hs12 column is matched against.
hs2012_lookup = pd.read_sql(
    "SELECT level5, substring(level5 from 3) as short_level5 FROM economy.dim_hs2012",
    engine,
)

# Inner join: rows whose hs12 has no counterpart in dim_hs2012 are dropped.
grouped_fact_imports = grouped_fact_imports.merge(
    hs2012_lookup,
    how='inner',
    left_on='hs12',
    right_on='short_level5',
)

In [61]:
# Replace the short product code with the full level5 key brought in by the merge.
grouped_fact_imports['hs12'] = grouped_fact_imports['level5']

In [63]:
# Remove the two columns that only existed to perform the dimension join.
del grouped_fact_imports['level5'], grouped_fact_imports['short_level5']

### Save to DB

In [64]:
# Load the aggregated imports into economy.fact_imports, then index the
# year, product and comuna columns.
db.to_sql(grouped_fact_imports, 'economy', 'fact_imports')

# The DDL is spelled with explicit "\n" escapes so the exact statement text
# (including the space before each line break) stays visible in the source.
engine.execute(
    '\nCREATE INDEX imports_year_idx \n'
    'ON economy.fact_imports ("year")\n'
)

engine.execute(
    '\nCREATE INDEX imports_hs12_idx \n'
    'ON economy.fact_imports (hs12)\n'
)

engine.execute(
    '\nCREATE INDEX imports_comuna_idx \n'
    'ON economy.fact_imports (comuna_datachile_id)\n'
)

DROP TABLE IF EXISTS economy.fact_imports;
CREATE TABLE "economy"."fact_imports" (
"year" INTEGER,
  "country_code" INTEGER,
  "comuna_datachile_id" INTEGER,
  "hs12" TEXT,
  "cif_value_usd" REAL
)
COPY "economy"."fact_imports" ("year","country_code","comuna_datachile_id","hs12","cif_value_usd") FROM STDIN WITH CSV HEADER DELIMITER ',';


<sqlalchemy.engine.result.ResultProxy at 0x7f4efccc3e48>

# Exports in HS92

In [24]:
# Fetch the HS1992 exports archive into local_path.
download_zip_file(
    "http://pacha.datawheel.us/datachile/economy/aduanas_open_data/4_clean_data/",
    local_path,
    "exports_data_all_in_hs92.zip",
)

# Unpack the archive. "temp.zip" is presumably the name download_zip_file
# stores the download under -- confirm in commons.py.
# NOTE(review): the recorded output of this cell shows only the "Unzipping..."
# line and no "Downloading..." line -- confirm the download was not skipped,
# which would leave a stale temp.zip to be extracted here.
extract_zip_file(local_path, "temp.zip")

Unzipping... ./data/temp.zip


True

In [28]:
# Load the cleaned HS1992 exports extract, keeping hs92 as text (numeric
# parsing would presumably strip leading zeros from the product codes).
#
# Rows missing any dimension (comuna, country, product) or the measure
# (FOB value) are dropped. dropna() works whatever dtype the columns were
# inferred as, whereas np.isnan() raises TypeError on an object column.
facts_exports_92 = pd.read_csv(
    os.path.join(local_path, "4_clean_data", "exports_data_all_in_hs92.csv"),
    dtype={'hs92': str},
    low_memory=True,
).dropna(subset=['comuna_datachile_id', 'country_code', 'hs92', 'fob_value_usd'])


### Group data by all dimensions

In [33]:
# Sum the FOB value over every (year, country, comuna, product) combination.
grouped_fact_exports_92 = (
    facts_exports_92[['year', 'country_code', 'comuna_datachile_id', 'hs92', 'fob_value_usd']]
    .groupby(['year', 'country_code', 'comuna_datachile_id', 'hs92'])
    .sum()
    .reset_index()
)

# Cast the id columns to int by replacing the columns outright.
# `df.loc[:, col] = ...` writes into the existing column on pandas >= 2.0 and
# therefore keeps its float dtype, which silently undoes the cast.
grouped_fact_exports_92['country_code'] = grouped_fact_exports_92['country_code'].astype(int)
grouped_fact_exports_92['comuna_datachile_id'] = grouped_fact_exports_92['comuna_datachile_id'].astype(int)

### Save to DB

In [36]:
# Load the aggregated HS1992 exports into economy.fact_exports_92, then index
# the year, product and comuna columns.
db.to_sql(grouped_fact_exports_92, 'economy', 'fact_exports_92')

# The DDL is spelled with explicit "\n" escapes so the exact statement text
# (including the space before each line break) stays visible in the source.
engine.execute(
    '\nCREATE INDEX fact_exports_92_year_idx \n'
    'ON economy.fact_exports_92 ("year")\n'
)

engine.execute(
    '\nCREATE INDEX fact_exports_92_hs92_idx \n'
    'ON economy.fact_exports_92 (hs92)\n'
)

engine.execute(
    '\nCREATE INDEX fact_exports_92_exports_comuna_idx \n'
    'ON economy.fact_exports_92 (comuna_datachile_id)\n'
)

DROP TABLE IF EXISTS economy.fact_exports_92;
CREATE TABLE "economy"."fact_exports_92" (
"year" INTEGER,
  "country_code" INTEGER,
  "comuna_datachile_id" INTEGER,
  "hs92" TEXT,
  "fob_value_usd" REAL
)
COPY "economy"."fact_exports_92" ("year","country_code","comuna_datachile_id","hs92","fob_value_usd") FROM STDIN WITH CSV HEADER DELIMITER ',';


<sqlalchemy.engine.result.ResultProxy at 0x7f4efce39940>