In [4]:
import os
import sys

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
# Database connection. The URL is taken from the DATACHILE_DB_URL environment
# variable when it is set, so credentials do not have to live in the notebook.
# FIXME: the fallback below still embeds a password in source control; rotate
# that password and remove the default once every environment sets the variable.
engine = create_engine(
    os.environ.get(
        'DATACHILE_DB_URL',
        'postgresql://datachile:yapoweon@hermes:5433/datachile',
    )
)

# Calculate revealed comparative advantage (RCA) for regions

In [None]:
# Load yearly export totals: `fob_us` summed per (the_year, region_id, hs_level3).
# dim_date supplies the year of each fact row and dim_comunas maps the
# exporter's comuna to its region.
df = pd.read_sql("""
SELECT
  dd.the_year,
  c.region_id,
  hs_level3,
  sum(fob_us) AS fob_us
FROM economy.pg_exports_grouped e
  INNER JOIN public.dim_date dd ON dd.id = date_id
  INNER JOIN public.dim_comunas c ON c.id = exporter_comuna_id
GROUP BY dd.the_year, c.region_id, hs_level3
""", engine)

In [20]:
def rca_y(year):
    """Return the RCA values of a single year in long (one row per cell) form.

    Reads the module-level ``df`` and calls ``rca``; note that ``rca`` is
    defined in a later cell of this notebook, so that cell has to be run
    before this function is called.

    Parameters
    ----------
    year
        Value compared against ``df['the_year']`` to select the rows to use.

    Returns
    -------
    pandas.DataFrame
        Columns ``region_id``, ``hs_level3``, ``rca`` and ``year``.
    """
    # Keep only this year's rows, then spread them into a
    # region_id (rows) x hs_level3 (columns) table of fob_us values.
    yearly_rows = df[df['the_year'] == year]
    wide = yearly_rows.pivot(index="region_id", columns="hs_level3", values="fob_us")

    # Compute the RCA matrix and fold it back into one row per (region, product).
    stacked = rca(wide).stack()
    long_form = pd.DataFrame(stacked, columns=["rca"]).reset_index()

    long_form['year'] = year
    return long_form

# Run rca_y once for every distinct year present in df and stack the
# per-year results into a single long table.
rca_years = pd.concat(rca_y(year) for year in df['the_year'].unique())
    



## Store RCA in DB

In [22]:
# Write the RCA table to economy.rca_hs_region, replacing the table if it
# already exists; the DataFrame index is not written as a column.
rca_years.to_sql(
    name='rca_hs_region',
    con=engine,
    schema='economy',
    if_exists='replace',
    index=False,
)

## Update the `exports` fact table (`pg_exports_grouped`) with the stored RCA values

In [None]:
# Copy each stored RCA value onto the matching export fact rows, matching on
# the exporter's region (via dim_comunas), the year (via dim_date) and hs_level3.
# Both economy tables are schema-qualified, matching how they are read and
# written elsewhere in this notebook, so the statement does not depend on the
# connection's search_path.
# NOTE: Engine.execute() was removed in SQLAlchemy 2.0; on 2.x this has to run
# through a connection (engine.begin()) with sqlalchemy.text().
engine.execute("""
UPDATE economy.pg_exports_grouped e
SET region_rca = r.rca
FROM public.dim_comunas c, public.dim_date d, economy.rca_hs_region r
WHERE c.id = e.exporter_comuna_id AND d.id = e.date_id AND r.region_id = c.region_id
      AND r."year" = d.the_year AND e.hs_level3 = r.hs_level3
""")


In [5]:
def rca(tbl, populations=None):
    """Compute the revealed comparative advantage (RCA) of every cell of ``tbl``.

    Without ``populations`` the value for row ``r`` and column ``c`` is::

        (tbl[r, c] / sum of row r) / (sum of column c / sum of the whole table)

    With ``populations`` the denominator is instead the row's share of the
    total population, ``populations[r] / populations.sum()``.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Numeric table. Missing values are treated as 0; the caller's object
        is not modified.
    populations : pandas.Series or pandas.DataFrame, optional
        Population weights, matched by label against the rows of ``tbl``.
        A Series is the form the arithmetic below is written for; a DataFrame
        goes through the same expressions and must reduce to a single total
        for ``float(populations.sum())`` to succeed.

    Returns
    -------
    pandas.DataFrame
        RCA values carrying the row and column labels of ``tbl``. Zero totals
        are not special-cased: such divisions yield NaN/inf per pandas rules.
        Values are returned as-is (not thresholded to 0/1).
    """
    # Treat missing cells as zero.
    tbl = tbl.fillna(0)

    # Share of each row's total held by each column. DataFrame.div(axis=0)
    # aligns the totals on the row labels and always performs true division;
    # it replaces the former Series.reshape + np.divide broadcast, which fails
    # on current pandas because Series.reshape no longer exists.
    row_totals = tbl.sum(axis=1)
    rca_numerator = tbl.div(row_totals, axis=0)

    if isinstance(populations, (pd.DataFrame, pd.Series)):
        # Population-based RCA: the denominator is each row's population share.
        rca_denominator = populations / float(populations.sum())

        # Dividing the transposed numerator lets pandas match the population
        # labels against the (now column) row labels; transpose back after.
        rcas = (rca_numerator.T / rca_denominator).T
    else:
        # Standard RCA: the denominator is each column's share of the grand total.
        column_totals = tbl.sum(axis=0)
        total_sum = column_totals.sum()
        rca_denominator = column_totals / total_sum

        rcas = rca_numerator / rca_denominator

    return rcas
