In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import glob

In [2]:
def first_mismatch(items):
    """Return the index of the first element that differs from ``items[0]``.

    Parameters
    ----------
    items : sequence
        Values to compare against the first one (e.g. shapes or lists of
        column names).

    Returns
    -------
    int or None
        Position of the first differing element, or ``None`` when every
        element equals the first one (including when ``items`` is empty).
    """
    for position, item in enumerate(items):
        if item != items[0]:
            return position
    return None


# Read only the first rows of every raw CSV so the schema check stays cheap.
# Sorting makes the reference file (files[0]) deterministic across runs.
files = sorted(glob.glob('../data/raw/*.csv'))
all_shapes = []
all_columns = []
for file in files:
    tmp = pd.read_csv(file, nrows=10)
    all_shapes.append(tmp.shape)
    all_columns.append(tmp.columns)

# Every sample must have the same shape as the first one.
bad = first_mismatch(all_shapes)
assert bad is None, (
    f'{files[bad]} has sample shape {all_shapes[bad]}, expected {all_shapes[0]}'
)

# Compare the column names themselves, in order. The previous check compared
# all(x) == all(reference), i.e. only the truthiness of the names, so it
# passed for any two files with non-empty headers.
bad = first_mismatch([list(columns) for columns in all_columns])
assert bad is None, (
    f'{files[bad]} has columns {list(all_columns[bad])}, '
    f'expected {list(all_columns[0])}'
)

In [3]:
# Build one lazy dask frame over all raw CSVs; 'fecha' is parsed as a date.
# Nothing is loaded here -- rows are read when a result is requested.
data = dd.read_csv(
    '../data/raw/*.csv',
    parse_dates=['fecha'],
)

In [4]:
# Preview the first rows of the lazily loaded frame to sanity-check the parse.
data.head()

Unnamed: 0.1,Unnamed: 0,administratorName,fundRUN,fundName,fundSeries,netPatrimony,efectivePatrimony,installmentsInCirculation,providedInstallments,providedFlow,...,currency,bloombergCode,pensionFunds,fixedRemuneration,variableRemuneration,affectedExpenses,unaffectedExpenses,investmentCommission,rescuedCommission,fecha
0,0,BANCHILE,8001-2,CAPITALISA-ACC,A,2020235000.0,2020236000.0,5877039.0,0.0,0.0,...,P,BCHCAPT CI,N,346510.0,0.0,0.0,0.0,0.0,0.0,2015-01-01
1,1,BANCHILE,8001-2,CAPITALISA-ACC,B,52909410.0,52909400.0,115618.6,0.0,0.0,...,P,BCHCACB CI,N,2899.0,0.0,0.0,0.0,0.0,0.0,2015-01-01
2,2,BANCHILE,8001-2,CAPITALISA-ACC,C,182280900.0,182280900.0,234336.7,0.0,0.0,...,P,BCHCACC CI,N,14858.0,0.0,0.0,0.0,0.0,0.0,2015-01-01
3,3,BANCHILE,8001-2,CAPITALISA-ACC,S,0.0,0.0,0.0,0.0,0.0,...,P,BCHCACS CI,N,0.0,0.0,0.0,0.0,0.0,0.0,2015-01-01
4,4,BANCHILE,8023-3,HORIZONTE,A,51393070000.0,51419660000.0,1587761.0,0.0,0.0,...,P,BCHEMPR CI,N,2347098.0,0.0,0.0,0.0,0.0,5908.0,2015-01-01


In [5]:
# List the available column names; the projections below select from these.
data.columns

Index(['Unnamed: 0', 'administratorName', 'fundRUN', 'fundName', 'fundSeries',
       'netPatrimony', 'efectivePatrimony', 'installmentsInCirculation',
       'providedInstallments', 'providedFlow', 'rescuedInstallments',
       'rescuedFlow', 'installmentValue', 'totalParticipants',
       'institutionalParticipants', 'otherParticipants', 'APV', 'aafmCategory',
       'svsCategory', 'svsCategoryId', 'currency', 'bloombergCode',
       'pensionFunds', 'fixedRemuneration', 'variableRemuneration',
       'affectedExpenses', 'unaffectedExpenses', 'investmentCommission',
       'rescuedCommission', 'fecha'],
      dtype='object')

In [7]:
# Price table: one installment value per fund series and date.
cols = [
    'fundName',
    'fundSeries',
    'installmentValue',
    'bloombergCode',
    'fecha',
]
sub = data[cols]  # still lazy; only the selected columns are kept

# Materialise the selection as an in-memory pandas DataFrame, with a
# progress bar while dask reads the CSVs.
with ProgressBar():
    price_data = sub.compute()

[########################################] | 100% Completed | 19.0s


In [8]:
# Inspect the first rows of the materialised price table.
price_data.head()

Unnamed: 0,fundName,fundSeries,installmentValue,bloombergCode,fecha
0,CAPITALISA-ACC,A,343.7506,BCHCAPT CI,2015-01-01
1,CAPITALISA-ACC,B,457.6202,BCHCACB CI,2015-01-01
2,CAPITALISA-ACC,C,777.8586,BCHCACC CI,2015-01-01
3,CAPITALISA-ACC,S,1000.0,BCHCACS CI,2015-01-01
4,HORIZONTE,A,32368.2689,BCHEMPR CI,2015-01-01


In [9]:
# (rows, columns) of the price table.
price_data.shape

(5960032, 5)

In [11]:
# Persist the price table as gzip-compressed Parquet.
price_data.to_parquet(
    '../data/fund_prices.parq',
    compression='gzip',
)

In [12]:
# Fund attribute table: identifiers, categories, currency and the
# remuneration / commission columns, with exact duplicate rows removed.
# NOTE(review): in the recorded run 4,351,557 of 5,960,032 rows survive the
# de-duplication, so the remuneration/commission columns presumably change
# over time and keep most rows distinct -- confirm this is intended, since
# 'fecha' is not part of this table.
cols = [
    'fundRUN',
    'fundName',
    'fundSeries',
    'bloombergCode',
    'APV',
    'aafmCategory',
    'svsCategory',
    'svsCategoryId',
    'currency',
    'fixedRemuneration',
    'variableRemuneration',
    'investmentCommission',
    'rescuedCommission',
]
sub = data[cols].drop_duplicates()  # still lazy

# Materialise the de-duplicated selection as a pandas DataFrame.
with ProgressBar():
    fund_data = sub.compute()

[########################################] | 100% Completed | 31.0s


In [13]:
# (rows, columns) of the de-duplicated fund attribute table.
fund_data.shape

(4351557, 13)

In [14]:
# Inspect the first rows of the fund attribute table.
fund_data.head()

Unnamed: 0,fundRUN,fundName,fundSeries,bloombergCode,APV,aafmCategory,svsCategory,svsCategoryId,currency,fixedRemuneration,variableRemuneration,investmentCommission,rescuedCommission
0,8001-2,CAPITALISA-ACC,A,BCHCAPT CI,NO,Accionario Nacional Large CAP,FM DE INVERSION EN INSTRUMENTOS DE CAPITALIZACION,5.0,P,346510.0,0.0,0.0,0.0
1,8001-2,CAPITALISA-ACC,B,BCHCACB CI,SI,Accionario Nacional Large CAP,FM DE INVERSION EN INSTRUMENTOS DE CAPITALIZACION,5.0,P,2899.0,0.0,0.0,0.0
2,8001-2,CAPITALISA-ACC,C,BCHCACC CI,NO,Accionario Nacional Large CAP,FM DE INVERSION EN INSTRUMENTOS DE CAPITALIZACION,5.0,P,14858.0,0.0,0.0,0.0
3,8001-2,CAPITALISA-ACC,S,BCHCACS CI,NO,Accionario Nacional Large CAP,FM DE INVERSION EN INSTRUMENTOS DE CAPITALIZACION,5.0,P,0.0,0.0,0.0,0.0
4,8023-3,HORIZONTE,A,BCHEMPR CI,NO,"Fondos de Deuda > 365 Dias Nacional, Inversion...",FM DE INV.EN INST.DE DEUDA DE MEDIANO Y LARGO ...,3.0,P,2347098.0,0.0,0.0,5908.0


In [15]:
# Confirm which columns ended up in the fund attribute table.
fund_data.columns

Index(['fundRUN', 'fundName', 'fundSeries', 'bloombergCode', 'APV',
       'aafmCategory', 'svsCategory', 'svsCategoryId', 'currency',
       'fixedRemuneration', 'variableRemuneration', 'investmentCommission',
       'rescuedCommission'],
      dtype='object')

In [16]:
# Persist the fund attribute table as gzip-compressed Parquet.
fund_data.to_parquet(
    '../data/fund_data.parq',
    compression='gzip',
)

In [17]:
# Flow table: net patrimony plus provided / rescued flows per fund series
# and date.
cols = [
    'fundRUN',
    'fundName',
    'fundSeries',
    'bloombergCode',
    'netPatrimony',
    'providedFlow',
    'rescuedFlow',
    'fecha',
]
sub = data[cols]  # still lazy; only the selected columns are kept

# Materialise the selection as an in-memory pandas DataFrame.
with ProgressBar():
    fund_flows = sub.compute()

[########################################] | 100% Completed | 17.7s


In [18]:
# (rows, columns) of the flow table.
fund_flows.shape

(5960032, 8)

In [19]:
# Inspect the first rows of the flow table.
fund_flows.head()

Unnamed: 0,fundRUN,fundName,fundSeries,bloombergCode,netPatrimony,providedFlow,rescuedFlow,fecha
0,8001-2,CAPITALISA-ACC,A,BCHCAPT CI,2020235000.0,0.0,0.0,2015-01-01
1,8001-2,CAPITALISA-ACC,B,BCHCACB CI,52909410.0,0.0,0.0,2015-01-01
2,8001-2,CAPITALISA-ACC,C,BCHCACC CI,182280900.0,0.0,0.0,2015-01-01
3,8001-2,CAPITALISA-ACC,S,BCHCACS CI,0.0,0.0,0.0,2015-01-01
4,8023-3,HORIZONTE,A,BCHEMPR CI,51393070000.0,0.0,26592300.0,2015-01-01


In [20]:
# Persist the flow table as gzip-compressed Parquet.
fund_flows.to_parquet(
    '../data/fund_flows.parq',
    compression='gzip',
)