In [1]:
import glob
import os

import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar

In [2]:
# Notebook-wide settings: where the raw files live and where the export goes.

# Destination of the consolidated parquet export.
parquet_filepath = "../data/domestic_holdings.parq"
# Glob pattern selecting the raw text files to load.
filepath_glob_expression = "../data/raw/domestic_holdings/*.txt"

# Concrete list of the raw files that currently match the pattern.
files = glob.glob(filepath_glob_expression)


In [3]:
# TODO: investigate why *202012.txt doesn't contain data (file deleted for now).

# Sanity check before the full load: every raw file must yield the same
# sampled shape and the same column labels. Only the first 10 rows of each
# file are read, using the same ";" delimiter as the full load, so the check
# sees the real columns instead of one unsplit column per line.
assert files, f"no input files match {filepath_glob_expression!r}"

all_shapes = []
all_columns = []
for file in files:
    tmp = pd.read_csv(file, nrows=10, sep=';')
    all_shapes.append(tmp.shape)
    all_columns.append(tmp.columns)

assert all(shape == all_shapes[0] for shape in all_shapes), \
    "sampled shapes differ between input files"
# Index.equals compares the labels element by element, in order. The previous
# `all(x) == all(...)` only compared the truthiness of the labels and therefore
# could never detect a mismatching header.
assert all(columns.equals(all_columns[0]) for columns in all_columns), \
    "column headers differ between input files"

In [4]:
# Describe all files matching the glob as one lazy dask dataframe, then
# materialise it with .compute().
# NOTE(review): the float64 override for FFM_6010800 is presumably there because
# dtype inference is inconsistent across files for that column -- confirm.
column_dtypes = {'FFM_6010800': 'float64'}
lazy_frame = dd.read_csv(filepath_glob_expression, sep=";", dtype=column_dtypes)
data = lazy_frame.compute()

In [5]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Run Fondo,Nombre Fondo,FFM_6010100,FFM_6010211,FFM_6010212,FFM_6010300,FFM_6010400,FFM_6010500,FFM_6010600,FFM_6010700,...,FFM_REL_6011111,FFM_6011112,FFM_6011113,FFM_6011114,FFM_6011200,FFM_6011300,FFM_6011400,FFM_6011511,FFM_6011512,FFM_6011513
0,8001,CAPITALISA-ACC.,AESGENER,94272000,9,CL,ACC,,1,N-2,...,329.71,3,0,,98254,$$,CL,0.004,0.003,4.381
1,8001,CAPITALISA-ACC.,AGUAS-A,61808000,5,CL,ACC,,1,N-1,...,361.51,3,0,,129279,$$,CL,0.006,0.008,5.764
2,8001,CAPITALISA-ACC.,BCI,97006000,6,CL,ACC,,1,N-1,...,26384.26,3,0,,62504,$$,CL,0.002,0.0,2.787
3,8001,CAPITALISA-ACC.,BSANTANDER,97036000,K,CL,ACC,,1,N-1,...,30.15,3,0,,156188,$$,CL,0.003,0.001,6.964
4,8001,CAPITALISA-ACC.,BUPACL,76005001,6,CL,ACC,,1,N-3,...,507.0,3,0,,58930,$$,CL,0.018,0.012,2.628


In [6]:

# Show the raw column labels of the loaded frame as the cell output.
data.columns

Index(['Run Fondo', 'Nombre Fondo', 'FFM_6010100', 'FFM_6010211',
       'FFM_6010212', 'FFM_6010300', 'FFM_6010400', 'FFM_6010500',
       'FFM_6010600', 'FFM_6010700', 'FFM_6010800', 'FFM_6010900',
       'FFM_6011000', 'FFM_TIR_6011111', 'FFM_PAR_6011111', 'FFM_REL_6011111',
       'FFM_6011112', 'FFM_6011113', 'FFM_6011114', 'FFM_6011200',
       'FFM_6011300', 'FFM_6011400', 'FFM_6011511', 'FFM_6011512',
       'FFM_6011513'],
      dtype='object')

In [7]:
# Mapping from the raw source headers to descriptive English labels.
# NOTE(review): "Vertification Id" looks like a typo for "Verification Id", and
# "Number Of Shared By Asset Agent" may be meant as "Number Of Shares ...".
# Both are kept unchanged because these labels are persisted in the parquet
# export -- confirm with downstream consumers before correcting them.
column_renames = {
    "Run Fondo": "Fund Id",
    "Nombre Fondo": "Fund Name",
    "FFM_6010100": "Asset Id",
    "FFM_6010211": "Asset Agent Id",
    "FFM_6010212": "Vertification Id",
    "FFM_6010300": "Country Code",
    "FFM_6010400": "Asset Type",
    "FFM_6010500": "Maturity Date",
    "FFM_6010600": "Asset Restrictions",
    "FFM_6010700": "Risk Classification",
    "FFM_6010800": "Company Group Classification",
    "FFM_6010900": "Number Of Units",
    "FFM_6011000": "Unit Or Currency Type",
    "FFM_TIR_6011111": "Asset Rate",
    "FFM_PAR_6011111": "Price Compared To PAR Value",
    "FFM_REL_6011111": "Price",
    "FFM_6011112": "Price Code",
    "FFM_6011113": "Price Convention In Days",
    "FFM_6011114": "Type Of Interest",
    "FFM_6011200": "Valuation At End Of Month",
    "FFM_6011300": "Currency Code",
    "FFM_6011400": "Transaction Country Code",
    "FFM_6011511": "Number Of Shared By Asset Agent",
    "FFM_6011512": "Percent Of Asset Fund Has Invested",
    "FFM_6011513": "Percent Of Fund Asset Is Held",
}

# Rebind `data` to the renamed frame instead of using inplace=True, so the frame
# object displayed by the earlier cells is not mutated. Labels missing from the
# frame are ignored by default, which keeps the cell safe to re-run.
data = data.rename(columns=column_renames)

In [8]:
# Export the renamed frame to parquet. Any previous export at the same path is
# removed first so the file is always written from scratch.
# Requires `os`, which is imported in the notebook's import cell.
if os.path.exists(parquet_filepath):
    os.remove(parquet_filepath)

# gzip-compressed parquet file at the configured output path.
data.to_parquet(parquet_filepath, compression='gzip')

In [9]:
# Read the export back from disk; the resulting frame is shown as the cell output.
pd.read_parquet(path=parquet_filepath)

Unnamed: 0,Fund Id,Fund Name,Asset Id,Asset Agent Id,Vertification Id,Country Code,Asset Type,Maturity Date,Asset Restrictions,Risk Classification,...,Price,Price Code,Price Convention In Days,Type Of Interest,Valuation At End Of Month,Currency Code,Transaction Country Code,Number Of Shared By Asset Agent,Percent Of Asset Fund Has Invested,Percent Of Fund Asset Is Held
0,8001,CAPITALISA-ACC.,AESGENER,94272000,9,CL,ACC,,1,N-2,...,329.71,3,0,,98254,$$,CL,0.004,0.003,4.381
1,8001,CAPITALISA-ACC.,AGUAS-A,61808000,5,CL,ACC,,1,N-1,...,361.51,3,0,,129279,$$,CL,0.006,0.008,5.764
2,8001,CAPITALISA-ACC.,BCI,97006000,6,CL,ACC,,1,N-1,...,26384.26,3,0,,62504,$$,CL,0.002,0.000,2.787
3,8001,CAPITALISA-ACC.,BSANTANDER,97036000,K,CL,ACC,,1,N-1,...,30.15,3,0,,156188,$$,CL,0.003,0.001,6.964
4,8001,CAPITALISA-ACC.,BUPACL,76005001,6,CL,ACC,,1,N-3,...,507.00,3,0,,58930,$$,CL,0.018,0.012,2.628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35010,9981,FM BCI EST. DEU. CAL,BCMPC-F,96596540,8,CL,BE,24/03/2030,1,AA,...,,1,365,RC,1595295,$$,CL,0.000,0.020,10.949
35011,9981,FM BCI EST. DEU. CAL,BWNCO-B,92147000,2,CL,BE,01/10/2030,1,A,...,,1,365,RC,1593659,$$,CL,0.000,0.000,10.938
35012,9981,FM BCI EST. DEU. CAL,BTMOV-O,76124890,1,CL,BE,01/12/2025,1,AA,...,,1,365,NC,1056353,$$,CL,0.000,0.070,7.250
35013,9981,FM BCI EST. DEU. CAL,BNPDBC010221,97029000,1,CL,PDBC,01/02/2021,1,,...,,1,30,NL,309998,$$,CL,0.000,0.001,2.128
