In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import glob
import os

In [2]:
# Globals

filepath_glob_expression = "../data/raw/international_holdings/*.txt"
parquet_filepath = "../data/international_holdings.parq"
files = glob.glob(filepath_glob_expression)


In [3]:
all_shapes = []
all_columns = []
for file in files:
    tmp = pd.read_csv(file, nrows=10, sep=',')
    all_shapes.append(tmp.shape)
    all_columns.append(tmp.columns)
assert all([x == all_shapes[0] for x in all_shapes])
assert all([all(x) == all(all_columns[0]) for x in all_columns])

In [4]:
# TODO: Map Headers To English
# TODO: Enforce Expected Data Types

data = dd.read_csv(filepath_glob_expression, sep=";").compute()

In [5]:
data.head()

Unnamed: 0,Run Fondo,Nombre Fondo,FFM_6020100,FFM_6020200,FFM_6020300,FFM_6020400,FFM_6020500,FFM_6020600,FFM_6020700,FFM_6020800,...,FFM_rel_6021111,FFM_6021112,FFM_6021113,FFM_6021114,FFM_6021200,FFM_6021300,FFM_6021400,FFM_6021511,FFM_6021512,FFM_6021513
0,8032,BICE EST BALANCEADA,EPI,WISDOM TREE INDIA EARNINGS I,US,ETFA,,1,,GRUPO 99,...,14821.1,3,0,,136339,PROM,US,0.0,0.0,0.454
1,8032,BICE EST BALANCEADA,SPY,SPDR TRUST SERIES I,US,ETFA,,1,,GRUPO 99,...,126058.38,3,0,,4279430,PROM,US,0.0,0.0,14.243
2,8032,BICE EST BALANCEADA,SPY,SPDR TRUST SERIES I,US,ETFA,,1,,GRUPO 99,...,126058.38,3,0,,315146,PROM,US,0.0,0.0,1.049
3,8032,BICE EST BALANCEADA,VCR,VANGUARD CONSUMER DISCRET,US,ETFA,,1,,GRUPO 99,...,71583.72,3,0,,575533,PROM,US,0.0,0.0,1.915
4,8032,BICE EST BALANCEADA,VFH,VANGUARD FINANCIALS ETF,US,ETFA,,1,,GRUPO 99,...,29680.13,3,0,,504414,PROM,US,0.0,0.0,1.679


In [6]:

data.columns

Index(['Run Fondo', 'Nombre Fondo', 'FFM_6020100', 'FFM_6020200',
       'FFM_6020300', 'FFM_6020400', 'FFM_6020500', 'FFM_6020600',
       'FFM_6020700', 'FFM_6020800', 'FFM_6020900', 'FFM_6021000',
       'FFM_tir_6021111', 'FFM_par_6021111', 'FFM_rel_6021111', 'FFM_6021112',
       'FFM_6021113', 'FFM_6021114', 'FFM_6021200', 'FFM_6021300',
       'FFM_6021400', 'FFM_6021511', 'FFM_6021512', 'FFM_6021513'],
      dtype='object')

In [7]:
data.shape

(428053, 24)

In [8]:
data.rename(columns={'Run Fondo': "Fund Id",
'Nombre Fondo': "Fund Name",
'FFM_6020100': "Asset Name",
'FFM_6020200': "Issuer Name",
'FFM_6020300': "Country Code",
'FFM_6020400': "Asset Type",
'FFM_6020500': "Asset Maturity",
'FFM_6020600': "Asset Restrictions",
'FFM_6020700': "Risk Classification",
'FFM_6020800': "Asset Group Company",
'FFM_6020900': "Number Of Units",
'FFM_6021000': "Unit Type",
'FFM_tir_6021111': "Rate",
'FFM_par_6021111': "PAR Value",
'FFM_rel_6021111': "Price",
'FFM_6021112': "Price Code",
'FFM_6021113': "Rate Convention In Days",
'FFM_6021114': "Valuation At End Of Month",
'FFM_6021200': "Currency Code Of Sale",
'FFM_6021300': "Transaction Country Code",
'FFM_6021400': "Percentage Of Holdings Of Total Asset",
'FFM_6021511': "Percentage Of Issuer Capital",
'FFM_6021512': "Percentage Of Issuer",
'FFM_6021513': "Percentage of Asset In Fund"}, inplace=True)

In [9]:

data.head()

Unnamed: 0,Fund Id,Fund Name,Asset Name,Issuer Name,Country Code,Asset Type,Asset Maturity,Asset Restrictions,Risk Classification,Asset Group Company,...,Price,Price Code,Rate Convention In Days,Valuation At End Of Month,Currency Code Of Sale,Transaction Country Code,Percentage Of Holdings Of Total Asset,Percentage Of Issuer Capital,Percentage Of Issuer,Percentage of Asset In Fund
0,8032,BICE EST BALANCEADA,EPI,WISDOM TREE INDIA EARNINGS I,US,ETFA,,1,,GRUPO 99,...,14821.1,3,0,,136339,PROM,US,0.0,0.0,0.454
1,8032,BICE EST BALANCEADA,SPY,SPDR TRUST SERIES I,US,ETFA,,1,,GRUPO 99,...,126058.38,3,0,,4279430,PROM,US,0.0,0.0,14.243
2,8032,BICE EST BALANCEADA,SPY,SPDR TRUST SERIES I,US,ETFA,,1,,GRUPO 99,...,126058.38,3,0,,315146,PROM,US,0.0,0.0,1.049
3,8032,BICE EST BALANCEADA,VCR,VANGUARD CONSUMER DISCRET,US,ETFA,,1,,GRUPO 99,...,71583.72,3,0,,575533,PROM,US,0.0,0.0,1.915
4,8032,BICE EST BALANCEADA,VFH,VANGUARD FINANCIALS ETF,US,ETFA,,1,,GRUPO 99,...,29680.13,3,0,,504414,PROM,US,0.0,0.0,1.679


In [10]:
if os.path.exists(parquet_filepath):
  os.remove(parquet_filepath)

data.to_parquet(parquet_filepath, compression='gzip')

In [11]:
pd.read_parquet(parquet_filepath)

Unnamed: 0,Fund Id,Fund Name,Asset Name,Issuer Name,Country Code,Asset Type,Asset Maturity,Asset Restrictions,Risk Classification,Asset Group Company,...,Price,Price Code,Rate Convention In Days,Valuation At End Of Month,Currency Code Of Sale,Transaction Country Code,Percentage Of Holdings Of Total Asset,Percentage Of Issuer Capital,Percentage Of Issuer,Percentage of Asset In Fund
0,8032,BICE EST BALANCEADA,EPI,WISDOM TREE INDIA EARNINGS I,US,ETFA,,1,,GRUPO 99,...,14821.10,3,0,,136339,PROM,US,0.0,0.0,0.454
1,8032,BICE EST BALANCEADA,SPY,SPDR TRUST SERIES I,US,ETFA,,1,,GRUPO 99,...,126058.38,3,0,,4279430,PROM,US,0.0,0.0,14.243
2,8032,BICE EST BALANCEADA,SPY,SPDR TRUST SERIES I,US,ETFA,,1,,GRUPO 99,...,126058.38,3,0,,315146,PROM,US,0.0,0.0,1.049
3,8032,BICE EST BALANCEADA,VCR,VANGUARD CONSUMER DISCRET,US,ETFA,,1,,GRUPO 99,...,71583.72,3,0,,575533,PROM,US,0.0,0.0,1.915
4,8032,BICE EST BALANCEADA,VFH,VANGUARD FINANCIALS ETF,US,ETFA,,1,,GRUPO 99,...,29680.13,3,0,,504414,PROM,US,0.0,0.0,1.679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5260,9961,FM BCI EST. AHORRO 3,FIBRBZ0125,FIBRIA OVERSEAS FINANCE,BR,BBFE,14/01/2025,1,BBB,,...,,1,360,NC,398082,PROM,US,0.0,0.0,3.557
5261,9961,FM BCI EST. AHORRO 3,BCOCPE0929,BANCO CONTINENTAL S.A.,PE,BBFE,22/09/2029,1,BBB,,...,,1,360,NC,660374,PROM,US,0.0,0.0,5.901
5262,9961,FM BCI EST. AHORRO 3,BSMXB0425,BANCO SANTANDER MEXICO,MX,BBFE,17/04/2025,1,BBB,,...,,1,360,NC,1108197,PROM,US,0.0,0.0,9.902
5263,9987,ESG EMERGING STARS,NEMSBIU LX EQUITY,NORDEA 1 SICAV - EMERGING STAR,LU,CFME,,1,,,...,160609.97,3,0,,1677109,CU,LU,0.0,0.0,98.044


In [2]:
data.dtype

NameError: name 'data' is not defined