In [0]:
#Load libraries
import pandas as pd
import os

In [0]:
def get_characters(source,start = 0, end = 5, as_numeric=False):

  '''This is an auxiliary function that is called by function split_columns.
  It is not meant to be used directly by the user.
  
  Given a string of characters, split it from character number *start* to character number *end*-1.
  Some columns in B3's original dataset are monetary quantities given as strings with trailing zeros.
  Hence, for example, 000001234 to indicate R$12,34. Setting as_numeric=True removes the trailing zeros
  and converts the variable to numeric type.'''

  piece = source.str.slice(start, end)
  
  if as_numeric:
    #Convert to number
    piece = pd.to_numeric(piece)
  else:
    #Remove trailing spaces
    piece = piece.str.strip()
  
  return(piece)

In [0]:

def split_columns(file):

  '''This is an auxiliary function that is called by function get_dataset.
  It is not meant to be used directly by the user.

  The original file from B3 is a single column with sequential characters.
  This function splits that column into multiple columns,
  one with each variable.'''
  
  source = pd.Series(file.iloc[:,0])

  output = pd.DataFrame()
  output['date'] = get_characters(source, 2, 10)
  output['BDI'] = get_characters(source, 10, 12)
  output['security'] = get_characters(source, 12, 24)
  output['market_type'] = get_characters(source, 24, 27)
  output['company'] = get_characters(source, 27, 39)
  output['specification'] = get_characters(source, 39, 49)
  output['currency'] = get_characters(source, 52, 56)
  output['open'] = get_characters(source, 56, 69, as_numeric=True)
  output['high'] = get_characters(source, 69, 82, as_numeric=True)
  output['low'] = get_characters(source, 82, 95, as_numeric=True)
  output['average'] = get_characters(source, 95, 108, as_numeric=True)
  output['close'] = get_characters(source, 108, 121, as_numeric=True)
  output['volume'] = get_characters(source, 170, 188, as_numeric=True)

  return(output)

In [0]:
def get_dataset(file_path):

  '''The main function.
  Read the csv file with the raw data from B3 and puts it into tidy format'''

  import pandas as pd

  file = pd.read_csv(file_path)
  file = file.iloc[:(len(file)-1),:] #Remove last row
  file = split_columns(file)

  return(file)

In [13]:
#Unzip files (exactly as downloaded from B3)
!unzip COTAHIST_A2020
!unzip COTAHIST_A2019
!unzip COTAHIST_A2018
!unzip COTAHIST_A2017
!unzip COTAHIST_A2016
!unzip COTAHIST_A2015

Archive:  COTAHIST_A2020.ZIP
replace COTAHIST_A2020.TXT? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: COTAHIST_A2020.TXT      
Archive:  COTAHIST_A2019.ZIP
replace COTAHIST_A2019.TXT? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: COTAHIST_A2019.TXT      
Archive:  COTAHIST_A2018.ZIP
replace COTAHIST_A2018.TXT? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: COTAHIST_A2018.TXT      
Archive:  COTAHIST_A2017.ZIP
replace COTAHIST_A2017.TXT? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: COTAHIST_A2017.TXT      
Archive:  COTAHIST_A2016.ZIP
replace COTAHIST_A2016.TXT? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: COTAHIST_A2016.TXT      
Archive:  COTAHIST_A2015.ZIP
replace COTAHIST_A2015.TXT? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: COTAHIST_A2015.TXT      


In [0]:
file1 = get_dataset('COTAHIST_A2020.TXT')
file2 = get_dataset('COTAHIST_A2019.TXT')
file3 = get_dataset('COTAHIST_A2018.TXT')
file4 = get_dataset('COTAHIST_A2017.TXT')
file5 = get_dataset('COTAHIST_A2016.TXT')
file6 = get_dataset('COTAHIST_A2015.TXT')

In [0]:
assert (file1.columns == file2.columns).all()
assert (file1.columns == file3.columns).all()
assert (file1.columns == file4.columns).all()
assert (file1.columns == file5.columns).all()
assert (file1.columns == file6.columns).all()

In [17]:
file = pd.concat([file1, file2,file3, file4, file5, file6])
print(file.shape)

(3037272, 13)


In [0]:
#Save file
file.to_parquet(path='B3.gzip')

In [19]:
#Check if it worked by loading file again and seeing a sample
file_again = pd.read_parquet('B3.gzip')
file_again.sample(5)

Unnamed: 0,date,BDI,security,market_type,company,specification,currency,open,high,low,average,close,volume
317538,20180723,96,PARD3F,20,IHPARDINI,ON NM,R$,1900,1900,1817,1851,1817,490634
601222,20190913,78,BBSEJ31,70,BBSE,ON NM,R$,347,347,347,347,347,4511000
670943,20191025,82,CSNAW134,80,CSNAE,ON,R$,100,130,100,105,130,328000
323811,20160909,78,FIBRI52,70,FIBR,ON NM,R$,70,80,70,77,80,463400
127731,20160411,12,BPFF11,10,FII ABSOLUTO,CI ER,R$,7198,7200,7102,7154,7102,12055765
