# Algorithm to parse .txt dataset to .csv

Source: [B3](http://www.b3.com.br/pt_br/market-data-e-indices/servicos-de-dados/market-data/historico/mercado-a-vista/series-historicas/)

Layout: [link](http://www.b3.com.br/data/files/33/67/B9/50/D84057102C784E47AC094EA8/SeriesHistoricas_Layout.pdf)

### The cells below read the raw B3 dataset and format it as a DataFrame with labeled columns

In [1]:
import pandas as pd

In [2]:
# Ask for the year to process. The answer is interpolated into the input and
# output file names, so surrounding whitespace is stripped to avoid a bad path.
year = input('Insira o ano: ').strip()

Insira o ano: 2021


In [3]:
def b3_parser(year, data_dir="./datasets"):
    """Parse a COTAHIST fixed-width text file into a typed DataFrame.

    Reads ``{data_dir}/COTAHIST_A{year}.txt``, skips its first line, slices
    every following line into 26 fixed-width fields and stops at the first
    line that starts with ``'99'`` (presumably the trailer record of the B3
    layout -- confirm against the layout PDF).

    Parameters
    ----------
    year : str or int
        Value interpolated into the file name.
    data_dir : str, optional
        Directory that contains the file. Defaults to ``./datasets``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, with the columns listed in ``col``. Count and
        flag columns are numeric, price/volume columns are floats divided by
        100, ``date`` and ``datven`` are datetimes (unparseable values become
        ``NaT``), and ``codneg`` has surrounding whitespace stripped. All
        other columns keep the raw text slice.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a numeric column contains text that cannot be converted.
    """
    size = [2, 8, 2, 12, 3, 12, 10, 3, 4, 13, 13, 13, 13, 13, 13, 13, 5, 18, 18, 13, 1, 8, 7, 13, 12, 3]
    col = ['tipreg', 'date', 'codbdi', 'codneg', 'tpmerc', 'nomres', 'especi', 'prazot', 'modref', 'preabe',
           'premax', 'premin', 'premed', 'preult', 'preofc', 'preofv', 'totneg', 'quatot', 'voltot', 'preexe',
           'indopc', 'datven', 'fatcot', 'ptoexe', 'codisi', 'dismes']
    # NOTE: the column names follow the pattern adopted by B3.

    # Turn the field widths into (start, end) slice bounds once, instead of
    # re-accumulating the offset for every line.
    bounds = []
    start = 0
    for width in size:
        bounds.append((start, start + width))
        start += width

    all_data = []
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that this matches the encoding of the B3 export.
    with open(f"{data_dir}/COTAHIST_A{year}.txt", 'r') as cota:
        cota.readline()  # the first line is skipped, not parsed
        for line in cota:
            # Check for the end marker before storing the line, so it can
            # never end up as a data row (even when it directly follows the
            # first line).
            if line.startswith('99'):
                break
            all_data.append([line[begin:end] for begin, end in bounds])

    df = pd.DataFrame(data=all_data, columns=col)

    # Integer-like fields; downcast picks the smallest integer dtype that fits.
    for c in ['tipreg', 'codbdi', 'tpmerc', 'totneg', 'quatot', 'indopc', 'datven', 'fatcot', 'dismes']:
        df[c] = pd.to_numeric(df[c], downcast="integer")

    # Price/volume fields are divided by 100 after conversion to float.
    for c in ['preabe', 'premax', 'premin', 'premed', 'preult', 'preofc', 'preofv', 'voltot', 'preexe', 'ptoexe']:
        df[c] = df[c].astype(float) / 100

    # YYYYMMDD fields; values that do not form a valid date become NaT.
    # 'datven' was already converted to an integer above and is re-parsed here.
    for c in ['date', 'datven']:
        df[c] = pd.to_datetime(df[c], format='%Y%m%d', errors='coerce')

    # The codneg slice is space-padded to its fixed width; strip the padding.
    for c in ['codneg']:
        df[c] = df[c].str.strip()

    return df

In [4]:
%%time
# Parse the file for the chosen year; the %%time cell magic reports how long the cell took.
df = b3_parser(year)

Wall time: 20.7 s


### The cell below keeps only the rows for the listed IPO tickers

In [5]:
def listFilter(dataframe, valores):
    """Return the rows of ``dataframe`` whose ``codneg`` value is in ``valores``.

    The original index labels of the selected rows are preserved.
    """
    keep_mask = dataframe['codneg'].isin(valores)
    return dataframe.loc[keep_mask]

# codneg values to keep; every row whose codneg is not in this list is dropped below.
list_ipos = ["PSVM11", "PARC3", "WIZS3",  "AALR3", "MOVI3", "PARD3", "AZUL4", "CRFB3", "IRBR3", "OMGE3", "CAML3", 
             "BKBR3", "BRDT3", "GNDI3", "HAPV3", "BIDI4", "SBFG3", "CNTO3","NEOE3", "VIVA3", "BMGB4", "CEAB3", 
             "MTRE3", "LWSA3", "MDNE3", "PRNR3", "ALPK3", "AMBP3", "SOMA3", "DMVF3", "LJQQ3", "LAVV3", "PGMN3",
             "PETZ3", "PLPL3", "MELK3", "HBSA3", "CURY3", "BOAS3", "SEQL3", "GMAT3", "TFCO4", "CASH3", "ENJU3", 
             "AERI3", "RRRP3", "AVLL3", "RDOR3", "NGRD3", "HBRE3", "VAMO3", "ESPA3", "INTB3", "MOSI3", "MBLY3", 
             "JALL3", "POWE3", "CSED3", "OPCT3", "ORVR3", "ELMD3", "BMOB3", "CMIN3", "WEST3", "MODL11", "MATD3", 
             "BLAU3", "GGPS3", "SOJA3", "RECV3", "CXSE3", "ALLD3"]

# Replace the full frame with the filtered one (the name df is reused).
df = listFilter(df, list_ipos)

In [6]:
# Keep only the columns of interest, then preview the first rows.
selected_columns = ['date', 'codneg', 'preabe', 'preult']
df = df[selected_columns]
df.head()

Unnamed: 0,date,codneg,preabe,preult
47,2021-01-04,AALR3,11.13,11.13
56,2021-01-04,AERI3,10.39,10.53
62,2021-01-04,ALPK3,9.4,9.35
68,2021-01-04,AMBP3,26.51,26.85
82,2021-01-04,AVLL3,24.29,23.61


In [7]:
# Renumber the rows from 0, discarding the index labels left over from filtering.
df = df.reset_index(drop=True)

# Write the frame to a CSV file named after the year, without the index column.
csv_path = f"COTAHIST_A{year}.csv"
df.to_csv(csv_path, index=False)

**After generating the CSV for each desired year, concatenate the files and sort the result by date.**