## Imports and Defs

### Imports

In [1078]:
import requests
from bs4 import BeautifulSoup
from lxml import html
import zipfile
import os
import io
import re

from urllib.parse import urljoin
from io import BytesIO
import pandas as pd
pd.set_option('display.max_colwidth', None)

import unidecode
import string

import time
from datetime import datetime
import pickle


### Variables

In [1079]:
prefix = 'dfp'
radical = 'cia_aberta'
extension = '.zip'
demonstracoes_financeiras = ['DRA', 'DMPL', 'DFC_MD', 'DFC_MI', 'BPA', 'BPP', 'DRE', 'DVA']  # Add all other items
demonstracoes_financeiras_dict = {
    'BPA': 'Balanço Patrimonial Ativo (BPA)',
    'BPP': 'Balanço Patrimonial Passivo (BPP)',
    'DFC_MD': 'Demonstração de Fluxo de Caixa - Método Direto (DFC-MD)',
    'DFC_MI': 'Demonstração de Fluxo de Caixa - Método Indireto (DFC-MI)',
    'DMPL': 'Demonstração de Mutações do Patrimônio Líquido (DMPL)',
    'DRA': 'Demonstração de Resultado Abrangente (DRA)',
    'DRE': 'Demonstração de Resultado (DRE)',
    'DVA': 'Demonstração de Valor Adicionado (DVA)'
}
base_de_consolidacao = ['ind', 'con']  # Add other variables if existing
base_de_consolidacao_dict = {
    'ind': 'Individual',
    'con': 'Consolidado'
}



In [1080]:
# URL base do site
base_url = "https://dados.cvm.gov.br/dados/CIA_ABERTA/"
xpath = '/html/body/div[1]/pre'

# Inicializar uma sessão
session = requests.Session()

# Lista para armazenar links de arquivos CSV e ZIP
files_list = []

# Conjunto para armazenar subpastas já visitadas
visited_subfolders = set()


In [1081]:
col_category = ['FILE_NAME', 'DEMONSTRATIVO', 'BALANCE_SHEET', 'ANO', 'AGRUPAMENTO', 'CNPJ_CIA', 'VERSAO', 'DENOM_CIA', 'CD_CVM', 'GRUPO_DFP', 'MOEDA', 'ESCALA_MOEDA', 'ORDEM_EXERC', 'CD_CONTA', 'DS_CONTA', 'ST_CONTA_FIXA', 'COLUNA_DF']
col_datetime = ['DT_REFER', 'DT_FIM_EXERC', 'DT_INI_EXERC']
col_float = ['VL_CONTA']



### Defs

In [1082]:
# save and load pkl
def save_pkl(demo, file_name):
    # Save dump pickle
    with open(f'{file_name}.pkl', 'wb') as f:
        pickle.dump(demo, f)
    return demo

def load_pkl(file_name):
    # Read load pickle
    with open(f'{file_name}.pkl', 'rb') as f:
        demo = pickle.load(f)
    return demo

In [1083]:
def remaining_time(start_time, size, i):
    # Calculate the number of items processed and the number of remaining items
    counter = i + 1
    remaining_items = size - counter
    
    # Calculate the percentage of completion
    percentage = counter / size
    
    # Calculate the elapsed time
    running_time = time.time() - start_time
    
    # Calculate the average time taken per item
    avg_time_per_item = running_time / counter
    
    # Calculate the remaining time based on the average time per item
    remaining_time = remaining_items * avg_time_per_item
    
    # Convert remaining time to hours, minutes, and seconds
    hours, remainder = divmod(int(remaining_time), 3600)
    minutes, seconds = divmod(remainder, 60)
    
    # Format remaining time as a string
    remaining_time_formatted = f'{int(hours)}h {int(minutes):02}m {int(seconds):02}s'
    
    # Create a progress string with all the calculated values
    progress = (
        f'{percentage:.2%} '
        f'{counter}+{remaining_items}, '
        f'{avg_time_per_item:.6f}s per item, '
        f'Remaining: {remaining_time_formatted}'
    )

    return progress

In [1084]:
# Função auxiliar para reunir links
def gather_links(url):
    visited_subfolders.add(url)  # Marcar a subpasta como visitada
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for link in soup.find_all("a"):
        href = link.get("href")
        full_link = urljoin(url, href)

        if full_link.startswith(base_url) and full_link not in visited_subfolders:
            if full_link.endswith((".csv", ".zip", ".txt")):
                files_list.append(full_link)
            elif full_link.endswith("/"):
                gather_links(full_link)
    return files_list

def get_files_dates(files_list, xpath):

    folders = set()
    for url in files_list:
        folder = os.path.dirname(url)
        folders.add(folder)
    folders = sorted(list(folders))

    data = []
    for url in folders:
        response = requests.get(url)
        response.raise_for_status()
        tree = html.fromstring(response.content)
        elements = tree.xpath(xpath)

        for element in elements:
            content = element.text_content()
            lines = content.split('\r\n')
            for line in lines:
                if len(line) > 3:
                    parts = line.split()
                    file_name = parts[0]
                    date_str = f"{parts[1]} {parts[2]}"
                    match = re.match(r'(\d{2})-(\w{3})-(\d{4}) (\d{2}:\d{2})', date_str)
                    if match:
                        day, month_abbr, year, time = match.groups()
                        month_mapping = {
                            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                            'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                            'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
                        }
                        month = month_mapping.get(month_abbr)
                        formatted_date_str = f"{day}-{month}-{year} {time}"
                        date = datetime.strptime(formatted_date_str, "%d-%m-%Y %H:%M")
                        full_link = url + '/' + file_name
                        data.append([full_link, date])

    df = pd.DataFrame(data, columns=['file_name', 'date'])

    return df


In [1085]:
# helper for categories
def get_categories(files_list):
    # Extrair categorias e arquivos meta
    categories = set()
    meta_files = [file_link for file_link in files_list if "meta" in file_link]
    files = [file_link for file_link in files_list if "meta" not in file_link]

    for file_link in files_list:
        cat = '/'.join(file_link.replace(base_url,'').split('/')[:-2])
        categories.add(cat)
    categories = sorted(list(categories))

    return categories


In [1086]:
# Função auxiliar para extrair metadados
def extract_meta(content):
    meta_dict = {}
    blocks = content.split("-----------------------\r\nCampo: ")[1:]
    
    for block in blocks:
        lines = block.strip().split("\r\n")
        campo = lines[0]
        descricao = None
        
        for line in lines:
            if 'Descrição' in line or 'Descrio' in line:
                descricao = line.split(':')[1].strip()
                break
        
        if descricao is not None:
            meta_dict[campo] = descricao
    
    return meta_dict



In [1087]:
# Extrair e processar metadados
def get_metadados(files_list):
    meta_dict = {}
    meta_files = [file_link for file_link in files_list if "meta" in file_link]

    for file in meta_files:
        response = session.get(file)
        response.raise_for_status()

        if file.endswith('.zip'):
            zip_file = zipfile.ZipFile(io.BytesIO(response.content))

            for file_in_zip in zip_file.namelist():
                with zip_file.open(file_in_zip) as zip_file_content:
                    file_content = zip_file_content.read().decode('utf-8', errors='ignore')
                    d = extract_meta(file_content)
                    meta_dict[file_in_zip.split('.')[0]] = d
        elif file.endswith('.txt'):
            file_content = response.content.decode('iso-8859-1')
            d = extract_meta(file_content)
            file_name = file.split('/')[-1].split('.')[0]
            meta_dict[file_name] = d

    return meta_dict

In [1110]:
# download database - extract financial sheets from web zip files
def download_database(demonstrativos, update=True):
    if update == True:
        # Filter the sorted DataFrame by current year
        files_list_df = files_list_df[(files_list_df['date'].dt.year == datetime.now().year)]
        files_list = files_list['file_name'].to_list()
        
    # Initialize variables
    fin_sheet = []
    fin_sheet_links = []
    total_size = 0  
    total_size_csv = 0
    total_rows = 0
    dataframes = []

    for i, demonstrativo in enumerate(demonstrativos):
        print(remaining_time(start_time, len(demonstrativos), i))
        # Retrieve the list of files based on the specified 'demonstrativo'
        download_files = [file_link for file_link in files_list if 'meta' not in file_link and demonstrativo in file_link]
        # If 'update' is True, select the most recent file
        if update:
            download_files = [max(download_files, key=lambda url: int(url.split('_')[-1].split('.')[0]))]

        # Create an empty list to hold the DataFrames

        # Iterate through the list of URLs
        start_time_2 = time.time()
        for j, zip_url in enumerate(download_files):
            print('  ' + remaining_time(start_time_2, len(download_files), j))
            # Download the zip file
            response = requests.get(zip_url)
            
            # Check if the download was successful
            if response.status_code == 200:
                # Get the size of the downloaded file
                file_size = len(response.content)/(1024 ** 2)
                total_size += file_size
                print(f'{zip_url}, {file_size:.3f} Mb, {total_size:.3f} Mb total')

                # Extract the zip file in memory
                with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
                    # Iterate through the files in the zip
                    start_time_3 = time.time()
                    for k, file_info in enumerate(zip_ref.infolist()):
                        # print('  ' + '  ' + remaining_time(start_time_3, len(zip_ref.infolist()), k))
                        # Check if the file is a CSV
                        if file_info.filename.lower().endswith('.csv'):
                            # file size
                            csv_size = file_info.file_size/(1024 ** 2)
                            total_size_csv += csv_size/(1024 ** 1)

                            # Extract the CSV file
                            csv_content = zip_ref.read(file_info.filename)
                            csv_filename = os.path.basename(file_info.filename)

                            # Extract metadata from the CSV filename
                            meta_csv = csv_filename.replace('cia_aberta_', '').replace('.csv', '').split('_')
                            ano = meta_csv[-1]
                            demonstrativo = meta_csv[0]
                            meta_csv = meta_csv[1:-1]
                            if len(meta_csv) > 0:
                                agrupamento = meta_csv[-1]
                                meta_csv = meta_csv[:-1]
                            else:
                                agrupamento = ''
                            balance_sheet = '_'.join(meta_csv)

                            # Read CSV content into a pandas DataFrame
                            csv_data = pd.read_csv(BytesIO(csv_content), encoding='iso-8859-1', sep=';')

                            # Add metadata columns to the DataFrame
                            csv_data.insert(0, 'FILE_NAME', csv_filename)
                            csv_data.insert(1, 'DEMONSTRATIVO', demonstrativo)
                            csv_data.insert(2, 'BALANCE_SHEET', balance_sheet)
                            csv_data.insert(3, 'ANO', ano)
                            csv_data.insert(4, 'AGRUPAMENTO', agrupamento)

                            # Append the DataFrame to the list
                            dataframes.append(csv_data)
                            total_rows += len(csv_data)
                            # print(f'{file_info.filename} {csv_size:.3f} Mb, {total_size_csv:.3f} Gb total, {total_rows:,.0f} total rows')
    print(f'Total {len(dataframes)} databases found and {total_rows} downloaded')


    return dataframes

In [1089]:
# clean text
def clean_text(text):
    if isinstance(text, str):  # Check if the value is a string
        cleaned_text = unidecode.unidecode(text).translate(str.maketrans('', '', string.punctuation)).upper().strip()
        return cleaned_text

In [1090]:
# remove words from cell
def clean_cell(cell):
    words_to_remove = ['  EM LIQUIDACAO', ' EM LIQUIDACAO', ' EXTRAJUDICIAL', '  EM RECUPERACAO JUDICIAL', '  EM REC JUDICIAL', ' EM RECUPERACAO JUDICIAL']
    for word in words_to_remove:
        if word in cell:
            cell = cell.replace(word, '').strip()
    return cell



In [1091]:
def clean_DT_INI_EXERC(demo):
    print('double clean dataframes')
    try:
        lines_removed = 0
        for i, (year, df) in enumerate(demo.items()):
            size = len(df)
            
            # Apply the condition to filter the DataFrame
            mask = (df['DT_INI_EXERC'] == pd.to_datetime(str(year) + '-01-01')) | df['DT_INI_EXERC'].isna()
            df_filtered = df[mask].copy()  # Make a copy to avoid modifying the original DataFrame
            
            # Update the 'MATH_MAGIC' column for the filtered rows using .loc indexer
            df_filtered.loc[:, 'MATH_MAGIC'] = False
            
            # Update the dictionary with the filtered DataFrame
            demo[year] = df_filtered
            
            lines_removed += size - len(df_filtered)
            print(year, remaining_time(start_time, len(demo), i))
        print(f'{lines_removed} lines removed')
    except Exception as e:
        pass
    return demo


In [1092]:
# clean dataframes
def clean_dataframes(dict_of_df):
    print('clean dataframes')
    col_datetime = ['DT_REFER', 'DT_FIM_EXERC', 'DT_INI_EXERC']

    for i, (year, df) in enumerate(dict_of_df.items()):
        print(year, remaining_time(start_time, len(dict_of_df), i))
        # remove extra rows
        try:
            df = df[df['ORDEM_EXERC'] == 'ÚLTIMO']
            df = df.drop(columns=['ORDEM_EXERC'])
        except Exception as e:
            # print(e)
            pass

        # # remove extra rows
        # try:
        #     df = df[(df['DT_INI_EXERC'] == pd.to_datetime(str(year) + '-01-01')) | df['DT_INI_EXERC'].isna()]
        # except Exception as e:
        #     pass
        
        # Clean up Text
        try:
            df['DENOM_CIA'] = df['DENOM_CIA'].apply(clean_text)
        except Exception as e:
            pass

        # to datetime
        try:
            df[col_datetime] = df[col_datetime].apply(pd.to_datetime)
        except:
            pass


        # remove specific words
        df['DENOM_CIA'] = df['DENOM_CIA'].apply(clean_cell)

        dict_of_df[year] = df

    return dict_of_df


In [1093]:
# make yearly dict_of_df
def yearly(df_list):
    df_y = {}
    print('group by year')
    start_time = time.time()

    # Iterate through each DataFrame in the 'demo' list
    for i, df in enumerate(df_list):
        # Get the year from the 'ANO' column
        # year = int(df['ANO'].iloc[0])  # Assuming the 'ANO' value is the same for all rows in a DataFrame
        year = pd.to_datetime(df['DT_REFER']).dt.year.iloc[0]  # Extracting the year from the date
        print(year, remaining_time(start_time, len(df_list), i))

        # Check if the year is already a key in the dictionary, if not, create a list for it
        if year not in df_y:
            df_y[year] = []
        
        # Append the DataFrame to the list for the respective year
        df_y[year].append(df)

    print('concatenating')
    start_time = time.time()
    for i, (year, df_list) in enumerate(df_y.items()):
        print(year, remaining_time(start_time, len(df_y), i))
        df_y[year] = pd.concat(df_list, ignore_index=True)

    return df_y

In [1094]:
# group_by_year dict
def group_by_year(dataframes):
    demo = [df for df in dataframes if len(df) > 0 and ('con' in df['FILE_NAME'][0] or 'ind' in df['FILE_NAME'][0])]
    links = [df for df in dataframes if len(df) > 0 and ('con' not in df['FILE_NAME'][0] and 'ind' not in df['FILE_NAME'][0])]

    demo = yearly(demo)
    links = yearly(links)

    # print('clean up dataframes')
    demo = clean_dataframes(demo)
    links = clean_dataframes(links)

    # Rename column for consistency
    for year in links.keys():
        links[year].rename(columns={'VERSAO': 'VERSAO_LINK'}, inplace=True)
    
    return demo, links

In [1095]:
# função auxiliar para download pdf
def download_pdf(df, url):
    # Base directory to save PDFs
    output_dir = 'assets/pdf'
    os.makedirs(output_dir, exist_ok=True)

    total_size = 0  # Initialize cumulative total size

    for i, row in df.iterrows():
        response = requests.get(url.format(ID_DOC=row['ID_DOC']))
        if response.status_code == 200:
            with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
                file_size = len(response.content)/(1024 ** 2)
                total_size += file_size
                for file_info in zip_ref.infolist():
                    if file_info.filename.lower().endswith('.pdf'):
                        pdf_content = zip_ref.read(file_info.filename)
                        filename = f"{clean_text(row['DENOM_CIA'])} {row['DT_REFER']} VERSAO_{row['VERSAO']} {row['ID_DOC']}.pdf"
                        filepath = os.path.join(output_dir, filename)
                        with open(filepath, 'wb') as pdf_file:
                            pdf_file.write(pdf_content)
                        print(f'{i+1}/{len(df)}, {filename}, {file_size:.3f} Mb, {total_size:.3f} Mb total')
    return df

In [1096]:
# get list of companies by AGRUPAMENTO ['ind', 'con']
def get_companies_by_str_port(df):
    # str_port = structured report = relatório estruturado
    
    # Create a pivot table to count the occurrences of 'ind' and 'con' for each 'DENOM_CIA' and 'DT_REFER'
    pivot_table = df.pivot_table(index=['DENOM_CIA', 'DT_REFER'], columns='AGRUPAMENTO', aggfunc='size', fill_value=0)

    # Apply a conditional mapping to convert counts to 1 if count > 0, and 0 otherwise
    pivot_table = pivot_table.applymap(lambda x: True if x > 0 else False)
    pivot_table = pivot_table[['ind'] + [col for col in pivot_table.columns if col != 'ind' and col != 'con'] + ['con']]

    # Get the unique combinations of rows as tuples
    combinations = set(map(tuple, pivot_table.to_numpy()))

    # Create a dictionary to store the combinations of 'con' and 'ind' as keys and corresponding 'DENOM_CIA' as values
    companies_by_str_port  = {}

    # Iterate through the unique combinations and find matching 'DENOM_CIA'
    for combination in combinations:
        relest_individual = combination[0]
        relest_consolidado = combination[1]
        cias = pivot_table[(pivot_table['ind'] == relest_individual) & (pivot_table['con'] == relest_consolidado)].index.get_level_values('DENOM_CIA').unique()
        key = ('ind', 'con')
        if relest_consolidado and not relest_individual:
            key = 'con'
        if not relest_consolidado and relest_individual:
            key = 'ind'

        companies_by_str_port [key] = cias

    return companies_by_str_port

## Content

### Download, clean and organize databases

In [1104]:
# Get Base Info
files_list = gather_links(base_url)
files_list_df = get_files_dates(files_list, xpath)

meta_dict = get_metadados(files_list)
categories = get_categories(files_list)

demonstrativos = []
for cat in categories:
    term = 'DOC/'
    if term in cat:
        demonstrativos.append(cat.replace(term,'').lower())

# Imprimir resultados
total_fields = sum((i + 1) * len(d) for i, d in enumerate(meta_dict.values()))
print(f'{base_url}')
print(f'Encontradas {len(categories)} categorias com {len(meta_dict)} arquivos meta contendo {total_fields} campos')
print(demonstrativos)


https://dados.cvm.gov.br/dados/CIA_ABERTA/
Encontradas 8 categorias com 74 arquivos meta contendo 39641 campos
['cgvn', 'dfp', 'fca', 'fre', 'ipe', 'itr', 'vlmo']


In [1109]:
# download_raw_sheets
start_time = time.time()
update = False

demos = ['itr', 'dfp']
file_name = f'dataframes.csv'
dataframes = download_database(demos, update=update)

# if update == True:
#     # load part from csv and part from web, anc concat them
#     dataframes_csv = pd.read_csv(file_name)
# else:
#     # load everything from web, no updates, download all this should be an empty saved csv files
#     # frist update = false to load everything, then create an empty dt
#     dataframes_csv = pd.DataFrame('empty')

# dataframes_web = download_database(demonstrativos, update=update)
# dataframes = pd.concat([dataframes_csv, dataframes_web], ignore_index=True)


In [None]:
# clean and group by year
start_time = time.time()

demo, links = group_by_year(dataframes)
demo = clean_DT_INI_EXERC(demo)


In [None]:
demo = load_pkl('dataframes')

### By Year

In [None]:
last_quarters = ['3', '4']
all_quarters = ['6', '7']
n = 1000000000

start_time = time.time()
for n1, (year, demonstrativo) in enumerate(demo.items()):

    # companies by structured report (ind, con)
    companies_by_str_port = get_companies_by_str_port(demonstrativo)
    print(f"{year} {len(demonstrativo):,.0f} lines, {len(demonstrativo['DENOM_CIA'].unique())} companies, {'/'.join([f'{len(companies)} {key}' for key, companies in companies_by_str_port.items()])}")
    print(year, remaining_time(start_time, len(demo), n1))

    # a mathport is a group of 'DENOM_CIA' and 'AGRUPAMENTO' and 'DT_REFER'.dt.year
    groups = demonstrativo.groupby(['DENOM_CIA', 'AGRUPAMENTO'], group_keys=False)
    # math_keys = list(math_port.groups.keys())

    # # strport is a group of 'DENOM_CIA', 'DT_REFER' and 'AGRUPAMENTO'. 
    # str_port = df.groupby(['DENOM_CIA', 'AGRUPAMENTO', 'DT_REFER'], group_keys=False)
    # str_keys = list(str_port.groups.keys())

    start_time_2 = time.time()
    for n2, (key, group) in enumerate(groups):
        print('  ', remaining_time(start_time_2, len(groups), n2))
        company = key[0]
        agg = key[1]
        # print(f"{{n2}/{len(groups)-n2}. {year}, {company}, {agg}, {len(group)}")
        subgroups = group.groupby(['CD_CONTA', 'DS_CONTA'], group_keys=False)
        # conta_keys = list(conta_port.groups.keys())

        start_time_3 = time.time()
        for n3, (index, df) in enumerate(subgroups):
            # print('  ', '  ', remaining_time(start_time_3, len(subgroups), n3))
            conta = index[0]
            conta_first = index[0][0]
            descricao = index[1]
            # print(f"{n1}/{len(demo)-n1}. {n2}/{len(groups)-n2}. {n3}/{len(subgroups)-n3}. {year}, {company}, {agg}, {len(subgroups)}, {conta}, {descricao} {df['VL_CONTA'].values}")

            update = False
            if all(q in df['DT_REFER'].dt.quarter.values for q in [1, 2, 3, 4]):
            # do the magic
                i1 = df[df['DT_REFER'].dt.quarter == 1].index[0]
                i2 = df[df['DT_REFER'].dt.quarter == 2].index[0]
                i3 = df[df['DT_REFER'].dt.quarter == 3].index[0]
                i4 = df[df['DT_REFER'].dt.quarter == 4].index[0]

                q1 = df[df['DT_REFER'].dt.quarter == 1]['VL_CONTA'].iloc[0] #.sum() #.mean() #.max() etc
                q2 = df[df['DT_REFER'].dt.quarter == 2]['VL_CONTA'].iloc[0]
                q3 = df[df['DT_REFER'].dt.quarter == 3]['VL_CONTA'].iloc[0]
                q4 = df[df['DT_REFER'].dt.quarter == 4]['VL_CONTA'].iloc[0]

            
            try:
                if conta_first in last_quarters:
                    if not demonstrativo.loc[i4, 'MATH_MAGIC']:
                        q4 = q4 - (q3 + q2 + q1)
                    update = True
                elif conta_first in all_quarters:
                    if not demonstrativo.loc[i2, 'MATH_MAGIC']:
                        q2 = q2 - (q1)
                    if not demonstrativo.loc[i3, 'MATH_MAGIC']:
                        q3 = q3 - (q2 + q1)
                    if not demonstrativo.loc[i4, 'MATH_MAGIC']:
                        q4 = q4 - (q3 + q2 + q1)
                    update = True
            except Exception as e:
                update = False

            if update:
                demonstrativo.loc[i2, ['VL_CONTA', 'MATH_MAGIC']] = [q2, True]
                demonstrativo.loc[i3, ['VL_CONTA', 'MATH_MAGIC']] = [q3, True]
                demonstrativo.loc[i4, ['VL_CONTA', 'MATH_MAGIC']] = [q4, True]

                # print(f"{year}, {company}, {agg}, {len(subgroups)}, {conta}, {descricao} {df['VL_CONTA'].values}")
            if n3 > n:
                break
        if n2 > n:
            break
    if n1 > n:
        break


In [None]:
# save by company
## get an aggregated list of company, agg in all demo dict
## create a demo_company dict, transform from year to company

## must create a setor, subsetor, segmento for companies, then use as keys?

In [1139]:
# Get all unique companies across all years in demo
all_companies = set()
for i, (year, df) in enumerate(demo.items()):
    all_companies.update(df['DENOM_CIA'].unique())

# Initialize the final dictionary with companies as keys
companies = {}

start_time = time.time()
# Populate the company_dict
for i1, company in enumerate(all_companies):
    print(remaining_time(start_time, len(all_companies), i1))
    company_data = []  # This will hold dataframes for each year for the company
    for i2, (year, df) in enumerate(demo.items()):
        company_df_for_year = df[df['DENOM_CIA'] == company]
        company_data.append(company_df_for_year)
    
    # Concatenate the data for the company across all years
    companies[company] = pd.concat(company_data, ignore_index=True)


0.12% 1+818, 0.000000s per item, Remaining: 0h 00m 00s
0.24% 2+817, 0.313227s per item, Remaining: 0h 04m 15s
0.37% 3+816, 0.389546s per item, Remaining: 0h 05m 17s
0.49% 4+815, 0.436658s per item, Remaining: 0h 05m 55s
0.61% 5+814, 0.516280s per item, Remaining: 0h 07m 00s
0.73% 6+813, 0.534735s per item, Remaining: 0h 07m 14s
0.85% 7+812, 0.547055s per item, Remaining: 0h 07m 24s
0.98% 8+811, 0.547774s per item, Remaining: 0h 07m 24s
1.10% 9+810, 0.552637s per item, Remaining: 0h 07m 27s
1.22% 10+809, 0.553718s per item, Remaining: 0h 07m 27s
1.34% 11+808, 0.564766s per item, Remaining: 0h 07m 36s
1.47% 12+807, 0.570910s per item, Remaining: 0h 07m 40s
1.59% 13+806, 0.577431s per item, Remaining: 0h 07m 45s
1.71% 14+805, 0.578160s per item, Remaining: 0h 07m 45s
1.83% 15+804, 0.583786s per item, Remaining: 0h 07m 49s
1.95% 16+803, 0.590893s per item, Remaining: 0h 07m 54s
2.08% 17+802, 0.601585s per item, Remaining: 0h 08m 02s
2.20% 18+801, 0.603117s per item, Remaining: 0h 08m 03s
2

In [1225]:
companies_list = sorted(list(companies.keys()))
df = companies['ALPARGATAS SA']


In [None]:
df = demo[2014]

m_company = df['DENOM_CIA'] == 'CENTRAIS ELET BRAS SA  ELETROBRAS'
m_conta = df['BALANCE_SHEET'] == 'DRE'
m_conta_len = df['CD_CONTA'].str.len() <= 4
m_agg = df['AGRUPAMENTO'] == 'ind'
mask = m_company & m_conta & m_conta_len & m_agg

# Filter the DataFrame using the mask
filtered_df = df[mask][['DS_CONTA', 'DT_REFER', 'VL_CONTA']]

# Pivot the filtered DataFrame using pivot_table and aggregation function
pivot_df = filtered_df.pivot_table(index='DT_REFER', columns='DS_CONTA', values='VL_CONTA', aggfunc='sum')

# Plot the pivoted DataFrame
try:
    pivot_df.plot()
except:
    pass


In [None]:
df['CD_CONTA'].unique()

In [None]:
m_company = df['DENOM_CIA'] == 'ALPARGATAS SA'
m_conta = df['BALANCE_SHEET'] == 'DRE'
m_conta_len = df['CD_CONTA'].str.len() <= 4
m_agg = df['AGRUPAMENTO'] == 'ind'
mask = m_company & m_conta & m_conta_len & m_agg
df[mask]

In [None]:
# o que caracteriza uma df completa, um relatório estruturado? DENOM_CIA + DT_REFER
# como saber quais empresas são 'con' e quais são 'ind' e como criar uma terceira view mae = con - ind?


In [None]:
companies_by_relest

In [None]:
# SÓ INDIVIDUAL ind = True, con = False: '3A COMPANHIA SECURITIZADORA'
# SÓ CONSOLIDADO ind = False, con = True: 'BANCO SANTANDER SA'
# AMBOS ind = True, con = True: 'ADVANCED DIGITAL HEALTH MEDICINA PREVENTIVA SA'


In [None]:
companies_by_relest['individual']

In [None]:
for k, v in companies_by_relest.items():
    print(k)

In [None]:
relest_individual = True
relest_consolidado = False
pivot_table[(pivot_table['ind'] == relest_individual) & (pivot_table['con'] == relest_consolidado)].index.get_level_values('DENOM_CIA').unique()[0]

In [None]:
cia_cols = ['CNPJ_CIA', 'DENOM_CIA', 'CD_CVM'] # 683 rows
bal_cols = ['FILE_NAME', 'DEMONSTRATIVO', 'BALANCE_SHEET', 'AGRUPAMENTO', 'GRUPO_DFP'] # 32 rows
dt_cols = ['DT_REFER', 'DT_FIM_EXERC'] # 4 rows
cod_conta_cols = ['CD_CONTA', ] # 1543 rows
desc_conta_cols = ['DS_CONTA', ] # 38597 rows
vlr_cta_cols = ['VL_CONTA',]

ubiq_cols = ['ANO', 'MOEDA'] # 1 row
unique__independ_cols = ['VERSAO', 'ESCALA_MOEDA',  'ST_CONTA_FIXA', 'DT_INI_EXERC', 'COLUNA_DF']

In [None]:
df_temp = df[cia_cols+bal_cols+dt_cols+cod_conta_cols+desc_conta_cols+vlr_cta_cols+unique__independ_cols]
mask = df_temp['CD_CVM'] == 2437
mask2 = df_temp['AGRUPAMENTO'] == 'con'
mask3 = df_temp['BALANCE_SHEET'] == 'DRE'
mask4 = df_temp['DT_REFER'].dt.month == 12
mask5 = df_temp['CD_CONTA'] == '3.01'

df_temp[mask&mask2&mask3&mask5].drop_duplicates()

In [None]:
df

In [None]:
# group by year

# clean df



In [None]:
sorted_df.groupby('CD_CVM')['VERSAO'].idxmax()


In [None]:
df = df_demo_y[year]
if 1 == 1:
    df = df.copy()  # Create a copy of the DataFrame

    # filter only last one
    df = df[df['ORDEM_EXERC'] == 'ÚLTIMO']
    df = df.drop(columns=['ORDEM_EXERC'])

    # Clean up Text
    df['DENOM_CIA'] = df['DENOM_CIA'].apply(clean_text)

    # rows to remove
    words_to_remove = ['LIQUIDACAO', 'JUDICIAL', ]
    df = df[~df['DENOM_CIA'].str.contains('|'.join(words_to_remove))]

    # DateTime
    date_columns = ['DT_REFER', 'DT_RECEB', 'DT_FIM_EXERC', ]
    # Convert the specified date columns to datetime
    for col in date_columns:
        try:
            df = df.assign(**{col: pd.to_datetime(df[col])})
        except Exception as e:
            # Handle invalid date values here
            pass

    # Sort the DataFrame by 'CD_CVM' and 'VERSAO' in descending order
    sorted_df = df.sort_values(by=['CD_CVM', 'VERSAO'], ascending=[True, False])

    # After sorting
    sorted_df = df.sort_values(by=['CD_CVM', 'VERSAO'], ascending=[True, False])
    print("After sorting:", sorted_df.shape)

    # After the groupby and idxmax operations
    indices_of_max_version = sorted_df.groupby('CD_CVM')['VERSAO'].idxmax()
    print("Indices of max version:", indices_of_max_version)


    # Get the indices of the rows with the highest version for each 'CD_CVM'
    indices_of_max_version = sorted_df.groupby('CD_CVM')['VERSAO'].idxmax()

    # Filter the DataFrame to keep only the rows with the highest version for each 'CD_CVM'
    filtered_df = sorted_df.loc[indices_of_max_version]

    df = sorted_df = df.sort_values(by=['DENOM_CIA'], ascending=[True])


    # return df.reset_index(drop=True)

In [None]:
df_demo_y = group_by_year(df_demo, df_demo_links)

In [None]:
df_demo_y[year]

In [None]:
df_demo_links_y[year][df_demo_links_y[year]['CD_CVM'] == 15253].head(30)

In [None]:
merged_df = df.merge(df_demo_links[df_demo_links['ANO'] == '2023'][cols_common + cols_to_add], on=cols_common, how='left')
merged_df

In [None]:
df[cols_common].drop_duplicates()

In [None]:
df_demo_links[cols_common].drop_duplicates()

In [None]:
# save to csv

dfp.to_csv('dfp.csv')
dfp_links.to_csv('dfp_links.csv')
itr.to_csv('itr.csv')
itr_links.to_csv('itr_links.csv')


In [None]:
# read from csv

dfp = pd.read_csv('dfp.csv')
dfp_links = pd.read_csv('dfp_links.csv')
itr = pd.read_csv('itr.csv')
itr_links = pd.read_csv('itr_links.csv')

In [None]:
# Base URL
url_raw =        'https://www.rad.cvm.gov.br/ENET/frmGerenciaPaginaFRE.aspx?CodigoTipoInstituicao=1&NumeroSequencialDocumento={ID_DOC}'
url_download = 'http://www.rad.cvm.gov.br/ENETCONSULTA/frmDownloadDocumento.aspx?CodigoInstituicao=1&NumeroSequencialDocumento={ID_DOC}'
url_relatorio_administracao = 'https://www.rad.cvm.gov.br/ENET/frmExibirArquivoFRE.aspx?NumeroSequencialDocumento=8299&CodigoGrupo=1653&CodigoQuadro=0&Tipo=PDF&RelatorioRevisaoEspecial=Sem+Ressalva&CodTipoDocumento=4'

# Generate list of URLs
dfp_url_list = [url_raw.format(ID_DOC=id_doc) for id_doc in dfp_links['ID_DOC']]
dfp_download_list = [url_download.format(ID_DOC=id_doc) for id_doc in dfp_links['ID_DOC']]
dfp_relatorio_administracao_list = [url_relatorio_administracao.format(ID_DOC=id_doc) for id_doc in dfp_links['ID_DOC']]
itr_url_list = [url_raw.format(ID_DOC=id_doc) for id_doc in itr_links['ID_DOC']]
itr_download_list = [url_download.format(ID_DOC=id_doc) for id_doc in itr_links['ID_DOC']]
itr_relatorio_administracao_list = [url_relatorio_administracao.format(ID_DOC=id_doc) for id_doc in itr_links['ID_DOC']]


In [None]:
def update_version_old(df, links):
    cols_to_add = ['VERSAO_DOC', 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']
    
    # Create the columns if they don't exist
    for col in cols_to_add:
        if col not in df.columns:
            df[col] = None
    
    # Group the df DataFrame by the conditions
    grouped_df = df.groupby(['ANO', 'DEMONSTRATIVO', 'CNPJ_CIA', 'CD_CVM', 'DT_REFER'])
    
    # Iterate through links and update grouped_df
    for i, row in links.iterrows():
        key = (row['ANO'], row['DEMONSTRATIVO'], row['CNPJ_CIA'], row['CD_CVM'], row['DT_REFER'])
        group = grouped_df.get_group(key)
        
        cols_to_update = ['VERSAO', 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']
        df.loc[group.index, cols_to_add] = row[cols_to_update].values
        
        print(f"{i+1}/{len(links)-i} {row['DEMONSTRATIVO']} {row['ANO']} - {row['CD_CVM']} {row['CNPJ_CIA']} - {row['DT_REFER']} versão {row['VERSAO']}")
    
    while i > 20:
        print('break')
        break
    return df


In [None]:
def update_version(df, links, grouped_df):
    cols_to_add = ['VERSAO_DOC', 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']

    # Create the columns if they don't exist
    for col in cols_to_add:
        if col not in df.columns:
            df[col] = None

    # Convert columns to object type
    for col in cols_to_add:
        try:
            df[col] = df[col].astype('object')
        except Exception as e:
            pass

    for i, key in enumerate(grouped_df.groups.keys()):
        print(f'{i}/{len(grouped_df)} {key}')
        
        grouped_df.get_group(key)
        ano, demonstrativo, cnpj_cia, cd_cvm, dt_refer = key

        # Filter links based on the group's conditions
        mask = (
            (links['ANO'] == ano) &
            (links['DEMONSTRATIVO'] == demonstrativo) &
            (links['CNPJ_CIA'] == cnpj_cia) &
            (links['CD_CVM'] == cd_cvm) &
            (links['DT_REFER'] == dt_refer)
        )


        df.loc[grouped_df.get_group(key).index, cols_to_add] = links[mask][cols_to_add].values

    # Convert columns to object type
    for col in cols_to_add:
        try:
            df[col] = df[col].astype('category')
        except Exception as e:
            pass

    return df


In [None]:
# Group the df DataFrame by the conditions
group_columns = ['ANO', 'DEMONSTRATIVO', 'CNPJ_CIA', 'CD_CVM', 'DT_REFER']
grouped_df = itr.groupby(group_columns)

itr = update_version(itr, itr_links, grouped_df)


# # Group the df DataFrame by the conditions
# group_columns = ['ANO', 'DEMONSTRATIVO', 'CNPJ_CIA', 'CD_CVM', 'DT_REFER']
# grouped_df = dfp.groupby(group_columns)

# dfp = update_version(dfp, dfp_links, grouped_df)