## Imports and Defs

### Imports

In [80]:
import requests
from bs4 import BeautifulSoup
import zipfile
import os
import io

from urllib.parse import urljoin
from io import BytesIO
import pandas as pd
pd.set_option('display.max_colwidth', None)

import unidecode
import string

import time
import datetime
import pickle


### Variables

In [2]:
# Pieces of the CVM file names (e.g. dfp_cia_aberta_..._.zip)
prefix = 'dfp'
radical = 'cia_aberta'
extension = '.zip'

# Financial-statement codes as they appear in the CVM file names
demo_cvmnstracoes_financeiras = ['DRA', 'DMPL', 'DFC_MD', 'DFC_MI', 'BPA', 'BPP', 'DRE', 'DVA']  # Add all other items
# Human-readable (Portuguese) label for each statement code.
# The labels previously read "demo_cvmnstração ...", which is not a word; the
# intended "Demonstração ..." text is restored here.
demo_cvmnstracoes_financeiras_dict = {
    'BPA': 'Balanço Patrimonial Ativo (BPA)',
    'BPP': 'Balanço Patrimonial Passivo (BPP)',
    'DFC_MD': 'Demonstração de Fluxo de Caixa - Método Direto (DFC-MD)',
    'DFC_MI': 'Demonstração de Fluxo de Caixa - Método Indireto (DFC-MI)',
    'DMPL': 'Demonstração de Mutações do Patrimônio Líquido (DMPL)',
    'DRA': 'Demonstração de Resultado Abrangente (DRA)',
    'DRE': 'Demonstração de Resultado (DRE)',
    'DVA': 'Demonstração de Valor Adicionado (DVA)',
}

# Consolidation-basis codes and their labels
base_de_consolidacao = ['ind', 'con']  # Add other variables if existing
base_de_consolidacao_dict = {
    'ind': 'Individual',
    'con': 'Consolidado',
}



In [3]:
# Base URL of the site; gather_links only follows links that start with it
base_cvm = "https://dados.cvm.gov.br/dados/CIA_ABERTA/"

# HTTP session shared by gather_links and get_metadados
session = requests.Session()

# Accumulates the file links found by gather_links (.csv, .zip and .txt)
files_list = []

# Folder URLs already crawled, so gather_links does not visit them twice
visited_subfolders = set()



In [4]:
# Column names grouped by the kind of value they hold.
col_category = [
    'FILE_NAME',
    'demo_cvmNSTRATIVO',
    'BALANCE_SHEET',
    'ANO',
    'AGRUPAMENTO',
    'CNPJ_CIA',
    'VERSAO',
    'DENOM_CIA',
    'CD_CVM',
    'GRUPO_DFP',
    'MOEDA',
    'ESCALA_MOEDA',
    'ORDEM_EXERC',
    'CD_CONTA',
    'DS_CONTA',
    'ST_CONTA_FIXA',
    'COLUNA_DF',
]
col_datetime = [
    'DT_REFER',
    'DT_FIM_EXERC',
    'DT_INI_EXERC',
]
col_float = [
    'VL_CONTA',
]



### Defs

In [5]:
# save and load pkl
def save_pkl(demo_cvm, file_name):
    """Pickle ``demo_cvm`` into ``<file_name>.pkl`` and return the same object.

    Args:
        demo_cvm: Any picklable object.
        file_name: Target path without the ``.pkl`` suffix.

    Returns:
        The object that was passed in, unchanged.
    """
    target = f'{file_name}.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(demo_cvm, handle)
    return demo_cvm

def load_pkl(file_name):
    """Return the object stored in ``<file_name>.pkl``.

    Args:
        file_name: Source path without the ``.pkl`` suffix.

    Returns:
        Whatever object was pickled into that file.
    """
    source = f'{file_name}.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [6]:
def remaining_time(start_time, size, i):
    """Build a one-line progress report for item ``i`` of a loop of ``size`` items.

    Args:
        start_time: ``time.time()`` value taken when the loop started.
        size: Total number of items in the loop.
        i: Zero-based index of the item being processed.

    Returns:
        A string such as
        ``'25.00% 1+3, 0.000001s per item, Remaining: 0h 00m 00s'`` holding the
        completion percentage, done+pending counts, average seconds per item
        and the estimated time left.
    """
    done = i + 1
    pending = size - done
    share_done = done / size

    # Average pace so far, extrapolated over the items still pending
    per_item = (time.time() - start_time) / done
    eta_seconds = int(pending * per_item)

    hours, leftover = divmod(eta_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    eta = f'{hours}h {minutes:02}m {seconds:02}s'

    return f'{share_done:.2%} {done}+{pending}, {per_item:.6f}s per item, Remaining: {eta}'

In [7]:
# Função auxiliar para reunir links
def gather_links(url):
    """Recursively crawl ``url`` and collect the data-file links below it.

    The crawl state lives in the module-level ``visited_subfolders`` set and
    ``files_list`` list, so results accumulate across calls. Only links that
    start with ``base_cvm`` are considered: links ending in ``.csv``, ``.zip``
    or ``.txt`` are recorded, links ending in ``/`` are crawled in turn.

    Args:
        url: Folder URL to crawl.

    Returns:
        The module-level ``files_list`` (the same list object on every call).

    Raises:
        requests.HTTPError: If a folder page answers with an error status.
    """
    visited_subfolders.add(url)  # Mark the folder as visited before descending
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            # Anchor without a target: nothing to follow
            continue
        full_link = urljoin(url, href)

        if not full_link.startswith(base_cvm) or full_link in visited_subfolders:
            continue

        if full_link.endswith((".csv", ".zip", ".txt")):
            # A folder that was already crawled is never re-entered, but the
            # starting folder is re-read on every call; without this check its
            # file links would be appended again on each repeated crawl.
            if full_link not in files_list:
                files_list.append(full_link)
        elif full_link.endswith("/"):
            gather_links(full_link)
    return files_list



In [8]:
# helper for categories
def get_categories(files_list):
    """Return the sorted, unique category paths found in ``files_list``.

    A category is what remains of a link after removing the ``base_cvm``
    prefix and dropping its last two path segments.

    Args:
        files_list: Iterable of file URLs.

    Returns:
        Sorted list of distinct category strings.
    """
    categories = {
        '/'.join(file_link.replace(base_cvm, '').split('/')[:-2])
        for file_link in files_list
    }
    return sorted(categories)


In [9]:
# Função auxiliar para extrair metadados
def extract_meta(content):
    """Parse the text of a metadata file into ``{field name: description}``.

    The text is split into blocks at every ``-----------------------`` line
    followed by a ``Campo: `` line. The first line of a block is the field
    name; the first line containing ``Descrição`` (or ``Descrio``, the same
    word with its accented letters dropped) supplies the description.

    Args:
        content: Decoded text of a metadata file.

    Returns:
        Dict mapping each field name to its description. Fields without a
        description line are left out.
    """
    # Accept "\n"-only files as well as the "\r\n" layout
    text = content.replace("\r\n", "\n")
    meta_dict = {}

    for block in text.split("-----------------------\nCampo: ")[1:]:
        lines = block.strip().split("\n")
        campo = lines[0]

        for line in lines:
            if 'Descrição' in line or 'Descrio' in line:
                # Split on the first colon only, so a description that itself
                # contains a colon is kept whole
                _, colon, descricao = line.partition(':')
                if colon:
                    meta_dict[campo] = descricao.strip()
                    break

    return meta_dict



In [10]:
# Extrair e processar metadados
def get_metadados(files_list):
    """Download every metadata file in ``files_list`` and parse its fields.

    A link counts as metadata when it contains ``"meta"``. ``.zip`` links are
    opened in memory and each member is parsed; ``.txt`` links are parsed
    directly. Parsing is delegated to ``extract_meta``.

    Args:
        files_list: Iterable of file URLs.

    Returns:
        Dict mapping a file name (text before the first ``.``) to the
        ``{field: description}`` dict returned by ``extract_meta``.

    Raises:
        requests.HTTPError: If a download answers with an error status.
    """
    meta_dict = {}
    meta_files = [file_link for file_link in files_list if "meta" in file_link]

    for file in meta_files:
        response = session.get(file)
        response.raise_for_status()

        if file.endswith('.zip'):
            # Context manager so the archive is closed after reading
            with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
                for file_in_zip in zip_file.namelist():
                    with zip_file.open(file_in_zip) as zip_file_content:
                        raw = zip_file_content.read()
                    # Decoding as UTF-8 with errors='ignore' silently dropped
                    # every accented character. Try strict UTF-8 first and
                    # fall back to the encoding used for the .txt files.
                    try:
                        file_content = raw.decode('utf-8')
                    except UnicodeDecodeError:
                        file_content = raw.decode('iso-8859-1')
                    meta_dict[file_in_zip.split('.')[0]] = extract_meta(file_content)
        elif file.endswith('.txt'):
            file_content = response.content.decode('iso-8859-1')
            file_name = file.split('/')[-1].split('.')[0]
            meta_dict[file_name] = extract_meta(file_content)

    return meta_dict

In [11]:
# extract financial sheets from web zip files
def download_database(demonstrativos_cvm):
    """Download the zip files of the given statement families and load their CSVs.

    For every entry of ``demonstrativos_cvm`` the non-metadata links of the
    module-level ``files_list`` that contain that entry are downloaded. Each
    CSV inside a downloaded zip is read with ``pandas`` (``;`` separator,
    ISO-8859-1) and prefixed with five columns derived from its file name.

    Args:
        demonstrativos_cvm: Iterable of substrings used to select links
            (the calling cell passes ``['itr', 'dfp']``).

    Returns:
        List with one DataFrame per CSV file found.
    """
    total_size = 0  # cumulative size of the downloaded zips, in MiB
    total_rows = 0
    dataframes = []

    # Time the progress report from this call rather than from a notebook global
    start_time = time.time()
    for i, demonstrativo in enumerate(demonstrativos_cvm):
        print(remaining_time(start_time, len(demonstrativos_cvm), i))
        download_files = [
            file_link for file_link in files_list
            if 'meta' not in file_link and demonstrativo in file_link
        ]

        start_time_2 = time.time()
        for j, zip_url in enumerate(download_files):
            print('  ' + remaining_time(start_time_2, len(download_files), j))
            response = requests.get(zip_url)

            if response.status_code != 200:
                # Report the gap instead of dropping the file silently
                print(f'{zip_url}, skipped: HTTP status {response.status_code}')
                continue

            file_size = len(response.content) / (1024 ** 2)
            total_size += file_size
            print(f'{zip_url}, {file_size:.3f} Mb, {total_size:.3f} Mb total')

            # The archive is processed entirely in memory
            with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
                for file_info in zip_ref.infolist():
                    if not file_info.filename.lower().endswith('.csv'):
                        continue

                    csv_filename = os.path.basename(file_info.filename)
                    csv_demonstrativo, balance_sheet, agrupamento, ano = _split_csv_name(csv_filename)

                    csv_data = pd.read_csv(
                        BytesIO(zip_ref.read(file_info.filename)),
                        encoding='iso-8859-1',
                        sep=';',
                    )

                    # Metadata columns go first, in this fixed order.
                    # NOTE(review): the label 'demo_cvmNSTRATIVO' looks like
                    # search/replace residue of 'DEMONSTRATIVO' (the sample
                    # output in this notebook shows a DEMONSTRATIVO column);
                    # kept as is, confirm before renaming.
                    csv_data.insert(0, 'FILE_NAME', csv_filename)
                    csv_data.insert(1, 'demo_cvmNSTRATIVO', csv_demonstrativo)
                    csv_data.insert(2, 'BALANCE_SHEET', balance_sheet)
                    csv_data.insert(3, 'ANO', ano)
                    csv_data.insert(4, 'AGRUPAMENTO', agrupamento)

                    dataframes.append(csv_data)
                    total_rows += len(csv_data)
    print(f'Total {len(dataframes)} databases found and {total_rows} downloaded')

    return dataframes


def _split_csv_name(csv_filename):
    """Split a CSV file name into (demonstrativo, balance_sheet, agrupamento, ano).

    After removing ``cia_aberta_`` and ``.csv`` the name is split on ``_``:
    the first token is the statement family, the last one the year, the token
    before the year (if any) the grouping, and whatever remains in between is
    re-joined with ``_`` as the balance-sheet code.
    """
    tokens = csv_filename.replace('cia_aberta_', '').replace('.csv', '').split('_')
    demonstrativo = tokens[0]
    ano = tokens[-1]
    middle = tokens[1:-1]
    agrupamento = middle[-1] if middle else ''
    balance_sheet = '_'.join(middle[:-1])
    return demonstrativo, balance_sheet, agrupamento, ano

In [12]:
# clean text
def clean_text(text):
    """Return ``text`` transliterated, without punctuation, upper-cased and stripped.

    Args:
        text: Value to normalise.

    Returns:
        The normalised string, or ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None
    transliterated = unidecode.unidecode(text)
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return transliterated.translate(drop_punctuation).upper().strip()

In [13]:
# remove words from cell
def clean_cell(cell):
    """Strip legal-status suffixes from a company name.

    Each listed phrase found in ``cell`` is removed and the result is stripped
    of surrounding whitespace. Longer variants with two leading spaces come
    before their single-space counterparts so the whole phrase is removed.

    Args:
        cell: Company name. Non-string values (``None``, NaN) are returned
            untouched instead of raising ``TypeError``.

    Returns:
        The cleaned name, or ``cell`` itself when it is not a string.
    """
    if not isinstance(cell, str):
        return cell

    words_to_remove = ['  EM LIQUIDACAO', ' EM LIQUIDACAO', ' EXTRAJUDICIAL', '  EM RECUPERACAO JUDICIAL', '  EM REC JUDICIAL', ' EM RECUPERACAO JUDICIAL']
    for word in words_to_remove:
        if word in cell:
            cell = cell.replace(word, '').strip()
    return cell



In [14]:
def clean_DT_INI_EXERC(demo_cvm):
    """Keep only rows whose exercise starts on 1 January of the dict key's year.

    For every ``year -> DataFrame`` entry, rows are kept when ``DT_INI_EXERC``
    equals ``<year>-01-01`` or is missing. The kept rows are copied, receive a
    ``MATH_MAGIC`` column set to ``False`` and replace the entry in the dict.

    Processing is best effort: the first failure stops the loop and is
    reported, and the years handled up to that point stay updated.

    Args:
        demo_cvm: Dict mapping a year to a DataFrame; modified in place.

    Returns:
        The same dict.
    """
    print('double clean dataframes')
    lines_removed = 0
    # Time the progress report from this call rather than from a notebook global
    start_time = time.time()
    try:
        for i, (year, df) in enumerate(demo_cvm.items()):
            starts_on_jan_1 = df['DT_INI_EXERC'] == pd.to_datetime(str(year) + '-01-01')
            mask = starts_on_jan_1 | df['DT_INI_EXERC'].isna()

            # Copy so the new column is not written into a view of the original
            df_filtered = df[mask].copy()
            df_filtered.loc[:, 'MATH_MAGIC'] = False

            demo_cvm[year] = df_filtered

            lines_removed += len(df) - len(df_filtered)
            print(year, remaining_time(start_time, len(demo_cvm), i))
        print(f'{lines_removed} lines removed')
    except Exception as e:
        # Previously swallowed in silence, which hid partially cleaned data
        print(f'clean_DT_INI_EXERC stopped early: {e!r}')
    return demo_cvm


In [15]:
# clean dataframes
def clean_dataframes(dict_of_df):
    """Normalise every DataFrame of a ``year -> DataFrame`` dict.

    Per DataFrame, each step is applied only when its columns exist:

    1. keep rows whose ``ORDEM_EXERC`` is ``'ÚLTIMO'`` and drop that column;
    2. normalise ``DENOM_CIA`` with ``clean_text``;
    3. convert the three date columns with ``pd.to_datetime`` (all three must
       be present, otherwise none is converted);
    4. remove legal-status suffixes from ``DENOM_CIA`` with ``clean_cell``.

    Failures in steps 2 and 3 are reported and the step is skipped.

    Args:
        dict_of_df: Dict mapping a year to a DataFrame; entries are replaced
            in place.

    Returns:
        The same dict.
    """
    print('clean dataframes')
    col_datetime = ['DT_REFER', 'DT_FIM_EXERC', 'DT_INI_EXERC']

    # Time the progress report from this call rather than from a notebook global
    start_time = time.time()
    for i, (year, df) in enumerate(dict_of_df.items()):
        print(year, remaining_time(start_time, len(dict_of_df), i))

        # Remove extra rows
        if 'ORDEM_EXERC' in df.columns:
            df = df[df['ORDEM_EXERC'] == 'ÚLTIMO'].drop(columns=['ORDEM_EXERC'])

        has_name = 'DENOM_CIA' in df.columns

        # Clean up text
        if has_name:
            try:
                df['DENOM_CIA'] = df['DENOM_CIA'].apply(clean_text)
            except Exception as e:
                print(f'{year}: DENOM_CIA normalisation skipped: {e!r}')

        # To datetime, all or nothing as before
        if all(col in df.columns for col in col_datetime):
            try:
                df[col_datetime] = df[col_datetime].apply(pd.to_datetime)
            except Exception as e:
                print(f'{year}: date conversion skipped: {e!r}')

        # Remove specific words; non-string cells (e.g. the None that
        # clean_text returns for missing names) are left untouched
        if has_name:
            df['DENOM_CIA'] = df['DENOM_CIA'].apply(
                lambda cell: clean_cell(cell) if isinstance(cell, str) else cell
            )

        dict_of_df[year] = df

    return dict_of_df


In [16]:
# make yearly dict_of_df
def yearly(df_list):
    """Group DataFrames by the year of their first ``DT_REFER`` value.

    Args:
        df_list: Sequence of non-empty DataFrames with a ``DT_REFER`` column.

    Returns:
        Dict mapping each year to the concatenation (fresh index) of the
        DataFrames that fall in it.
    """
    by_year_frames = {}
    print('group by year')
    started = time.time()

    for position, frame in enumerate(df_list):
        # The year of the first row stands for the whole frame
        # year = int(frame['ANO'].iloc[0])
        year = pd.to_datetime(frame['DT_REFER']).dt.year.iloc[0]
        print(year, remaining_time(started, len(df_list), position))
        by_year_frames.setdefault(year, []).append(frame)

    print('concatenating')
    started = time.time()
    for position, year in enumerate(by_year_frames):
        print(year, remaining_time(started, len(by_year_frames), position))
        by_year_frames[year] = pd.concat(by_year_frames[year], ignore_index=True)

    return by_year_frames

In [17]:
# group_by_year dict
def group_by_year(dataframes):
    """Split the downloaded frames into statements and links, grouped by year.

    A non-empty frame is a statement frame when its first ``FILE_NAME``
    contains ``'con'`` or ``'ind'``; every other non-empty frame is a links
    frame. Both groups go through ``yearly`` and ``clean_dataframes``.

    Args:
        dataframes: List of DataFrames as returned by ``download_database``.

    Returns:
        Tuple ``(demo_cvm, links)`` of ``year -> DataFrame`` dicts. In the
        links frames the ``VERSAO`` column is renamed to ``VERSAO_LINK``.
    """
    statement_frames = []
    link_frames = []
    for frame in dataframes:
        if len(frame) == 0:
            continue
        first_name = frame['FILE_NAME'][0]
        if 'con' in first_name or 'ind' in first_name:
            statement_frames.append(frame)
        else:
            link_frames.append(frame)

    demo_cvm = yearly(statement_frames)
    links = yearly(link_frames)

    demo_cvm = clean_dataframes(demo_cvm)
    links = clean_dataframes(links)

    # Rename column for consistency
    for frame in links.values():
        frame.rename(columns={'VERSAO': 'VERSAO_LINK'}, inplace=True)

    return demo_cvm, links

In [18]:
# função auxiliar para download pdf
def download_pdf(df, url):
    """Download the zip of every row of ``df`` and save the PDFs it contains.

    Args:
        df: DataFrame with ``ID_DOC``, ``DENOM_CIA``, ``DT_REFER`` and
            ``VERSAO`` columns.
        url: URL template with an ``{ID_DOC}`` placeholder.

    Returns:
        ``df``, unchanged. PDFs are written to ``assets/pdf``; rows whose
        download does not answer with status 200 are skipped.
    """
    output_dir = 'assets/pdf'
    os.makedirs(output_dir, exist_ok=True)

    total_size = 0  # cumulative size of the downloaded zips, in MiB

    for i, row in df.iterrows():
        response = requests.get(url.format(ID_DOC=row['ID_DOC']))
        if response.status_code != 200:
            continue

        payload = response.content
        with zipfile.ZipFile(io.BytesIO(payload)) as archive:
            file_size = len(payload)/(1024 ** 2)
            total_size += file_size
            for member in archive.infolist():
                if not member.filename.lower().endswith('.pdf'):
                    continue
                filename = f"{clean_text(row['DENOM_CIA'])} {row['DT_REFER']} VERSAO_{row['VERSAO']} {row['ID_DOC']}.pdf"
                with open(os.path.join(output_dir, filename), 'wb') as pdf_file:
                    pdf_file.write(archive.read(member.filename))
                print(f'{i+1}/{len(df)}, {filename}, {file_size:.3f} Mb, {total_size:.3f} Mb total')
    return df

In [19]:
# get list of companies by AGRUPAMENTO ['ind', 'con']
def get_companies_by_str_port(df):
    """List companies by the kind of structured report they filed.

    (str_port = structured report = "relatório estruturado".)

    Rows are counted per (``DENOM_CIA``, ``DT_REFER``) and ``AGRUPAMENTO``.
    Each (company, date) pair is then classified by whether it has ``ind``
    rows, ``con`` rows or both. Other ``AGRUPAMENTO`` values are ignored, and
    a missing ``ind`` or ``con`` grouping counts as "not reported".

    Args:
        df: DataFrame with ``DENOM_CIA``, ``DT_REFER`` and ``AGRUPAMENTO``
            columns.

    Returns:
        Dict with up to three keys (``('ind', 'con')`` for both, ``'con'``
        for consolidated only, ``'ind'`` for individual only), each mapped to
        an Index of unique company names. Keys with no company are omitted.
    """
    counts = df.pivot_table(index=['DENOM_CIA', 'DT_REFER'], columns='AGRUPAMENTO', aggfunc='size', fill_value=0)
    reported = counts > 0

    # Select the two groupings by name; their position depends on which other
    # AGRUPAMENTO values exist, so positional access picked the wrong column.
    no_report = pd.Series(False, index=reported.index)
    has_ind = reported['ind'] if 'ind' in reported.columns else no_report
    has_con = reported['con'] if 'con' in reported.columns else no_report

    company_names = reported.index.get_level_values('DENOM_CIA')
    selections = (
        (('ind', 'con'), has_ind & has_con),
        ('con', has_con & ~has_ind),
        ('ind', has_ind & ~has_con),
    )

    companies_by_str_port = {}
    for key, selected in selections:
        if selected.any():
            companies_by_str_port[key] = company_names[selected.to_numpy()].unique()

    return companies_by_str_port

## Content

### Download, clean and organize databases

In [20]:
# Root URL handed to get_filelink_df / gather_links below (same value as the
# base_cvm assigned in the Variables section).
base_cvm = 'https://dados.cvm.gov.br/dados/CIA_ABERTA/'

In [21]:
import pickle
from urllib.parse import urljoin
import zipfile
from lxml import html

In [33]:
def get_filelink_df(base_cvm):
    """List every entry of the crawled folders together with its listing date.

    The file links below ``base_cvm`` are collected with ``gather_links``.
    The listing page of each folder holding such a link is then downloaded,
    and the text of the nodes matched by the module-level ``xpath_cvm`` is
    read line by line. A line with at least three whitespace-separated tokens
    is taken as ``<name> <dd-Mon-YYYY> <HH:MM> ...``; lines whose date does
    not parse are skipped.

    No date filtering happens here; callers filter the result themselves.

    Args:
        base_cvm: Root URL to crawl.

    Returns:
        pandas.DataFrame: Columns ``filename`` (folder URL + ``/`` + entry
        name) and ``date`` (timestamp parsed from the listing).

    Raises:
        requests.HTTPError: If a folder page answers with an error status.
    """
    filelist = gather_links(base_cvm)

    # Folder of each file link; sorted so the crawl order is reproducible
    folders = sorted({url.rsplit('/', 1)[0] for url in filelist})

    fileinfo = []
    start_time = time.time()

    for i, url in enumerate(folders):
        print(remaining_time(start_time, len(folders), i))
        response = requests.get(url)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        for content in tree.xpath(xpath_cvm):
            for line in content.text_content().split('\r\n'):
                parts = line.split()
                if len(parts) < 3:
                    continue
                try:
                    # NOTE: %b matches month abbreviations of the active locale
                    date = pd.to_datetime(f'{parts[1]} {parts[2]}', format='%d-%b-%Y %H:%M')
                except ValueError:
                    # Not a "<name> <date> <time>" listing line
                    continue
                fileinfo.append([url + '/' + parts[0], date])

    return pd.DataFrame(fileinfo, columns=['filename', 'date'])

In [23]:
# XPath that get_filelink_df passes to tree.xpath() to select the nodes whose
# text holds the folder listing.
xpath_cvm = '/html/body/div[1]/pre'

In [24]:
# Initial "previous run" date; the following cell assigns last_update again
# before it is used to filter the file listing.
last_update = pd.Timestamp('1970-01-01')


In [66]:
# Date of the previous run. When the marker file is missing, unreadable or
# empty, fall back to the epoch so that every remote file counts as new
# (an empty string would parse to NaT and filter out everything).
try:
    with open('datasets/last_update.csv', 'r') as f:
        last_update = f.read().strip() or '1970-01-01'
except (OSError, UnicodeDecodeError):
    last_update = '1970-01-01'
print(last_update)

# Keep only the entries listed with a date later than the previous run
filelist_df = get_filelink_df(base_cvm)
filelist_df = filelist_df[filelist_df['date'] > pd.to_datetime(last_update)]
filelist = filelist_df['filename'].to_list()
# TODO: persist the newest listing date once the update has been processed:
# with open('datasets/last_update.csv', 'w') as f:
#     f.write(filelist_df['date'].max().strftime('%Y-%m-%d'))

2023-07-01
6.25% 1+15, 0.000000s per item, Remaining: 0h 00m 00s
12.50% 2+14, 0.198280s per item, Remaining: 0h 00m 02s
18.75% 3+13, 0.274468s per item, Remaining: 0h 00m 03s
25.00% 4+12, 0.287965s per item, Remaining: 0h 00m 03s
31.25% 5+11, 0.286563s per item, Remaining: 0h 00m 03s
37.50% 6+10, 0.287555s per item, Remaining: 0h 00m 02s
43.75% 7+9, 0.292515s per item, Remaining: 0h 00m 02s
50.00% 8+8, 0.296148s per item, Remaining: 0h 00m 02s
56.25% 9+7, 0.297435s per item, Remaining: 0h 00m 02s
62.50% 10+6, 0.298515s per item, Remaining: 0h 00m 01s
68.75% 11+5, 0.299267s per item, Remaining: 0h 00m 01s
75.00% 12+4, 0.298183s per item, Remaining: 0h 00m 01s
81.25% 13+3, 0.297709s per item, Remaining: 0h 00m 00s
87.50% 14+2, 0.296854s per item, Remaining: 0h 00m 00s
93.75% 15+1, 0.297354s per item, Remaining: 0h 00m 00s
100.00% 16+0, 0.302506s per item, Remaining: 0h 00m 00s


In [81]:
def create_demo_file(start_year=2010):
    """Load the pickled per-year DataFrames into a dict.

    One pickle per year is read from ``datasets/dataframe_<year>.pkl``, from
    ``start_year`` up to and including the current year. A year whose pickle
    cannot be loaded is reported and skipped, so one missing file no longer
    hides the years after it.

    Args:
        start_year (int): First year to load. Defaults to 2010.

    Returns:
        dict: ``year -> object loaded from that year's pickle`` for every year
        that could be read.
    """
    demo_cvm = {}
    years = range(start_year, datetime.datetime.now().year + 1)
    start_time = time.time()

    for i, year in enumerate(years):
        print(remaining_time(start_time, len(years), i))
        try:
            # NOTE: unpickling runs code from the file; only load pickles
            # produced by this notebook.
            demo_cvm[year] = load_pkl(f'datasets/dataframe_{year}')
        except Exception as e:
            print(f'{year}: {e}')
    return demo_cvm


In [70]:
# Object stored in demo_cvm.pkl; it is indexed by year further down.
demo_cvm = load_pkl('demo_cvm')


In [82]:
# Per-year dict rebuilt from the datasets/dataframe_<year>.pkl files
demo_cvm_existing = create_demo_file()
# Number of years that were loaded
len(demo_cvm_existing)

7.14% 1+13, 0.024634s per item, Remaining: 0h 00m 00s
14.29% 2+12, 0.990742s per item, Remaining: 0h 00m 11s
21.43% 3+11, 2.590608s per item, Remaining: 0h 00m 28s
28.57% 4+10, 4.678944s per item, Remaining: 0h 00m 46s
35.71% 5+9, 6.545985s per item, Remaining: 0h 00m 58s
42.86% 6+8, 6.291357s per item, Remaining: 0h 00m 50s
50.00% 7+7, 6.154502s per item, Remaining: 0h 00m 43s
57.14% 8+6, 5.955993s per item, Remaining: 0h 00m 35s
64.29% 9+5, 6.280854s per item, Remaining: 0h 00m 31s
71.43% 10+4, 6.136764s per item, Remaining: 0h 00m 24s
78.57% 11+3, 6.053713s per item, Remaining: 0h 00m 18s
85.71% 12+2, 6.112623s per item, Remaining: 0h 00m 12s
92.86% 13+1, 6.535261s per item, Remaining: 0h 00m 06s
100.00% 14+0, 6.743198s per item, Remaining: 0h 00m 00s


14

In [98]:
# Columns whose combined values are used below to match rows between frames
key_columns = ['CNPJ_CIA', 'CD_CONTA', 'DT_REFER', 'ANO', 'AGRUPAMENTO']
# Year compared in the following cells
year = 2019
# df1: frame loaded from the datasets folder; df2: frame from demo_cvm.pkl
df1 = demo_cvm_existing[year]
df2 = demo_cvm[year]

In [99]:
start_time = time.time()
# clean_dataframes works on a year -> DataFrame dict, so wrap df2 in one.
# The key is the year selected in the previous cell rather than a second
# hard-coded 2019 that could drift away from it.
temp = {year: df2}
temp = clean_dataframes(temp)
df2 = temp[year]

clean dataframes
2019 100.00% 1+0, 0.000000s per item, Remaining: 0h 00m 00s


In [100]:
# Drop from df1 every row whose key_columns values also occur in df2, then
# append df2, so df2 rows replace their df1 counterparts.
df1_updated = df1[~df1.set_index(key_columns).index.isin(df2.set_index(key_columns).index)]
final_df = pd.concat([df1_updated, df2], ignore_index=True)
final_df.head()

Unnamed: 0,FILE_NAME,DEMONSTRATIVO,BALANCE_SHEET,ANO,AGRUPAMENTO,CNPJ_CIA,DT_REFER,VERSAO,DENOM_CIA,CD_CVM,...,ESCALA_MOEDA,DT_FIM_EXERC,CD_CONTA,DS_CONTA,VL_CONTA,ST_CONTA_FIXA,DT_INI_EXERC,COLUNA_DF,MATH_MAGIC,FILENAME
0,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2,COMPANHIA ENERGETICA DE BRASILIA CEB,14451,...,MIL,2019-03-31,6.01,Caixa Líquido Atividades Operacionais,51444.0,S,2019-01-01,,False,
1,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2,COMPANHIA ENERGETICA DE BRASILIA CEB,14451,...,MIL,2019-03-31,6.02,Caixa Líquido Atividades de Investimento,-7266.0,S,2019-01-01,,False,
2,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2,COMPANHIA ENERGETICA DE BRASILIA CEB,14451,...,MIL,2019-03-31,6.03,Caixa Líquido Atividades de Financiamento,-43295.0,S,2019-01-01,,False,
3,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2,COMPANHIA ENERGETICA DE BRASILIA CEB,14451,...,MIL,2019-03-31,6.04,Variação Cambial s/ Caixa e Equivalentes,0.0,S,2019-01-01,,False,
4,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2,COMPANHIA ENERGETICA DE BRASILIA CEB,14451,...,MIL,2019-03-31,6.05,Aumento (Redução) de Caixa e Equivalentes,883.0,S,2019-01-01,,False,


In [97]:
# NOTE(review): merged_df is not created anywhere in the visible notebook and
# this cell's execution counter (97) is lower than that of the cells above, so
# it presumably comes from an earlier merge-based attempt; confirm before
# re-running.
# Keep the rows flagged 'left_only', drop the right-hand value and the merge
# flag, and restore the VL_CONTA name before appending df2.
df1_updated = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['VL_CONTA_y', '_merge'])
df1_updated = df1_updated.rename(columns={'VL_CONTA_x': 'VL_CONTA'})
final_df = pd.concat([df1_updated, df2], ignore_index=True)
final_df.head()


Unnamed: 0,FILE_NAME,DEMONSTRATIVO_x,BALANCE_SHEET_x,ANO,AGRUPAMENTO,CNPJ_CIA,DT_REFER,VERSAO_x,DENOM_CIA_x,CD_CVM_x,...,CD_CVM,GRUPO_DFP,MOEDA,ESCALA_MOEDA,DT_FIM_EXERC,DS_CONTA,ST_CONTA_FIXA,DT_INI_EXERC,COLUNA_DF,MATH_MAGIC
0,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2.0,COMPANHIA ENERGETICA DE BRASILIA CEB,14451.0,...,,,,,NaT,,,NaT,,
1,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2.0,COMPANHIA ENERGETICA DE BRASILIA CEB,14451.0,...,,,,,NaT,,,NaT,,
2,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2.0,COMPANHIA ENERGETICA DE BRASILIA CEB,14451.0,...,,,,,NaT,,,NaT,,
3,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2.0,COMPANHIA ENERGETICA DE BRASILIA CEB,14451.0,...,,,,,NaT,,,NaT,,
4,itr_cia_aberta_DFC_MD_con_2019.csv,itr,DFC_MD,2019,con,00.070.698/0001-11,2019-03-31,2.0,COMPANHIA ENERGETICA DE BRASILIA CEB,14451.0,...,,,,,NaT,,,NaT,,


In [None]:
# Get Base Info
files_list = gather_links(base_cvm)
meta_dict = get_metadados(files_list)
categories = get_categories(files_list)
demonstrativos_cvm = []
for cat in categories:
    term = 'DOC/'
    if term in cat:
        demonstrativos_cvm.append(cat.replace(term,'').lower())

# Imprimir resultados
total_fields = sum((i + 1) * len(d) for i, d in enumerate(meta_dict.values()))
print(f'{base_cvm}')
print(f'Encontradas {len(categories)} categorias com {len(meta_dict)} arquivos meta contendo {total_fields} campos')
print(demonstrativos_cvm)


In [None]:
# download_raw_sheets
start_time = time.time()

demo_cvms = ['itr', 'dfp']
dataframes = download_database(demo_cvms)

# if update == True:
#     # load part from csv and part from web, anc concat them
#     dataframes_csv = pd.read_csv(file_name)
# else:
#     # load everything from web, no updates, download all this should be an empty saved csv files
#     # frist update = false to load everything, then create an empty dt
#     dataframes_csv = pd.DataFrame('empty')

# dataframes_web = download_database(demonstrativos_cvm, update=update)
# dataframes = pd.concat([dataframes_csv, dataframes_web], ignore_index=True)


In [None]:
# clean and group by year
start_time = time.time()

demo_cvm, links = group_by_year(dataframes)
demo_cvm = clean_DT_INI_EXERC(demo_cvm)


In [None]:
# load_pkl takes only the file name and returns the loaded object; passing
# demo_cvm as an extra first argument raised TypeError.
demo_cvm = load_pkl('dataframes')

# Keep a single year. Keys are collected first because a dict cannot change
# size while it is being iterated.
keys_to_delete = []
k_year = 2013
for year, df in demo_cvm.items():
    if year != k_year:
        keys_to_delete.append(year)

for key in keys_to_delete:
    del demo_cvm[key]


### By Year

In [36]:
def _first_row_per_quarter(df):
    """Locate the first row of each calendar quarter among an account's rows.

    Returns a dict ``{quarter: (index label, VL_CONTA value)}`` for quarters
    1-4 of 'DT_REFER', or None when at least one quarter has no row.
    """
    quarter = df['DT_REFER'].dt.quarter
    first = {}
    for q in (1, 2, 3, 4):
        values = df.loc[quarter == q, 'VL_CONTA']
        if values.empty:
            return None
        first[q] = (values.index[0], values.iloc[0])
    return first


def by_year(demo_cvm):
    """Turn accumulated quarterly account values into per-quarter values.

    For every year DataFrame, rows are grouped by ('DENOM_CIA', 'AGRUPAMENTO')
    and then by ('CD_CONTA', 'DS_CONTA'). For each account that has a row in
    all four quarters of 'DT_REFER':

    * account codes starting with '3' or '4': the fourth-quarter value becomes
      Q4 - (Q1 + Q2 + Q3);
    * account codes starting with '6' or '7': Q2, Q3 and Q4 each have the
      preceding quarters subtracted.

    (Presumably these prefixes are the statements reported as year-to-date
    totals -- confirm against the CVM layout.)

    A value is only adjusted while its row's 'MATH_MAGIC' flag is falsy; the
    Q2-Q4 rows are then flagged True so a second run does not subtract again.
    The 'MATH_MAGIC' column is expected to exist already; if it does not, the
    account is skipped.

    Args:
        demo_cvm: dict mapping year -> DataFrame. The DataFrames are modified
            in place.

    Returns:
        The same ``demo_cvm`` dict.
    """
    last_quarters = ('3', '4')  # only the fourth quarter is adjusted
    all_quarters = ('6', '7')   # second, third and fourth quarters are adjusted

    start_time = time.time()
    for n1, (year, demo_cvmnstrativo) in enumerate(demo_cvm.items()):

        # companies by structured report (ind, con)
        companies_by_str_port = get_companies_by_str_port(demo_cvmnstrativo)
        print(f"{year} {len(demo_cvmnstrativo):,.0f} lines, {len(demo_cvmnstrativo['DENOM_CIA'].unique())} companies, {'/'.join([f'{len(companies)} {key}' for key, companies in companies_by_str_port.items()])}")
        print(year, remaining_time(start_time, len(demo_cvm), n1))

        # observed=True: for categorical key columns only combinations that
        # actually occur are grouped; it has no effect on other dtypes.
        groups = demo_cvmnstrativo.groupby(['DENOM_CIA', 'AGRUPAMENTO'], group_keys=False, observed=True)
        n_groups = len(groups)

        start_time_2 = time.time()
        for n2, (_, group) in enumerate(groups):
            print('  ', remaining_time(start_time_2, n_groups, n2))
            subgroups = group.groupby(['CD_CONTA', 'DS_CONTA'], group_keys=False, observed=True)

            for (conta, _), df in subgroups:
                conta_first = conta[:1]
                if conta_first not in last_quarters and conta_first not in all_quarters:
                    continue

                first = _first_row_per_quarter(df)
                if first is None:
                    # Incomplete year for this account. Skipping here is what
                    # prevents row labels and values left over from a previous
                    # account from being written to the wrong rows.
                    continue
                (_, q1), (i2, q2), (i3, q3), (i4, q4) = (first[q] for q in (1, 2, 3, 4))

                try:
                    if conta_first in all_quarters:
                        if not demo_cvmnstrativo.loc[i2, 'MATH_MAGIC']:
                            q2 = q2 - q1
                        if not demo_cvmnstrativo.loc[i3, 'MATH_MAGIC']:
                            # q2 is already a single-quarter value here
                            q3 = q3 - (q2 + q1)
                    if not demo_cvmnstrativo.loc[i4, 'MATH_MAGIC']:
                        q4 = q4 - (q3 + q2 + q1)
                except (KeyError, TypeError, ValueError):
                    # Missing flag column, non-numeric value or ambiguous
                    # (duplicated) row label: leave this account untouched.
                    continue

                demo_cvmnstrativo.loc[i2, ['VL_CONTA', 'MATH_MAGIC']] = [q2, True]
                demo_cvmnstrativo.loc[i3, ['VL_CONTA', 'MATH_MAGIC']] = [q3, True]
                demo_cvmnstrativo.loc[i4, ['VL_CONTA', 'MATH_MAGIC']] = [q4, True]
    return demo_cvm

In [None]:
# Persist the DataFrame of k_year through save_pkl under the name 'dataframe_<year>'
file_name = f'dataframe_{k_year}'
# NOTE(review): this stores save_pkl's return value back into demo_cvm[k_year];
# confirm save_pkl returns the object it saved, otherwise the entry is overwritten.
demo_cvm[k_year] = save_pkl(demo_cvm[k_year], file_name)

##### temp

In [40]:
# Run by_year one year at a time and save every processed year to its own pickle.
demo_cvm = load_pkl('dataframes')
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
for k_year in years:
    print(k_year)

    # Keep only the year being processed. This single filter also covers the
    # 2010-2014 entries, so no separate (and error-prone) pre-delete is needed.
    keys_to_delete = [year for year in demo_cvm if year != k_year]
    for key in keys_to_delete:
        del demo_cvm[key]
    demo_cvm = by_year(demo_cvm)

    file_name = f'dataframe_{k_year}'
    save_pkl(demo_cvm[k_year], file_name)

    # Reload the complete dictionary so the next year starts from the full data
    demo_cvm = load_pkl('dataframes')


2015
2015 1,824,685 lines, 665 companies, 334 ind/349 ('ind', 'con')/8 con
2015 100.00% 1+0, 15.335876s per item, Remaining: 0h 00m 00s
   0.10% 1+1013, 89.236152s per item, Remaining: 25h 06m 36s
   0.20% 2+1012, 44.714320s per item, Remaining: 12h 34m 10s
   0.30% 3+1011, 29.852419s per item, Remaining: 8h 23m 00s
   0.39% 4+1010, 22.863010s per item, Remaining: 6h 24m 51s
   0.49% 5+1009, 18.607219s per item, Remaining: 5h 12m 54s
   0.59% 6+1008, 15.822672s per item, Remaining: 4h 25m 49s
   0.69% 7+1007, 13.824033s per item, Remaining: 3h 52m 00s
   0.79% 8+1006, 12.311978s per item, Remaining: 3h 26m 25s
   0.89% 9+1005, 11.145991s per item, Remaining: 3h 06m 41s
   0.99% 10+1004, 10.241553s per item, Remaining: 2h 51m 22s
   1.08% 11+1003, 9.467110s per item, Remaining: 2h 38m 15s
   1.18% 12+1002, 8.849418s per item, Remaining: 2h 27m 47s
   1.28% 13+1001, 8.304349s per item, Remaining: 2h 18m 32s
   1.38% 14+1000, 7.759257s per item, Remaining: 2h 09m 19s
   1.48% 15+999, 7.28

##### continue

In [49]:
# Rebuild the {year: object} dictionary by loading 'dataframe_<year>' for 2010..2023
years = range(2010, 2024)
demo_cvm = {}
start_time = time.time()
for i, year in enumerate(years):
    print(year)
    demo_cvm[year] = load_pkl(f'dataframe_{year}')


7.14% 1+13, 0.000000s per item, Remaining: 0h 00m 00s
14.29% 2+12, 1.013370s per item, Remaining: 0h 00m 12s
21.43% 3+11, 0.898703s per item, Remaining: 0h 00m 09s
28.57% 4+10, 0.842455s per item, Remaining: 0h 00m 08s
35.71% 5+9, 0.807410s per item, Remaining: 0h 00m 07s
42.86% 6+8, 0.783481s per item, Remaining: 0h 00m 06s
50.00% 7+7, 0.773238s per item, Remaining: 0h 00m 05s
57.14% 8+6, 0.768163s per item, Remaining: 0h 00m 04s
64.29% 9+5, 0.775538s per item, Remaining: 0h 00m 03s
71.43% 10+4, 0.781384s per item, Remaining: 0h 00m 03s
78.57% 11+3, 0.790937s per item, Remaining: 0h 00m 02s
85.71% 12+2, 0.799111s per item, Remaining: 0h 00m 01s
92.86% 13+1, 0.845550s per item, Remaining: 0h 00m 00s
100.00% 14+0, 0.858929s per item, Remaining: 0h 00m 00s


In [None]:
# save by company
## get an aggregated list of company, agg in all demo_cvm dict
## create a demo_cvm_company dict, transform from year to company

## must create a setor, subsetor, segmento for companies, then use as keys?

In [None]:
# Get all unique companies across all years.
# demo_cvm maps year -> DataFrame, so the DataFrames come from demo_cvm.values()
# (`years` only holds the integer year numbers).
# Assumes every DataFrame has a 'company-agg' column -- TODO confirm where it is created.
all_companies = set()
for df in demo_cvm.values():
    all_companies.update(df['company-agg'].unique())

# Re-key the data by company: one DataFrame per company holding its rows from every year
company_dict = {}
for company in all_companies:
    company_data = [df[df['company-agg'] == company] for df in demo_cvm.values()]

    # Concatenate the data for the company across all years
    company_dict[company] = pd.concat(company_data, ignore_index=True)


In [None]:
df = demo_cvm[2014]

# Row filter: one company, the 'DRE' statement, short account codes, 'ind' grouping
m_company = df['DENOM_CIA'] == 'CENTRAIS ELET BRAS SA  ELETROBRAS'
m_conta = df['BALANCE_SHEET'] == 'DRE'
m_conta_len = df['CD_CONTA'].str.len() <= 4
m_agg = df['AGRUPAMENTO'] == 'ind'
mask = m_company & m_conta & m_conta_len & m_agg

# Filter the DataFrame using the mask (rows and columns in a single .loc call)
filtered_df = df.loc[mask, ['DS_CONTA', 'DT_REFER', 'VL_CONTA']]

# One column per account description, one row per reference date
pivot_df = filtered_df.pivot_table(index='DT_REFER', columns='DS_CONTA', values='VL_CONTA', aggfunc='sum')

# Plot the pivoted DataFrame; plotting stays best-effort, but the reason for a
# failure (for instance an empty pivot) is reported instead of being hidden.
try:
    pivot_df.plot()
except Exception as e:
    print(f'Could not plot: {e}')


In [None]:
df['CD_CONTA'].unique()

In [None]:
m_company = df['DENOM_CIA'] == 'ALPARGATAS SA'
m_conta = df['BALANCE_SHEET'] == 'DRE'
m_conta_len = df['CD_CONTA'].str.len() <= 4
m_agg = df['AGRUPAMENTO'] == 'ind'
mask = m_company & m_conta & m_conta_len & m_agg
df[mask]

In [None]:
# o que caracteriza uma df completa, um relatório estruturado? DENOM_CIA + DT_REFER
# como saber quais empresas são 'con' e quais são 'ind' e como criar uma terceira view mae = con - ind?


In [None]:
companies_by_relest

In [None]:
# SÓ INDIVIDUAL ind = True, con = False: '3A COMPANHIA SECURITIZADORA'
# SÓ CONSOLIDADO ind = False, con = True: 'BANCO SANTANDER SA'
# AMBOS ind = True, con = True: 'ADVANCED DIGITAL HEALTH MEDICINA PREVENTIVA SA'


In [None]:
companies_by_relest['individual']

In [None]:
for k, v in companies_by_relest.items():
    print(k)

In [None]:
relest_individual = True
relest_consolidado = False
pivot_table[(pivot_table['ind'] == relest_individual) & (pivot_table['con'] == relest_consolidado)].index.get_level_values('DENOM_CIA').unique()[0]

In [None]:
cia_cols = ['CNPJ_CIA', 'DENOM_CIA', 'CD_CVM'] # 683 rows
bal_cols = ['FILE_NAME', 'demo_cvmNSTRATIVO', 'BALANCE_SHEET', 'AGRUPAMENTO', 'GRUPO_DFP'] # 32 rows
dt_cols = ['DT_REFER', 'DT_FIM_EXERC'] # 4 rows
cod_conta_cols = ['CD_CONTA', ] # 1543 rows
desc_conta_cols = ['DS_CONTA', ] # 38597 rows
vlr_cta_cols = ['VL_CONTA',]

ubiq_cols = ['ANO', 'MOEDA'] # 1 row
unique__independ_cols = ['VERSAO', 'ESCALA_MOEDA',  'ST_CONTA_FIXA', 'DT_INI_EXERC', 'COLUNA_DF']

In [None]:
# Narrow df to the column groups defined earlier, then build one boolean mask per criterion
df_temp = df[cia_cols+bal_cols+dt_cols+cod_conta_cols+desc_conta_cols+vlr_cta_cols+unique__independ_cols]
mask = df_temp['CD_CVM'] == 2437
mask2 = df_temp['AGRUPAMENTO'] == 'con'
mask3 = df_temp['BALANCE_SHEET'] == 'DRE'
mask4 = df_temp['DT_REFER'].dt.month == 12
mask5 = df_temp['CD_CONTA'] == '3.01'

# NOTE(review): mask4 (December reference dates) is built but not applied below -- confirm intent.
df_temp[mask&mask2&mask3&mask5].drop_duplicates()

In [None]:
df

In [None]:
# group by year

# clean df



In [None]:
sorted_df.groupby('CD_CVM')['VERSAO'].idxmax()


In [None]:
# Draft of a per-year cleaning routine, run inline on df_demo_cvm_y[year].
df = df_demo_cvm_y[year]

# Keep only the rows flagged 'ÚLTIMO'; the flag column is then dropped.
# drop() returns a new DataFrame, so the source DataFrame is not modified.
df = df[df['ORDEM_EXERC'] == 'ÚLTIMO']
df = df.drop(columns=['ORDEM_EXERC'])

# Clean up text
df['DENOM_CIA'] = df['DENOM_CIA'].apply(clean_text)

# Remove companies whose name contains any of these words
# (na=False keeps rows with a missing name instead of failing on them)
words_to_remove = ['LIQUIDACAO', 'JUDICIAL', ]
df = df[~df['DENOM_CIA'].str.contains('|'.join(words_to_remove), na=False)]

# Convert the date columns that are present to datetime
date_columns = ['DT_REFER', 'DT_RECEB', 'DT_FIM_EXERC', ]
for col in date_columns:
    if col not in df.columns:
        continue
    try:
        df = df.assign(**{col: pd.to_datetime(df[col])})
    except (ValueError, TypeError) as e:
        # Invalid date values: keep the column as it is, but say so
        print(f'Could not convert {col} to datetime: {e}')

# Sort by 'CD_CVM' ascending and 'VERSAO' descending
sorted_df = df.sort_values(by=['CD_CVM', 'VERSAO'], ascending=[True, False])
print("After sorting:", sorted_df.shape)

# Index label of the row with the highest version for each 'CD_CVM'
indices_of_max_version = sorted_df.groupby('CD_CVM')['VERSAO'].idxmax()
print("Indices of max version:", indices_of_max_version)

# Rows with the highest version for each 'CD_CVM'
filtered_df = sorted_df.loc[indices_of_max_version]

# NOTE(review): the final df is the name-sorted frame, not filtered_df, so the
# highest-version filter above does not reach df -- confirm this is intended.
df = sorted_df = df.sort_values(by=['DENOM_CIA'], ascending=[True])


# return df.reset_index(drop=True)

In [None]:
df_demo_cvm_y = group_by_year(df_demo_cvm, df_demo_cvm_links)

In [None]:
df_demo_cvm_y[year]

In [None]:
df_demo_cvm_links_y[year][df_demo_cvm_links_y[year]['CD_CVM'] == 15253].head(30)

In [None]:
merged_df = df.merge(df_demo_cvm_links[df_demo_cvm_links['ANO'] == '2023'][cols_common + cols_to_add], on=cols_common, how='left')
merged_df

In [None]:
df[cols_common].drop_duplicates()

In [None]:
df_demo_cvm_links[cols_common].drop_duplicates()

In [None]:
# save to csv
# Each DataFrame goes to its own file in the working directory. With the default
# arguments used here, to_csv also writes the index as the first, unnamed column.

dfp.to_csv('dfp.csv')
dfp_links.to_csv('dfp_links.csv')
itr.to_csv('itr.csv')
itr_links.to_csv('itr_links.csv')


In [None]:
# read from csv
# The files were written with the index as their first column; index_col=0 reads
# it back as the index instead of leaving an extra 'Unnamed: 0' data column.

dfp = pd.read_csv('dfp.csv', index_col=0)
dfp_links = pd.read_csv('dfp_links.csv', index_col=0)
itr = pd.read_csv('itr.csv', index_col=0)
itr_links = pd.read_csv('itr_links.csv', index_col=0)

In [None]:
# Base URL templates; {ID_DOC} is filled with each document's sequential number
url_raw =        'https://www.rad.cvm.gov.br/ENET/frmGerenciaPaginaFRE.aspx?CodigoTipoInstituicao=1&NumeroSequencialDocumento={ID_DOC}'
url_download = 'http://www.rad.cvm.gov.br/ENETCONSULTA/frmDownloadDocumento.aspx?CodigoInstituicao=1&NumeroSequencialDocumento={ID_DOC}'
# This template needs the {ID_DOC} placeholder too: with a fixed document number
# in its place, .format() returns the very same URL for every document.
# NOTE(review): CodigoGrupo, CodigoQuadro and CodTipoDocumento stay fixed -- confirm
# they are valid for every document.
url_relatorio_administracao = 'https://www.rad.cvm.gov.br/ENET/frmExibirArquivoFRE.aspx?NumeroSequencialDocumento={ID_DOC}&CodigoGrupo=1653&CodigoQuadro=0&Tipo=PDF&RelatorioRevisaoEspecial=Sem+Ressalva&CodTipoDocumento=4'

# Generate list of URLs (one entry per row of the corresponding links DataFrame)
dfp_url_list = [url_raw.format(ID_DOC=id_doc) for id_doc in dfp_links['ID_DOC']]
dfp_download_list = [url_download.format(ID_DOC=id_doc) for id_doc in dfp_links['ID_DOC']]
dfp_relatorio_administracao_list = [url_relatorio_administracao.format(ID_DOC=id_doc) for id_doc in dfp_links['ID_DOC']]
itr_url_list = [url_raw.format(ID_DOC=id_doc) for id_doc in itr_links['ID_DOC']]
itr_download_list = [url_download.format(ID_DOC=id_doc) for id_doc in itr_links['ID_DOC']]
itr_relatorio_administracao_list = [url_relatorio_administracao.format(ID_DOC=id_doc) for id_doc in itr_links['ID_DOC']]


In [None]:
def update_version_old(df, links):
    """Copy document metadata from ``links`` onto the matching rows of ``df``.

    Rows are matched on ('ANO', 'demo_cvmNSTRATIVO', 'CNPJ_CIA', 'CD_CVM',
    'DT_REFER'). For every row of ``links`` whose key exists in ``df``, the
    link's 'VERSAO', 'CATEG_DOC', 'ID_DOC' and 'DT_RECEB' are written to the
    'VERSAO_DOC', 'CATEG_DOC', 'ID_DOC' and 'DT_RECEB' columns of all rows of
    that group. Links are applied in order, so a later link with the same key
    overwrites an earlier one. Links without a matching group are skipped.

    Args:
        df: DataFrame to update; modified in place (missing target columns
            are created and filled with None).
        links: DataFrame with the key columns plus the four source columns.

    Returns:
        The same ``df``.
    """
    key_cols = ['ANO', 'demo_cvmNSTRATIVO', 'CNPJ_CIA', 'CD_CVM', 'DT_REFER']
    cols_to_add = ['VERSAO_DOC', 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']
    cols_to_update = ['VERSAO', 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']

    # Create the columns if they don't exist
    for col in cols_to_add:
        if col not in df.columns:
            df[col] = None

    # Map every key tuple to the index labels of its rows, computed once
    groups = df.groupby(key_cols).groups

    total = len(links)
    # enumerate gives the position for the progress line, whatever the index labels are
    for n, (_, row) in enumerate(links.iterrows()):
        key = tuple(row[col] for col in key_cols)
        index = groups.get(key)
        if index is None:
            # This link has no matching rows in df
            continue

        df.loc[index, cols_to_add] = row[cols_to_update].values

        print(f"{n+1}/{total-n} {row['demo_cvmNSTRATIVO']} {row['ANO']} - {row['CD_CVM']} {row['CNPJ_CIA']} - {row['DT_REFER']} versão {row['VERSAO']}")

    return df


In [None]:
def update_version(df, links, grouped_df):
    """Copy document metadata from ``links`` onto each group of ``df``.

    ``grouped_df`` must be ``df`` grouped by ('ANO', 'demo_cvmNSTRATIVO',
    'CNPJ_CIA', 'CD_CVM', 'DT_REFER'). For every group, the rows of ``links``
    with the same five values are looked up and their document columns are
    written to 'VERSAO_DOC', 'CATEG_DOC', 'ID_DOC' and 'DT_RECEB' on all rows
    of the group.

    * Groups without a matching link are left unchanged.
    * When several links match, the last one is used, which is the result
      update_version_old produces by applying links in order.
    * The version is read from ``links['VERSAO_DOC']`` when that column
      exists, otherwise from ``links['VERSAO']``.

    Args:
        df: DataFrame to update; modified in place.
        links: DataFrame with the key columns and the document columns.
        grouped_df: groupby object over ``df`` on the five key columns.

    Returns:
        The same ``df``, with the four document columns converted to
        'category' where that conversion is possible.
    """
    cols_to_add = ['VERSAO_DOC', 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']
    version_col = 'VERSAO_DOC' if 'VERSAO_DOC' in links.columns else 'VERSAO'
    link_cols = [version_col, 'CATEG_DOC', 'ID_DOC', 'DT_RECEB']

    # Create the columns if they don't exist, and make them object-typed so
    # values of any kind can be written into them
    for col in cols_to_add:
        if col not in df.columns:
            df[col] = None
        df[col] = df[col].astype('object')

    total = len(grouped_df)
    for i, (key, index) in enumerate(grouped_df.groups.items()):
        print(f'{i}/{total} {key}')

        ano, demo_cvmnstrativo, cnpj_cia, cd_cvm, dt_refer = key

        # Filter links based on the group's conditions
        mask = (
            (links['ANO'] == ano) &
            (links['demo_cvmNSTRATIVO'] == demo_cvmnstrativo) &
            (links['CNPJ_CIA'] == cnpj_cia) &
            (links['CD_CVM'] == cd_cvm) &
            (links['DT_REFER'] == dt_refer)
        )
        matches = links.loc[mask, link_cols]
        if matches.empty:
            continue

        # One scalar per column, applied to every row of the group
        for col, value in zip(cols_to_add, matches.iloc[-1].tolist()):
            df.loc[index, col] = value

    # Convert the columns to category type; a column that cannot be converted
    # (for instance one holding unhashable values) is left as object
    for col in cols_to_add:
        try:
            df[col] = df[col].astype('category')
        except (TypeError, ValueError):
            pass

    return df


In [None]:
# Group itr by the five key columns that update_version unpacks from each group key
group_columns = ['ANO', 'demo_cvmNSTRATIVO', 'CNPJ_CIA', 'CD_CVM', 'DT_REFER']
grouped_df = itr.groupby(group_columns)

# Fill the document columns of itr from itr_links
itr = update_version(itr, itr_links, grouped_df)


# Same steps for dfp (inactive):
# # Group the df DataFrame by the conditions
# group_columns = ['ANO', 'demo_cvmNSTRATIVO', 'CNPJ_CIA', 'CD_CVM', 'DT_REFER']
# grouped_df = dfp.groupby(group_columns)

# dfp = update_version(dfp, dfp_links, grouped_df)