#### Downloading stock market data from the CVM website.
#### The data will consist of quarterly reports spanning from 2012 to 2022.

In [1]:
#pd.set_option('display.max_columns', 10)
pd.reset_option('display.max_columns')
pd.set_option('display.width', 200)
#pd.reset_option('display.width')

NameError: name 'pd' is not defined

In [None]:
#
# install libs
#
%pip install requests beautifulsoup4
%pip install chardet
%pip install files
%pip install chardet

In [None]:
#
# download balance sheets files
#
import os
import requests
from urllib.parse import urljoin
import zipfile
import chardet

# Function to check if a file's encoding is UTF-8
def is_utf8(file_path):
    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
            result = chardet.detect(raw_data)
            return result['encoding'] == 'utf-8'
    except Exception:
        return False

# URL of the website containing the ZIP files
base_url = "https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/ITR/DADOS/"

# Create a directory to save the downloaded files
download_dir = "downloaded_files"
os.makedirs(download_dir, exist_ok=True)

# Define the years you want to download (2012 to 2022)
years_to_download = set(str(year) for year in range(2012, 2023))

# Send an HTTP GET request to the URL
response = requests.get(base_url)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all links on the page
links = soup.find_all("a")

# Iterate through the links and download ZIP files for the specified years
for link in links:
    file_url = urljoin(base_url, link["href"])
    
    # Example: Assuming ZIP files are named like "itr_cia_aberta_YEAR.zip"
    if file_url.endswith(".zip"):
        zip_file_name = os.path.basename(file_url)
        
        # Extract the year from the ZIP file name
        year_part = zip_file_name.split("_")[-1].split(".")[0]
        
        # Check if the ZIP file is from a year within the specified range
        if year_part in years_to_download:
            # Check if the ZIP file already exists
            if not os.path.exists(os.path.join(download_dir, zip_file_name)):
                print(f"Downloading: {zip_file_name}")
                with open(os.path.join(download_dir, zip_file_name), "wb") as file:
                    file_response = requests.get(file_url)
                    file.write(file_response.content)
            else:
                print(f"File already exists: {zip_file_name}")

# Process the downloaded ZIP files (extract, remove "ind" files, and convert)
for zip_file_name in os.listdir(download_dir):
    if zip_file_name.endswith(".zip"):
        zip_file_path = os.path.join(download_dir, zip_file_name)
        zip_subdir = os.path.splitext(zip_file_name)[0]  # Use ZIP file name without extension as subdirectory name
        zip_subdir_path = os.path.join(download_dir, zip_subdir)

        # Check if the ZIP file has already been extracted
        if not os.path.exists(zip_subdir_path):
            print(f"Unzipping: {zip_file_name} -> {zip_subdir_path}")
            with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
                zip_ref.extractall(zip_subdir_path)

            # Remove files with "ind" in their names
            for root, _, files in os.walk(zip_subdir_path):
                for file_name in files:
                    if "ind" in file_name:
                        file_path = os.path.join(root, file_name)
                        os.remove(file_path)
                        print(f"Removed: {file_path}")

            # Remove files with "itr_cia_aberta_20" in their names
            for root, _, files in os.walk(zip_subdir_path):
                for file_name in files:
                    if "itr_cia_aberta_20" in file_name:
                        file_path = os.path.join(root, file_name)
                        os.remove(file_path)
                        print(f"Removed: {file_path}")

            # Remove files with "itr_cia_aberta_DFC_MD_con_20" in their names
            for root, _, files in os.walk(zip_subdir_path):
                for file_name in files:
                    if "itr_cia_aberta_DFC_MD_con_20" in file_name:
                        file_path = os.path.join(root, file_name)
                        os.remove(file_path)
                        print(f"Removed: {file_path}")

            # Convert CSV files from ISO-8859-1 to UTF-8 only if they are not already UTF-8
            for csv_file_name in os.listdir(zip_subdir_path):
                if csv_file_name.endswith(".csv"):
                    csv_file_path = os.path.join(zip_subdir_path, csv_file_name)
                    if not is_utf8(csv_file_path):
                        print(f"Converting encoding: {csv_file_path}")
                        with open(csv_file_path, 'r', encoding='ISO-8859-1') as source_file:
                            content = source_file.read()
                        with open(csv_file_path, 'w', encoding='utf-8') as target_file:
                            target_file.write(content)
                    else:
                        print(f"File already in UTF-8: {csv_file_path}")

print("Download, extraction, file removal, and encoding conversion completed.")


In [None]:
#
# create the datasets
#
import pandas as pd
import numpy as np
import re

bpa_df = None
bpp_df = None
dfc_mi_df = None
dmpl_df = None
dra_df = None
dre_df = None
dva_df = None

# Load files into Data Frames
for dir_name in os.listdir(download_dir):
    if not dir_name.endswith(".zip"):
        dir_path = os.path.join(download_dir, dir_name)

        for csv_file_name in os.listdir(dir_path):
            csv_file_path = os.path.join(dir_path, csv_file_name)
            print("\n-------------------------------------------------------")
            print(f"Reading csv file: {csv_file_path}")

            csv_file_type = os.path.splitext(csv_file_name)[0][15:19].replace("_", "")
            print(f"CSV type: {csv_file_type}")

            #local_df = pd.read_csv(csv_file_path, sep=';', index_col='CD_CVM')
            local_df = pd.read_csv(csv_file_path, sep=';')
            print(f"Dataframe size: {len(local_df)}")

            match csv_file_type:
                case 'BPA':
                    if bpa_df is None:
                        bpa_df = local_df
                    else:
                        bpa_df = pd.concat([bpa_df, local_df])
                    print(f"Dataframe size - after concat: {len(bpa_df)}")

                case 'BPP':
                    if bpp_df is None:
                        bpp_df = local_df
                    else:
                        bpp_df = pd.concat([bpp_df, local_df])
                    print(f"Dataframe size - after concat: {len(bpp_df)}")

                case 'DFC':
                    if dfc_mi_df is None:
                        dfc_mi_df = local_df
                    else:
                        dfc_mi_df = pd.concat([dfc_mi_df, local_df])
                    print(f"Dataframe size - after concat: {len(dfc_mi_df)}")

                case 'DMPL':
                    if dmpl_df is None:
                        dmpl_df = local_df
                    else:
                        dmpl_df = pd.concat([dmpl_df, local_df])
                    print(f"Dataframe size - after concat: {len(dmpl_df)}")

                case 'DRA':
                    if dra_df is None:
                        dra_df = local_df
                    else:
                        dra_df = pd.concat([dra_df, local_df])
                    print(f"Dataframe size - after concat: {len(dra_df)}")

                case 'DRE':
                    if dre_df is None:
                        dre_df = local_df
                    else:
                        dre_df = pd.concat([dre_df, local_df])
                    print(f"Dataframe size - after concat: {len(dre_df)}")

                case 'DVA':
                    if dva_df is None:
                        dva_df = local_df
                    else:
                        dva_df = pd.concat([dva_df, local_df])
                    print(f"Dataframe size - after concat: {len(dva_df)}")

# Adding missing column for BPA and BPP types
bpa_df.insert(8, 'DT_INI_EXERC', bpa_df['DT_FIM_EXERC'].apply(lambda x: re.sub('(\d{4}-\d{2})-(\d{2})', '\\1-01', x)))
bpp_df.insert(8, 'DT_INI_EXERC', bpp_df['DT_FIM_EXERC'].apply(lambda x: re.sub('(\d{4}-\d{2})-(\d{2})', '\\1-01', x)))



In [None]:
# analising data

def print_df(df_name, df):
    print(f'\n\n{df_name}: {len(df):,} records')
    print('-------------------------------------------------------------------')
    print('\t\t\t TYPES')
    print(df.dtypes)

    print('\t\t\t NULL VALUES')
    print('CNPJ_CIA: ' + str(df['CNPJ_CIA'].isnull().sum().sum()))
    print('DT_REFER: ' + str(df['DT_REFER'].isnull().sum().sum()))
    print('VERSAO: ' + str(df['VERSAO'].isnull().sum().sum()))
    print('DENOM_CIA: ' + str(df['DENOM_CIA'].isnull().sum().sum()))
    print('CD_CVM: ' + str(df['CD_CVM'].isnull().sum().sum()))
    print('GRUPO_DFP: ' + str(df['GRUPO_DFP'].isnull().sum().sum()))
    print('MOEDA: ' + str(df['MOEDA'].isnull().sum().sum()))
    print('ESCALA_MOEDA: ' + str(df['ESCALA_MOEDA'].isnull().sum().sum()))
    print('ORDEM_EXERC: ' + str(df['ORDEM_EXERC'].isnull().sum().sum()))
    print('DT_INI_EXERC: ' + str(df['DT_INI_EXERC'].isnull().sum().sum()))
    print('DT_FIM_EXERC: ' + str(df['DT_FIM_EXERC'].isnull().sum().sum()))
    print('CD_CONTA: ' + str(df['CD_CONTA'].isnull().sum().sum()))
    print('DS_CONTA: ' + str(df['DS_CONTA'].isnull().sum().sum()))
    print('VL_CONTA: ' + str(df['VL_CONTA'].isnull().sum().sum()))
    print('ST_CONTA_FIXA: ' + str(df['ST_CONTA_FIXA'].isnull().sum().sum()))

    print('\t\t\t COUNT UNIQUE VALUES')
    print('CNPJ_CIA: ' + str(len(df['CNPJ_CIA'].unique())))
    print('DT_REFER: ' + str(len(df['DT_REFER'].unique())))
    print('VERSAO: ' + str(len(df['VERSAO'].unique())))
    print('DENOM_CIA: ' + str(len(df['DENOM_CIA'].unique())))
    print('CD_CVM: ' + str(len(df['CD_CVM'].unique())))
    print('GRUPO_DFP: ' + str(len(df['GRUPO_DFP'].unique())))
    print('MOEDA: ' + str(len(df['MOEDA'].unique())))
    print('ESCALA_MOEDA: ' + str(len(df['ESCALA_MOEDA'].unique())))
    print('ORDEM_EXERC: ' + str(len(df['ORDEM_EXERC'].unique())))
    print('DT_INI_EXERC: ' + str(len(df['DT_INI_EXERC'].unique())))
    print('DT_FIM_EXERC: ' + str(len(df['DT_FIM_EXERC'].unique())))
    print('CD_CONTA: ' + str(len(df['CD_CONTA'].unique())))
    print('DS_CONTA: ' + str(len(df['DS_CONTA'].unique())))
    print('VL_CONTA: ' + str(len(df['VL_CONTA'].unique())))
    print('ST_CONTA_FIXA: ' + str(len(df['ST_CONTA_FIXA'].unique())))

    print('\t\t\t UNIQUE VALUES')
    print('CNPJ_CIA: ' + str(len(df['CNPJ_CIA'].unique())))
    print('DT_REFER: ' + str(len(df['DT_REFER'].unique())))
    print('VERSAO: ' + str(df['VERSAO'].unique()))
    print('DENOM_CIA: ' + str(len(df['DENOM_CIA'].unique())))
    print('CD_CVM: ' + str(len(df['CD_CVM'].unique())))
    print('GRUPO_DFP: ' + str(df['GRUPO_DFP'].unique()))
    print('MOEDA: ' + str(df['MOEDA'].unique()))
    print('ESCALA_MOEDA: ' + str(df['ESCALA_MOEDA'].unique()))
    print('ORDEM_EXERC: ' + str(df['ORDEM_EXERC'].unique()))
    print('DT_INI_EXERC: ' + str(len(df['DT_INI_EXERC'].unique())))
    print('DT_FIM_EXERC: ' + str(len(df['DT_FIM_EXERC'].unique())))
    print('CD_CONTA: ' + str(len(df['CD_CONTA'].unique())))
    print('DS_CONTA: ' + str(len(df['DS_CONTA'].unique())))
    print('VL_CONTA: ' + str(len(df['VL_CONTA'].unique())))
    print('ST_CONTA_FIXA: ' + str(df['ST_CONTA_FIXA'].unique()))


print_df('bpa_df', bpa_df)
print_df('bpp_df', bpp_df)
print_df('dfc_mi_df', dfc_mi_df)
print_df('dmpl_df', dmpl_df)
print_df('dra_df', dra_df)
print_df('dre_df', dre_df)
print_df('dva_df', dva_df)
