#### Downloading stock market data from the CVM website.
#### The data will consist of quarterly reports spanning from 2011 to 2023.

In [3]:
#
# install libs
#
!pip install requests beautifulsoup4
!pip install chardet




In [None]:
#
# download balance sheets files
#
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import zipfile
import chardet

# Function to check if a file's encoding is UTF-8
def is_utf8(file_path):
    """Return True if the file at *file_path* holds valid UTF-8 bytes.

    The check is a strict decode of the whole file rather than a statistical
    guess, so pure-ASCII and empty files (both valid UTF-8) count as UTF-8,
    and a file is only reported as non-UTF-8 when it really contains byte
    sequences that cannot be decoded.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True when every byte of the file decodes as UTF-8; False when the
        file cannot be read or contains invalid UTF-8 sequences.
    """
    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
    except OSError:
        # Unreadable or missing file: report "not UTF-8", as before.
        return False

    try:
        raw_data.decode('utf-8')
    except UnicodeDecodeError:
        return False
    return True

# URL of the CVM page that links to the ITR ZIP files
base_url = "https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/ITR/DADOS/"

# Fetch the page. The timeout keeps the cell from hanging forever if the
# server stops responding; HTTP error statuses raise immediately.
response = requests.get(base_url, timeout=60)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Collect the anchors on the page; anchors without an href attribute cannot
# point to a file, so they are left out.
links = soup.find_all("a", href=True)

# Create the directory that will hold the extracted files
download_dir = "downloaded_files"
os.makedirs(download_dir, exist_ok=True)

# Walk the links on the page and, for every ZIP archive found:
#   1. download it (skipped when the archive is already on disk),
#   2. extract it into its own subdirectory of download_dir,
#   3. re-encode the extracted CSV files from ISO-8859-1 to UTF-8.
for link in links:
    # Anchors without an href attribute cannot reference a file.
    href = link.get("href")
    if not href:
        continue

    file_url = urljoin(base_url, href)
    if not file_url.endswith(".zip"):
        continue

    # The archive itself is stored in the current working directory; only the
    # extracted content goes under download_dir, in a subdirectory named
    # after the archive (without the .zip extension).
    zip_file_name = os.path.basename(file_url)
    zip_subdir = os.path.splitext(zip_file_name)[0]
    zip_subdir_path = os.path.join(download_dir, zip_subdir)

    # Check if the ZIP file already exists
    if not os.path.exists(zip_file_name):
        print(f"Downloading: {zip_file_name}")
        # Request first and fail on HTTP errors, so an error page is never
        # saved as a ZIP file.
        file_response = requests.get(file_url, timeout=60)
        file_response.raise_for_status()
        # Write to a temporary name and rename afterwards: an interrupted
        # download must not leave a truncated file that a later run would
        # mistake for a complete archive.
        partial_file_name = zip_file_name + ".part"
        with open(partial_file_name, "wb") as file:
            file.write(file_response.content)
        os.replace(partial_file_name, zip_file_name)
    else:
        print(f"File already exists: {zip_file_name}")

    # Create a subdirectory for the ZIP file
    os.makedirs(zip_subdir_path, exist_ok=True)

    # An empty subdirectory means the archive has not been extracted yet
    if not os.listdir(zip_subdir_path):
        print(f"Unzipping: {zip_file_name} -> {zip_subdir_path}")
        with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
            zip_ref.extractall(zip_subdir_path)
    else:
        print(f"Files already extracted: {zip_file_name} -> {zip_subdir_path}")

    # Convert CSV files from ISO-8859-1 to UTF-8 only if they are not already UTF-8
    for csv_file_name in os.listdir(zip_subdir_path):
        if not csv_file_name.endswith(".csv"):
            continue
        csv_file_path = os.path.join(zip_subdir_path, csv_file_name)
        if not is_utf8(csv_file_path):
            print(f"Converting encoding: {csv_file_path}")
            # Read everything before reopening for writing: source and
            # target are the same path.
            with open(csv_file_path, 'r', encoding='ISO-8859-1') as source_file:
                content = source_file.read()
            with open(csv_file_path, 'w', encoding='utf-8') as target_file:
                target_file.write(content)
        else:
            print(f"File already in UTF-8: {csv_file_path}")

print("Download, extraction, and encoding conversion completed.")


Downloading: itr_cia_aberta_2011.zip
Files already extracted: itr_cia_aberta_2011.zip -> downloaded_files/itr_cia_aberta_2011
File already in UTF-8: downloaded_files/itr_cia_aberta_2011/itr_cia_aberta_2011.csv
File already in UTF-8: downloaded_files/itr_cia_aberta_2011/itr_cia_aberta_BPA_con_2011.csv
File already in UTF-8: downloaded_files/itr_cia_aberta_2011/itr_cia_aberta_BPA_ind_2011.csv


In [None]:
#
# create the datasets
#