# Data Science Notebook 

In [1]:
import os, sys
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from sklearn import set_config

In [2]:
# Turn off jedi
%config IPCompleter.use_jedi=False

# Set sklearn output to pandas
set_config(transform_output = "pandas")

workdir = os.path.dirname(os.getcwd())

sys.path.append(workdir)

pd.options.display.max_columns = None

warnings.filterwarnings("ignore")

# display(
#     HTML(
#         """
#         <style>
#         .container { width:100% !important; }
#         </style>
#         """
#     )
# )

In [3]:
import os
from basix import files
from src.data.load import get_links_by_types, download_file, extract_filename, extract_type, read_csv_within_zip
from src.config import PATH_ZIP_DIRECTORY

# Fetch the download links grouped by database type.
urls = get_links_by_types()

database_type = 'Empresas'

# Look the links up with `database_type` (not a repeated literal) so the URL
# always matches the directory the file is stored in below.
# NOTE(review): index 1 picks the second link for this type -- confirm that
# this is the intended archive.
url = urls[database_type][1]
filename = extract_filename(url)
directory_path = os.path.join(PATH_ZIP_DIRECTORY, database_type)
local_filepath = os.path.join(directory_path, filename)
files.make_directory(directory_path)
download_file(url, local_filepath)

# Load the downloaded archive into a dataframe.
df = read_csv_within_zip(local_filepath)

In [4]:
# Fixed seed so the previewed rows are the same on every Restart & Run All.
df.sample(3, random_state=42)

Unnamed: 0,cnpj_base,razao_social,natureza_juridica,qualificacao,capital_social,porte_empresa,ente_federativo
2802612,2891503,ASSOCIACAO MORADORES AGROVILA MONTE ALTO,3999,16,0,5.0,
2813093,2902033,M RIBAS CONSULTORES ASSOCIADOS LTDA,2062,49,0,1.0,
3822229,3942771,SHIRATORI E RODRIGUES BAZAR LTDA,2062,49,0,1.0,


In [5]:
import pandas as pd
import zipfile

# Path of the archive, relative to the current working directory.
zipfile_path = 'data/raw/Empresas0.zip'

# Context managers close both the archive and the member handle once the
# CSV has been parsed.
with zipfile.ZipFile(zipfile_path) as zf:
    with zf.open('intfile.csv') as csv_member:
        df = pd.read_csv(csv_member)

In [10]:
from src.config import DATABASE_COLUMNS

# Display the column names registered in DATABASE_COLUMNS for `database_type`.
DATABASE_COLUMNS[database_type]

['cnpj_base',
 'razao_social',
 'natureza_juridica',
 'qualificacao',
 'capital_social',
 'porte_empresa',
 'ente_federativo']

In [12]:
import zipfile
from src.config import DATABASE_COLUMNS

def read_csv_within_zip(zip_filepath: str, columns: "list[str] | None" = None) -> pd.DataFrame:
    """
    Read every member of a zip file as a CSV and concatenate the results.

    Each member is parsed with ``sep=";"``, ``encoding="latin-1"`` and no
    header row, and then has its columns renamed. A member that cannot be
    parsed, or whose column count does not match, is skipped and a warning
    is logged.

    Args:
        zip_filepath (str): the file path to the zip file. When ``columns`` is
            not given, the name of the directory that contains the zip file
            is used as the key into ``DATABASE_COLUMNS``.
        columns (list[str] | None): column names to assign to every member.
            Defaults to ``DATABASE_COLUMNS[<name of the parent directory>]``.

    Returns:
        pd.DataFrame: the rows of all readable members in archive order, each
        member keeping its own row index. An empty dataframe is returned when
        no member could be read.

    Raises:
        KeyError: if ``columns`` is omitted and the parent directory name is
            not a key of ``DATABASE_COLUMNS``.
    """
    import logging

    logger = logging.getLogger(__name__)

    if columns is None:
        # Derive the database type from the function's own argument (not from
        # a notebook global) and in a path-separator independent way.
        database_type = os.path.basename(os.path.dirname(zip_filepath))
        columns = DATABASE_COLUMNS[database_type]

    pdread_opts = dict(sep=";", encoding="latin-1", header=None)

    # Collect the pieces and concatenate once; concatenating inside the loop
    # re-copies all previously read rows on every iteration.
    frames = []

    with zipfile.ZipFile(zip_filepath) as zf:
        for filename in zf.namelist():
            try:
                with zf.open(filename) as member:
                    temp = pd.read_csv(member, **pdread_opts)
                temp.columns = columns
            except Exception as err:
                # Best effort: one unreadable member must not discard the rest.
                logger.warning(
                    f"It was not possible to read the file {filename} "
                    f"within the zip file {zip_filepath}: {err}"
                )
                continue
            frames.append(temp)

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [15]:
# Display the `porte_empresa` column.
# NOTE(review): in the visible cells `dataframe` is only assigned as a local
# inside `read_csv_within_zip`; confirm where the top-level `dataframe` comes
# from, otherwise this cell fails on a fresh kernel.
dataframe['porte_empresa']

0           1.0
1           1.0
2           1.0
3           1.0
4           1.0
           ... 
11890346    1.0
11890347    1.0
11890348    5.0
11890349    1.0
11890350    5.0
Name: porte_empresa, Length: 11890351, dtype: float64

In [12]:
# Bare list of column names; it is only displayed, not assigned to a name.
# The names match the list shown for DATABASE_COLUMNS[database_type] in an
# earlier cell.
[
    "cnpj_base", 
    "razao_social", 
    "natureza_juridica", 
    "qualificacao", 
    "capital_social", 
    "porte_empresa", 
    "ente_federativo"
]

Unnamed: 0,0,1,2,3,4,5,6
0,41273597,PACHARRUS QUEIROZ DA COSTA E SILVA 03618384335,2135,50,500000,1.0,
1,41273598,GLORIA VIANA DIAS DA SILVA 13118961716,2135,50,110000,1.0,
2,41273599,ANA PAULA DA SILVA DE PAULA 04659802957,2135,50,200000,1.0,
3,41273600,AVANILSON BRUNO MATIAS DA SILVA 08778601495,2135,50,5000000,1.0,
4,41273601,GABRIELA HELENA FACINI DA SILVA 47022415838,2135,50,200000,1.0,
...,...,...,...,...,...,...,...
11890346,98819550,ANTONIO ORIQUES CARDOSO,2135,50,000,1.0,
11890347,98819568,ELI ORIQUES CARDOSO,2135,50,000,1.0,
11890348,98819600,PAULINO LEMOS DA SILVA,2135,50,000,5.0,
11890349,98819832,JOAO FELTRIN,2135,50,000,1.0,


In [21]:


from bs4 import BeautifulSoup, SoupStrainer
import requests


def get_links(timeout: float = 30):
    """
    Yield the URL of every ``.zip`` link found on the page at ``BASE_URL``.

    The page is downloaded, its ``<a>`` tags are scanned, and every ``href``
    ending in ``.zip`` is yielded as an absolute URL. Relative hrefs are
    resolved against ``BASE_URL``; absolute hrefs are yielded unchanged.

    Args:
        timeout (float): seconds to wait for the server before giving up.

    Yields:
        str: absolute URL of a zip file linked from the page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled server would hang the notebook forever.
    page = requests.get(BASE_URL, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero links.
    page.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page.text, "html.parser")

    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None or not str(href).endswith('.zip'):
            continue
        # NOTE(review): an earlier, disabled workaround rewrote hrefs starting
        # with 'http://http'; such hrefs are passed through unchanged here.
        yield urljoin(BASE_URL, href)

In [24]:
# get_links() is a generator; materialise it into a list so the links can be
# inspected and reused.
links = list(get_links())

In [13]:
from src.config import DATABASE_TYPES

# Build a mapping with one key per database type, every value set to None.
dict.fromkeys(DATABASE_TYPES)

{'Cnaes': None,
 'Empresas': None,
 'Estabelecimentos': None,
 'Motivos': None,
 'Municipios': None,
 'Naturezas': None,
 'Paises': None,
 'Qualificacoes': None,
 'Simples': None,
 'Socios': None}