### Importação de bibliotecas

In [None]:
import pandas as pd

### Ler dados

In [None]:
def load_data_csv(
    file_path: str,
    low_memory: bool = False,
    dtype=None,
    compression="infer",
) -> pd.DataFrame:
    """
    Load data from a CSV file into a pandas DataFrame.

    Every optional argument is forwarded unchanged to ``pandas.read_csv``.

    :param file_path: Path to the CSV file.
    :param low_memory: When False (the default here), the file is not parsed
        in internal chunks, which per the pandas documentation avoids
        mixed-type inference on large files.
    :param dtype: Optional data type, or dict mapping column names to data
        types, applied while parsing. ``None`` lets pandas infer the types.
    :param compression: Compression of the file. ``"infer"`` (the pandas
        default) lets pandas pick the codec from the file extension; pass an
        explicit value such as ``"gzip"`` when the extension is not one
        pandas recognises.
    :return: pandas DataFrame with the loaded data.
    """
    return pd.read_csv(
        file_path,
        dtype=dtype,
        compression=compression,
        low_memory=low_memory,
    )

def load_data_excel(url: str, sheet_name=0):
    """
    Load data from an Excel file.

    :param url: URL or local path of the Excel file.
    :param sheet_name: Sheet selector forwarded to ``pandas.read_excel``.
        The default ``0`` reads only the first sheet; pass ``None`` to read
        every sheet, or a name / index / list of those to pick specific ones.
    :return: A single DataFrame when one sheet is selected; a dict mapping
        each sheet key to its DataFrame when ``sheet_name`` is ``None`` or a
        list.
    """
    return pd.read_excel(url, sheet_name=sheet_name)

# Read both inputs up front: the loan-status file from the local data folder
# and the LendingClub data-dictionary workbook fetched over HTTPS.
df = load_data_csv(file_path="../data/Loan_status_2007-2020Q3.gzip")
df2 = load_data_excel(
    url="https://resources.lendingclub.com/LCDataDictionary.xlsx"
)


In [36]:
# Show what was loaded from the data-dictionary workbook.
df2

{'LoanStats':                    LoanStatNew                                        Description
 0               acc_now_delinq  The number of accounts on which the borrower i...
 1         acc_open_past_24mths         Number of trades opened in past 24 months.
 2                   addr_state  The state provided by the borrower in the loan...
 3                     all_util              Balance to credit limit on all trades
 4                   annual_inc  The self-reported annual income provided by th...
 5             annual_inc_joint  The combined self-reported annual income provi...
 6             application_type  Indicates whether the loan is an individual ap...
 7                  avg_cur_bal            Average current balance of all accounts
 8               bc_open_to_buy          Total open to buy on revolving bankcards.
 ..                         ...                                                ...
 144       debt_settlement_flag  Flags whether or not the borrower, who ha

### Decoradores

In [4]:
def remove_values(values=None):
    """
    Build a decorator that filters rows out of a function's DataFrame result.

    The decorated function must return a DataFrame. Any row in which at least
    one cell matches one of ``values`` is dropped from that result.

    :param values: Values to look for, in any form accepted by
        ``DataFrame.isin`` (e.g. a list). ``None`` (the default) disables the
        filtering and the result is returned untouched.
    :return: Decorator to apply to a function that returns a DataFrame.
    """
    # Imported locally so this block does not depend on a file-level import.
    from functools import wraps

    def decorator(function):
        @wraps(function)  # keep the wrapped function's name and docstring
        def wrapper(data, *args, **kwargs) -> pd.DataFrame:
            dataframe = function(data, *args, **kwargs)  # call the original

            # DataFrame.isin(None) raises TypeError, so "no values" has to be
            # handled explicitly as "nothing to remove".
            if values is None:
                return dataframe

            # Keep only the rows in which no cell matches the given values.
            return dataframe[~dataframe.isin(values).any(axis=1)]
        return wrapper
    return decorator

In [None]:
# Split X / y into train and test parts with a fixed seed so the split is
# reproducible. NOTE(review): train_test_split, X and y are not defined in
# this part of the file -- presumably scikit-learn's helper and data prepared
# in another cell; confirm they are in scope before this cell runs.
X_Train, X_Test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Quick sanity check: shapes of the four parts, then the mean of the target
# in each part.
print(*(part.shape for part in (X_Train, X_Test, y_train, y_test)))
print(y_train.mean(), y_test.mean())

### Exploração de dados

In [None]:
@remove_values(values=[0])
def missing_summary(dataframe):
    """
    Summarise the missing values of each column of a DataFrame.

    The result has one row per column of ``dataframe`` and two columns:
    ``missing_count`` (number of null cells) and ``missing_percentage``
    (share of null cells, in percent), sorted from most to least missing.
    The ``remove_values(values=[0])`` decorator then drops every summary row
    that contains a 0.

    :param dataframe: DataFrame to inspect.
    :return: DataFrame with the missing-value summary.
    """
    # Build the null mask once and derive both statistics from it.
    null_mask = dataframe.isnull()

    summary = pd.DataFrame(
        {
            "missing_count": null_mask.sum(),
            "missing_percentage": null_mask.mean() * 100,
        }
    )
    return summary.sort_values(by="missing_percentage", ascending=False)

In [None]:
# Transposed so each summarised column of X_Train is shown as a column.
missing_summary(X_Train).T