### Importação de bibliotecas

In [2]:
import functools

import pandas as pd
from sklearn.model_selection import train_test_split

### Ler dados

In [3]:
class DataLoader:
    """Load a tabular file into a pandas DataFrame based on a declared file type.

    The supported ``file_type`` values are the keys of ``load_functions``
    ("csv" and "excel"). Extra keyword arguments are forwarded unchanged to
    the matching pandas reader.
    """

    def __init__(self, file_path: str, file_type: str, **kwargs):
        """Store the source location, its declared type and the reader options.

        Args:
            file_path: Location handed to the pandas reader.
            file_type: Key selecting the reader; see ``load_functions``.
            **kwargs: Options forwarded to ``pd.read_csv`` / ``pd.read_excel``.
        """
        self.file_path = file_path
        self.file_type = file_type
        self.kwargs = kwargs
        # Dispatch table: file type -> bound loader method.
        self.load_functions = {
            "csv": self.load_csv,
            "excel": self.load_excel
        }

    def load_data(self):
        """Read the file with the loader registered for ``file_type``.

        Returns:
            Whatever the selected pandas reader returns.

        Raises:
            ValueError: If ``file_type`` has no registered loader.
        """
        # Only the lookup is guarded: a KeyError raised inside the reader itself
        # must propagate as-is instead of being reported as an unsupported type.
        try:
            loader = self.load_functions[self.file_type]
        except KeyError:
            raise ValueError(f"Unsupported file type: {self.file_type}") from None
        return loader()

    def load_csv(self):
        """Read ``file_path`` with ``pd.read_csv``, forwarding the stored kwargs."""
        return pd.read_csv(self.file_path, **self.kwargs)

    def load_excel(self):
        """Read ``file_path`` with ``pd.read_excel``, forwarding the stored kwargs."""
        return pd.read_excel(self.file_path, **self.kwargs)
    

# Usage examples: read the loan records (CSV reader) and the data dictionary (Excel reader).
# NOTE(review): ".gzip" is not a suffix pandas uses to infer compression, so this
# presumably reads a plain-text CSV despite the name -- confirm.
LOAN_DATA_PATH = "../data/Loan_status_2007-2020Q3.gzip"
DATA_DICTIONARY_URL = "https://resources.lendingclub.com/LCDataDictionary.xlsx"

csv_loader = DataLoader(
    file_path=LOAN_DATA_PATH,
    file_type="csv",
    low_memory=False,
)
df = csv_loader.load_data()

excel_loader = DataLoader(
    file_path=DATA_DICTIONARY_URL,
    file_type="excel",
)
df2 = excel_loader.load_data()


In [4]:
# Print the first and last rows of each loaded frame (loan data first, then df2).
for frame in (df, df2):
    print(frame.head())
    print(frame.tail())

   Unnamed: 0       id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0           0  1077501     5000.0       5000.0           4975.0   36 months   
1           1  1077430     2500.0       2500.0           2500.0   60 months   
2           2  1077175     2400.0       2400.0           2400.0   36 months   
3           3  1076863    10000.0      10000.0          10000.0   36 months   
4           4  1075358     3000.0       3000.0           3000.0   60 months   

  int_rate  installment grade sub_grade  ... hardship_start_date  \
0   10.65%       162.87     B        B2  ...                 NaN   
1   15.27%        59.83     C        C4  ...                 NaN   
2   15.96%        84.33     C        C5  ...                 NaN   
3   13.49%       339.31     C        C1  ...                 NaN   
4   12.69%        67.79     B        B5  ...                 NaN   

  hardship_end_date payment_plan_start_date  hardship_length hardship_dpd  \
0               NaN                    

### Pré-processamento de dados
Pré-processamento é um termo amplo que abrange a filtragem, a transformação e qualquer outro tipo de preparação feita nos dados antes da análise ou da modelagem.

In [5]:
# Binary encoding of the final loan outcome: 'Fully Paid' -> 0, 'Charged Off' -> 1.
# The same mapping drives both the row filter and the encoding so they cannot drift apart.
loan_status_codes = {
    'Fully Paid': 0,
    'Charged Off': 1
}

# Keep only the rows whose status is one of the two encoded outcomes.
# .copy() detaches the filtered frame from the original, so the column assignment
# below writes to an independent object and does not raise SettingWithCopyWarning.
df = df.loc[df['loan_status'].isin(list(loan_status_codes))].copy()

# Replace the status labels with their numeric codes.
df['loan_status'] = df['loan_status'].map(loan_status_codes)


### Amostragem de Dados (Sampling)

In [6]:
# Separate the target column name from the feature column names.
target = 'loan_status'
# Read the names from the column index: dropping the label there avoids copying
# the whole DataFrame just to list its remaining columns.
features = df.columns.drop(target).to_list()

# Split the data set into the features (X) and the target variable (y)
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed random_state makes the split repeatable.
X_Train, X_Test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check the dimensions of the train and test sets
print(X_Train.shape, X_Test.shape, y_train.shape, y_test.shape)

# Response rate (mean of the 0/1 target) in the train and test sets
print(y_train.mean(), y_test.mean())


(1488264, 141) (372067, 141) (1488264,) (372067,)
0.19488343465944213 0.19488425471756432


### Decoradores

In [7]:
def remove_values(values=None):
    """Build a decorator that drops rows containing any of ``values``.

    The decorated function must return a DataFrame. Every row of that result
    in which at least one cell matches ``values`` (per ``DataFrame.isin``) is
    removed before the result is handed back to the caller.

    Args:
        values: Values passed to ``DataFrame.isin``. When ``None`` (the
            default) nothing is filtered and the result is returned as-is.

    Returns:
        A decorator that wraps a DataFrame-returning function.
    """
    def decorator(function):
        # wraps() keeps the decorated function's name and docstring intact.
        @functools.wraps(function)
        def wrapper(data, *args, **kwargs) -> pd.DataFrame:
            # Call the original function, forwarding any extra arguments.
            dataframe = function(data, *args, **kwargs)

            # No values configured: DataFrame.isin(None) would raise, so skip filtering.
            if values is None:
                return dataframe

            # Keep only the rows in which no cell matches the given values.
            return dataframe[~dataframe.isin(values).any(axis=1)]
        return wrapper
    return decorator


### Exploração de dados

In [8]:
# Class counts of the training target (0 = 'Fully Paid', 1 = 'Charged Off').
y_train.value_counts()

loan_status
0    1198226
1     290038
Name: count, dtype: int64

In [None]:
@remove_values(values=[0])
def missing_summary(dataframe):
    """Summarise the missing values of each column of ``dataframe``.

    Builds a DataFrame indexed by column name with two columns:
    ``missing_count`` (number of nulls) and ``missing_percentage`` (share of
    nulls on a 0-100 scale), sorted by descending percentage. The
    ``remove_values(values=[0])`` decorator then filters the sorted summary.
    """
    # Compute the null mask once and derive both statistics from it.
    null_mask = dataframe.isnull()

    summary = pd.DataFrame({
        "missing_count": null_mask.sum(),
        "missing_percentage": null_mask.mean() * 100,
    })
    return summary.sort_values(by="missing_percentage", ascending=False)

# missing_summary requires the frame to analyse; calling it with no argument raises TypeError.
# Summarise the training features and transpose so each feature becomes a column.
missing_summary(X_Train).T

Unnamed: 0,next_pymnt_d,hardship_end_date,hardship_loan_status,hardship_dpd,deferral_term,hardship_length,payment_plan_start_date,hardship_status,hardship_reason,hardship_type,...,title,hardship_flag,last_pymnt_d,revol_util,dti,pub_rec_bankruptcies,collections_12_mths_ex_med,chargeoff_within_12_mths,last_credit_pull_d,tax_liens
missing_count,1488264.0,1478093.0,1478101.0,1478093.0,1478093.0,1478093.0,1478093.0,1478093.0,1478092.0,1478093.0,...,17036.0,7485.0,2638.0,1116.0,882.0,559.0,47.0,47.0,48.0,32.0
missing_percentage,100.0,99.317,99.317,99.317,99.317,99.317,99.317,99.317,99.317,99.317,...,1.145,0.503,0.177,0.075,0.059,0.038,0.003,0.003,0.003,0.002


: 