In [443]:
import pandas as pd

# Relative path of the raw credit-risk dataset.
chemin_donnees_brutes = '../data/raw/credit_risk_dataset.csv'
df = pd.read_csv(chemin_donnees_brutes)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
6,26,77100,RENT,8.0,EDUCATION,B,35000,12.42,1,0.45,N,3
7,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,1,0.44,N,4
8,24,83000,RENT,8.0,PERSONAL,A,35000,8.9,1,0.42,N,2
9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,0.16,N,3


# Outliers - Suppression des valeurs aberrantes

In [445]:
# Supported strategies for handling missing values, keyed by the string the
# user types at the prompt.
options_remplacement = {"1": "Moyenne", "2": "Mode", "3": "Supprimer"}

# Default strategy ("Supprimer"). It is stored as a string because the code
# that consumes mode_remplacement compares it against '1', '2' and '3'.
mode_remplacement = "3"
while True:
    print("Méthode de substitution des valeurs manquantes :")
    for cle, libelle in options_remplacement.items():
        print(f"{cle}. {libelle}")

    # Surrounding whitespace is ignored so that an answer such as " 3" is accepted.
    reponse = input("Choisissez une option (1, 2 ou 3) : ").strip()
    if reponse in options_remplacement:
        # Only a validated answer is stored.
        mode_remplacement = reponse
        print(f"Vous avez choisi : {options_remplacement[reponse]}")
        break
    print("Option invalide, veuillez réessayer.")

Méthode de substitution des valeurs manquantes :
1. Moyenne
2. Mode
3. Supprimer


Choisissez une option (1, 2 ou 3) :  3


Vous avez choisi : Supprimer


In [446]:
import numpy as np
import pandas as pd

def valeurs_aberrantes(col, data=None):
    """Print the mean of a column and the range of its IQR outliers.

    A value is an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where the quartiles are computed
    on the non-null values of the column.

    Parameters
    ----------
    col : str
        Name of the column to analyse.
    data : pandas.DataFrame, optional
        Frame holding the column. When omitted, the notebook-level ``df``
        is used.

    Returns
    -------
    None
        The results are printed, not returned.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a non-null value cannot be
        converted to a number.
    """
    if data is None:
        data = df

    print(f'\nChamp: {col}')

    # Drop nulls and convert to numeric once; every later step reuses this series.
    valeurs = pd.to_numeric(data[col].dropna())
    moyenne = valeurs.mean()

    # Quartiles and interquartile range.
    q1 = valeurs.quantile(0.25)
    q3 = valeurs.quantile(0.75)
    iqr = q3 - q1

    # Bounds outside which a value is considered an outlier.
    limite_inferieure = q1 - 1.5 * iqr
    limite_superieure = q3 + 1.5 * iqr

    hors_limites = valeurs[(valeurs < limite_inferieure) | (valeurs > limite_superieure)]

    print(f'Valeur moyenne: {moyenne}')
    if hors_limites.empty:
        # min()/max() of an empty selection would raise; report it instead.
        print('Aucune valeur aberrante')
        return
    print(f'Min valeur aberrante: {hors_limites.min()}')
    print(f'Max valeur aberrante: {hors_limites.max()}')
    

In [447]:
# Print the outlier report for each of the three columns inspected.
for champ in ('person_age', 'person_emp_length', 'cb_person_cred_hist_length'):
    valeurs_aberrantes(champ)



Champ: person_age
Valeur moyenne: 27.73459992019889
Min valeur aberrante: 41
Max valeur aberrante: 144

Champ: person_emp_length
Valeur moyenne: 4.789686296787225
Min valeur aberrante: 15.0
Max valeur aberrante: 123.0

Champ: cb_person_cred_hist_length
Valeur moyenne: 5.804211043246064
Min valeur aberrante: 16
Max valeur aberrante: 30


Nous pouvons constater que les champs person_age et person_emp_length présentent des valeurs aberrantes (maximums respectifs de 144 et 123).
Remplaçons ces valeurs par des valeurs manquantes (person_age ≥ 80, person_emp_length ≥ 60).

In [449]:
# Per-column threshold: values at or above it are replaced by a missing value.
seuils_aberrants = {'person_age': 80, 'person_emp_length': 60}
for champ, seuil in seuils_aberrants.items():
    df.loc[df[champ] >= seuil, champ] = None

In [450]:
def traitement_na(champ, df, methode=None):
    """Handle the missing values of one column and return the resulting frame.

    Parameters
    ----------
    champ : str
        Name of the column to process.
    df : pandas.DataFrame
        Frame containing the column. It is not modified; a new frame is
        returned, so callers must use the return value.
    methode : str, optional
        ``'1'`` fills with the column mean, ``'2'`` fills with the column
        mode (the smallest one when several values tie), ``'3'`` drops the
        rows where the column is missing. When omitted, the notebook-level
        ``mode_remplacement`` is used.

    Returns
    -------
    pandas.DataFrame
        The processed frame. Any other ``methode`` value returns ``df``
        unchanged.
    """
    if methode is None:
        methode = mode_remplacement
    methode = str(methode)

    if methode == '1':
        # Assign the filled column back: a chained ``fillna(inplace=True)``
        # acts on a temporary object and is not reliable.
        df = df.assign(**{champ: df[champ].fillna(df[champ].mean())})
    elif methode == '2':
        modes = df[champ].mode()
        # An all-missing column has no mode; leave it untouched in that case.
        if not modes.empty:
            df = df.assign(**{champ: df[champ].fillna(modes.iloc[0])})
    elif methode == '3':
        num_lignes_avant = len(df)
        df = df[df[champ].notna()]
        print(f'Champ: {champ} - Lignes supprimées {num_lignes_avant - len(df)}')
    return df

# Apply the selected missing-value strategy to each column in turn.
for champ in ('person_age', 'person_emp_length', 'loan_int_rate'):
    df = traitement_na(champ, df)

Champ: person_age - Lignes supprimées 8
Champ: person_emp_length - Lignes supprimées 897
Champ: loan_int_rate - Lignes supprimées 3046


# Encodage

## One Hot Encoder - loan_intent

In [453]:

from sklearn.preprocessing import OneHotEncoder

# Encoder instance; the person_home_ownership cell re-fits this same object.
encoder = OneHotEncoder()

# Fit on loan_intent and convert the encoder output to a dense array.
colonne_source = ['loan_intent']
encoded_loan_intent = encoder.fit_transform(df[colonne_source]).toarray()

# Wrap the array in a frame aligned on df's index, one column per category.
noms_colonnes = encoder.get_feature_names_out(colonne_source)
encoded_df = pd.DataFrame(data=encoded_loan_intent, index=df.index, columns=noms_colonnes)

# Append the indicator columns to the right of the existing ones.
df = pd.concat(objs=[df, encoded_df], axis='columns')

## One Hot Encoder - person_home_ownership

In [455]:
# Re-fit the existing encoder on person_home_ownership and densify its output.
colonne_source = ['person_home_ownership']
encoded_person_home_ownership = encoder.fit_transform(df[colonne_source]).toarray()

# Wrap the array in a frame aligned on df's index, one column per category.
noms_colonnes = encoder.get_feature_names_out(colonne_source)
encoded_df = pd.DataFrame(data=encoded_person_home_ownership, index=df.index, columns=noms_colonnes)

# Append the indicator columns to the right of the existing ones.
df = pd.concat(objs=[df, encoded_df], axis='columns')

## OrdinalEncoder - loan_grade_encoded

In [457]:
from sklearn.preprocessing import OrdinalEncoder

# Distinct grade labels, sorted; their position defines the ordinal code.
grades_distincts = df['loan_grade'].unique()
ordre = np.sort(grades_distincts)

# Ordinal encoder restricted to that explicit category order.
oe = OrdinalEncoder(categories=[ordre])

# Fit, then encode the column into loan_grade_encoded.
oe.fit(df[['loan_grade']])
df['loan_grade_encoded'] = oe.transform(df[['loan_grade']])

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,...,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_grade_encoded
1,21.0,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,25.0,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
3,23.0,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
4,24.0,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
5,21.0,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57.0,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0
32577,54.0,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
32578,65.0,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
32579,56.0,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [458]:
from sklearn.preprocessing import LabelEncoder

# Label encoder for the cb_person_default_on_file flag.
le = LabelEncoder()

# Learn the labels, then store their integer codes in a new column.
le.fit(df['cb_person_default_on_file'])
df['cb_person_default_on_file_encoded'] = le.transform(df['cb_person_default_on_file'])

## Scaler 

In [460]:
# Numeric columns passed to the scaler. Each column appears exactly once:
# the duplicated "loan_int_rate" entry has been removed.
numeric_features = [
    "person_age",
    "person_income",
    "person_emp_length",
    "loan_int_rate",
    "loan_amnt",
    "loan_percent_income",
    "cb_person_cred_hist_length",
]

# Categorical columns. Each column appears exactly once: the duplicated
# "cb_person_default_on_file" entry has been removed.
categorical_features = [
    "person_home_ownership",
    "loan_grade",
    "loan_intent",
    "cb_person_default_on_file",
]

In [461]:
# TODO: check RobustScaler for the outliers.
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the numeric columns, then overwrite them with the scaled values.
scaler = MinMaxScaler()
scaler.fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

In [474]:
# Relative path of the processed dataset; the index is not written to the file.
chemin_donnees_traitees = '../data/processed/dados.csv'
df.to_csv(chemin_donnees_traitees, index=False)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,...,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_grade_encoded,cb_person_default_on_file_encoded
1,0.017241,0.002751,OWN,0.121951,EDUCATION,B,0.014493,0.321348,0,0.120482,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
2,0.086207,0.002751,MORTGAGE,0.024390,MEDICAL,C,0.144928,0.418539,1,0.686747,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0
3,0.051724,0.030209,RENT,0.097561,MEDICAL,C,1.000000,0.551124,1,0.638554,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0
4,0.068966,0.024757,RENT,0.195122,MEDICAL,C,1.000000,0.497191,1,0.662651,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1
5,0.017241,0.002898,OWN,0.048780,VENTURE,A,0.057971,0.096629,1,0.301205,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,0.637931,0.024069,MORTGAGE,0.024390,PERSONAL,C,0.153623,0.434831,0,0.132530,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0
32577,0.586207,0.056981,MORTGAGE,0.097561,PERSONAL,A,0.496377,0.116292,0,0.180723,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
32578,0.775862,0.035367,RENT,0.073171,HOMEIMPROVEMENT,B,1.000000,0.312921,1,0.554217,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
32579,0.620690,0.071717,MORTGAGE,0.121951,PERSONAL,B,0.420290,0.340449,0,0.120482,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0
