# Partie 0 - Constantes, imports et outils

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
from requests import get, Response
from hashlib import sha256
from tqdm.notebook import tqdm
from zipfile import ZipFile
from IPython.display import display, Markdown
import pandas as pd
import missingno as msno
import seaborn as sns

from deliverables.utils.image_inverter import save

_cache_folder = Path('~/.cache/gn_p7').expanduser()
_cache_folder.mkdir(parents=True, exist_ok=True)

_ds_url = 'https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/Parcours_data_scientist/Projet+-+Impl%C3%A9menter+un+mod%C3%A8le+de+scoring/Projet+Mise+en+prod+-+home-credit-default-risk.zip'

graph_folder: Path = Path("./graphs")


def save_figure(figure: plt.Figure, folder: str, figure_name: str) -> None:
    """Write ``figure`` to ``<graph_folder>/<folder>/<figure_name>.png``.

    The destination directory is created when it does not exist yet. The
    actual writing is delegated to the imported ``save`` helper, which is
    called with ``close=True``.
    """
    target_dir = graph_folder / folder
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f'{figure_name}.png'
    save(figure, destination, close=True)


def download(url: str) -> Path:
    """Download ``url`` into the local cache and return the cached file path.

    The cache entry is named after the SHA-256 hex digest of the URL. When
    that file already exists no request is made. Otherwise the body is
    streamed into a ``.tmp`` sibling which is renamed onto the final path only
    once the transfer has finished, so an interrupted download never leaves a
    file at the final path (a partial ``.tmp`` is overwritten by the next
    attempt).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url_id: str = sha256(url.encode('utf-8')).hexdigest()
    local_path: Path = _cache_folder / url_id
    local_path.parent.mkdir(parents=True, exist_ok=True)
    if local_path.exists():
        return local_path

    tmp_path: Path = _cache_folder / (url_id + '.tmp')
    # The context manager releases the connection even when the transfer fails.
    # timeout: neither connecting nor waiting for the next bytes may block forever.
    with get(url, stream=True, timeout=60) as res:
        # Without this check an HTTP error page would be cached as if it were the file.
        res.raise_for_status()

        # Content-Length is optional (e.g. chunked responses); tqdm accepts total=None
        # and then shows a plain counter instead of a percentage.
        size_header = res.headers.get('content-length')
        total = int(size_header) if size_header and size_header.isdigit() else None

        with tmp_path.open('wb') as f, tqdm(
                total=total,
                desc=f'Downloading {url}',
                unit_scale=True) as q:
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)
                q.update(len(chunk))

    tmp_path.replace(local_path)
    return local_path


def download_zip_archive(url: str) -> Path:
    """Download a zip archive, extract it, then return the folder holding its content.

    The archive is fetched through ``download``. Extraction happens only when
    the ``<archive>.dir`` folder does not exist yet: files are unpacked into a
    ``<archive>.tmp`` directory which is renamed to ``<archive>.dir`` after
    extraction, so the ``.dir`` folder only appears once fully populated.
    """
    archive_path: Path = download(url)
    archive_folder: Path = Path(archive_path.as_posix() + '.dir')

    if not archive_folder.exists():
        print(f'Extracting archive {url}...', flush=True)
        archive_temp: Path = Path(archive_path.as_posix() + '.tmp')
        archive_temp.mkdir(parents=True, exist_ok=True)
        # Close the archive deterministically instead of relying on garbage
        # collection; an open handle would otherwise linger if extraction fails.
        with ZipFile(archive_path) as archive:
            archive.extractall(path=archive_temp)
        archive_temp.replace(archive_folder)
        print(f'Extracting archive {url}...done', flush=True)

    return archive_folder


# In-memory cache of the CSV files already parsed, keyed by file name (including '.csv').
datasets: dict[str, pd.DataFrame] = {}


def get_dataset(name: str) -> pd.DataFrame:
    """Return a private copy of the dataset ``name`` from the project archive.

    ``name`` may be given with or without the ``.csv`` extension. The parsed
    frame is kept in ``datasets``; every call, cached or not, returns a copy
    so that callers can modify their frame without altering the cache. The
    archive is only looked up when the dataset is not cached yet.

    Raises:
        KeyError: if the archive folder holds no such CSV file. The available
            files are listed through a Markdown display before raising.
    """
    if not name.endswith('.csv'):
        name = f'{name}.csv'

    if name not in datasets:
        folder = download_zip_archive(_ds_url)
        try:
            datasets[name] = pd.read_csv(folder / name)
        except FileNotFoundError:
            display(Markdown(f'# ERROR: Dataset {name!r} not found, available datasets are:\n' + '\n'.join(
                f'- {p.name}' for p in sorted(folder.iterdir(), key=(lambda x: x.name.lower())))))
            raise KeyError(name) from None

    # Previously only the first call returned a copy; later calls handed out the
    # cached object itself, so an in-place edit by one caller leaked to all others.
    return datasets[name].copy()


# Partie 1 - EDA

## Partie 1.1 - Chargement des données

In [2]:
# Load the two application tables (training fold first, then test fold).
train, test = get_dataset('application_train'), get_dataset('application_test')

In [3]:
# Preview of the training fold; the rendering abbreviates the middle rows and columns.
train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preview of the test fold; the rendering abbreviates the middle rows and columns.
test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


## Partie 1.2 - Analyse de la cible

La cible est présente dans le dataset d'entrainement mais pas dans le dataset de test, pour éviter les fuites de données.

In [5]:
# Pie chart of the TARGET class counts, saved under the '1_model' figure folder.
save_figure(
    figure=train['TARGET'].value_counts().plot.pie(
        title='Répartition des cibles (0=paiement complet, 1=retards de paiement)',
    ).figure,
    folder='1_model',
    figure_name='0_target',
)

## Partie 1.3 - Analyse des features (hors cible)

In [6]:
def missing_values_table(df):
    """Summarise the missing values of ``df``, one row per incomplete column.

    Prints the total number of columns of ``df`` and how many of them contain
    missing values, then returns a DataFrame indexed by column name with:

    * ``'Missing Values'``: number of null cells in the column;
    * ``'% of Total Values'``: share of null cells in percent, rounded to one
      decimal.

    Columns without any missing value are left out, and the remaining rows are
    ordered from the most to the least incomplete.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # keys= names the two columns directly instead of renaming 0 and 1 afterwards.
    summary = pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    has_missing = summary['% of Total Values'] != 0
    summary = (
        summary[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {summary.shape[0]} columns that have missing values.")

    return summary


def missing_stats():
    """Export three families of missing-value figures for the train and test folds.

    All figures are saved through ``save_figure`` under ``'1_model'``. For each
    family, the train fold is processed before the test fold:

    1. the missingno matrix with the columns in their original order;
    2. the same matrix with the columns ordered by ascending non-null count;
    3. a bar chart of the percentage of missing values per incomplete column
       (``missing_values_table`` prints its summary at that point).
    """
    folds = ('train', 'test')

    def load(fold):
        return get_dataset(f'application_{fold}.csv')

    # 1) Matrix with the columns as they come.
    for fold in folds:
        label = fold.title()
        msno.matrix(load(fold), fontsize=12)
        plt.title(f'Missing Values Count ({label}ing fold)', fontsize=16)
        save_figure(plt.gcf(), '1_model', f'1_missing_{label}')

    # 2) Matrix with the emptiest columns first.
    for fold in folds:
        label = fold.title()
        frame = load(fold)
        filled = frame.notna().sum()
        ordered = sorted(frame.columns, key=lambda col: int(filled[col]))
        msno.matrix(frame[ordered], fontsize=12)
        plt.title(f'Missing Values Count ({label}ing fold)', fontsize=16)
        save_figure(plt.gcf(), '1_model', f'2_sorted_missing_{label}')

    # 3) Horizontal bars: one bar per column that has missing values.
    for fold in folds:
        label = fold.title()
        missing_table = missing_values_table(load(fold))
        percentages = missing_table['% of Total Values']

        # TODO: Set the plot style for dark mode when exporting to png
        plt.figure(figsize=(16, 12))  # There are a lot of columns
        sns.barplot(x=percentages, y=missing_table.index)
        plt.title(f'Percentage of Missing Values by Feature ({label}ing fold)', fontsize=16)
        plt.xlabel('% of Total Values', fontsize=12)
        plt.ylabel('Features', fontsize=12)

        # Write the percentage next to the end of each bar.
        for row, pct in enumerate(percentages):
            plt.text(pct, row, f' {pct}%', va='center')

        plt.xlim(0, 110)  # Leave room on the right for the text
        plt.tight_layout()
        # NOTE(review): unlike the two names above there is no '_' before the fold
        # label; kept as-is so the exported file names do not change.
        save_figure(plt.gcf(), '1_model', f'3_graph_missing{label}')


# Generate and save every missing-value figure; the per-fold summaries are printed as output.
missing_stats()

Your selected dataframe has 122 columns.
There are 67 columns that have missing values.
Your selected dataframe has 121 columns.
There are 64 columns that have missing values.


Nous pouvons voir qu'à peu près la moitié des colonnes ont au moins une valeur manquante, et que le reste est défini à environ 45-75 %.
Si nous nous intéressons

In [7]:
# Guard for the statements displayed below: every column must be int64, float64 or object.
assert len(train.columns[~(
    (train.dtypes == 'int64')
    | (train.dtypes == 'float64')
    | (train.dtypes == 'object')
)]) == 0, 'Plus de types de colonnes sont présentes'
display(
    Markdown('Il existe trois types de données en entrée, int64 et float64, numériques, et object, catégorielles'),
    Markdown('Il arrive parfois que des données numériques soient accidentellement catégorisées en "object" si elles'
             ' contiennent des valeurs non numérique, ce n\'est pas le cas ici'),
)

Il existe trois types de données en entrée, int64 et float64, numériques, et object, catégorielles

Il arrive parfois que des données numériques soient accidentellement catégorisées en "object" si elles contiennent des valeurs non numérique, ce n'est pas le cas ici

## Partie 1.4 - Définition d'un pipeline de prétraitement

L'avantage du pipeline de prétraitement est sa robustesse contre le data leakage : en effet, les modèles d'apprentissage et de prétraitement seront entraînés sur les mêmes données, ce qui est indispensable pour pouvoir utiliser des techniques de K-Fold en s'assurant que les folds soient indépendants les uns des autres.