# Imports

## Libraries

In [2]:
import pandas as pd
import numpy as np
import time
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, ClassificationPreset
from evidently import ColumnMapping

from sklearn.metrics import accuracy_score

## Data

In [3]:
# Load the cleaned sample; the file is semicolon-delimited.
df = pd.read_csv("../data/cleaned/df_sample.csv", sep=";")

# Seeded 70% random draw used as the reference set ...
app_train = df.sample(frac=0.7, random_state=42)
# ... and every row that was not drawn becomes the comparison set.
app_test = df.drop(index=app_train.index)

# Data-Drift

In [4]:
# Remove the SK_ID_CURR and TARGET columns from both splits so that only the
# remaining columns take part in the drift analysis.
app_train, app_test = (
    frame.drop(columns=["SK_ID_CURR", "TARGET"])
    for frame in (app_train, app_test)
)

In [5]:
# Columns that must never be treated as features.
ignored_cols = ["SK_ID_CURR", "TARGET"]


def _is_binary_flag(series):
    """Return True when the non-missing values of *series* are all 0 or 1."""
    # Series.unique() keeps NaN as a value, so missing entries are dropped
    # first; otherwise a 0/1 flag with a single gap would fail the subset test
    # and end up in the numerical group.
    observed = set(series.dropna().unique())
    # An entirely missing column has no observed values: keep it out of the
    # categorical group rather than letting the empty set pass the test.
    return bool(observed) and observed <= {0, 1}


# Binary 0/1 indicator columns are handled as categorical features.
categorical_columns = [
    col for col in app_train.columns
    if _is_binary_flag(app_train[col])
]

# Build the exclusion set once instead of concatenating two lists per column.
_non_numerical = set(categorical_columns) | set(ignored_cols)

# Everything else is treated as numerical.
# NOTE(review): assumes every remaining column has a numeric dtype — confirm
# against the cleaned dataset.
numerical_columns = [
    col for col in app_train.columns
    if col not in _non_numerical
]

In [6]:
# Tell Evidently explicitly which columns are categorical and which are
# numerical, using the two lists built above.
column_mapping = ColumnMapping(
    numerical_features=numerical_columns,
    categorical_features=categorical_columns,
)

In [7]:
# Drift preset: "ks" test for numerical features, "psi" for categorical ones.
# NOTE(review): the two 0.2 thresholds are probably not comparable — for "ks"
# Evidently appears to compare a p-value against the threshold, for "psi" the
# PSI score itself. Confirm that 0.2 is the intended value for both.
drift_preset = DataDriftPreset(
    num_stattest="ks",
    num_stattest_threshold=0.2,
    cat_stattest="psi",
    cat_stattest_threshold=0.2,
)
report = Report(metrics=[drift_preset])

# The training split is the reference, the held-out split is the current data.
report.run(
    reference_data=app_train,
    current_data=app_test,
    column_mapping=column_mapping,
)

# Write the report as an HTML file in the current working directory.
report.save_html("data_drift_report.html")