In [20]:
from cleanlab.classification import CleanLearning
from sklearn.ensemble import ExtraTreesClassifier
import pandas as pd


# Helper components for filling in the pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler

# Helper building blocks for assembling the pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn import set_config
# Ask scikit-learn transformers to return pandas output; the later
# `X_prep.columns` / `X_prep.rename(...)` calls rely on X_prep being a DataFrame.
set_config(transform_output="pandas")

In [21]:
# Both the tree ensemble and CleanLearning's cross-validation fold split are
# randomized. Pin them to one seed so that the flagged label issues are
# reproducible after "Restart Kernel -> Run All".
SEED = 42
cl = CleanLearning(ExtraTreesClassifier(random_state=SEED), seed=SEED)
cl

In [22]:
# Identifier columns that are not part of either feature list used below.
id_columns = ['id', 'CustomerId', 'Surname']

# Load the training table and discard the identifier columns.
train = pd.read_csv('../data/train.csv').drop(columns=id_columns)

In [23]:
# Preview the loaded frame; the rendered output is truncated to the first and last rows.
train

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165029,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [24]:
# Columns routed to the "numerical" branch of the preprocessor.
numerical_features = [
    'CreditScore',
    'Age',
    'Balance',
    'EstimatedSalary',
]

# Columns routed to the "categorical" branch of the preprocessor.
# Note that the low-cardinality numeric columns are treated as categories too.
categorical_features = [
    'Geography',
    'Gender',
    'Tenure',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
]

In [27]:
# Numerical branch: impute with SimpleImputer's default strategy, then standardize.
numerical_steps = [
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical branch: impute with the most frequent value, then one-hot encode.
# The encoder emits dense output, drops the first level of each feature and
# ignores categories it did not see during fit.
onehot_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first',
)
categorical_steps = [
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", onehot_encoder),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each feature list to its branch; the branch names become the
# "<name>__" prefixes of the output columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_transformer, numerical_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

preprocessor

In [36]:
# Fit the preprocessor on the training frame and transform it in one pass.
X_prep = preprocessor.fit_transform(train)

# The output columns carry a "<transformer name>__" prefix. Strip only that
# leading prefix: splitting at most once keeps any later "__" inside a feature
# name intact, and taking the last piece leaves an un-prefixed name unchanged
# instead of raising IndexError.
X = X_prep.rename(columns=lambda column: column.split('__', 1)[-1])

In [38]:
# Run label-issue detection on the preprocessed features against the given
# 'Exited' labels; the result is a per-row table (displayed in the next cell).
label_issues = cl.find_label_issues(
    X=X,
    labels=train['Exited'],
)

In [41]:
# Show the per-row result: is_label_issue flag, label_quality score,
# given_label and predicted_label (see the rendered columns below).
label_issues

Unnamed: 0,is_label_issue,label_quality,given_label,predicted_label
0,False,1.00,0,0
1,False,1.00,0,0
2,False,0.94,0,0
3,False,0.98,0,0
4,False,1.00,0,0
...,...,...,...,...
165029,False,0.86,0,0
165030,False,0.89,0,0
165031,False,0.95,0,0
165032,False,0.88,0,0
