Импорты

In [73]:
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

Открытие файлов

In [59]:
# Show every column and every row when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the raw training and validation tables.
df, df_valid = (
    pd.read_csv(csv_path, low_memory=False) for csv_path in ('train.csv', 'valid.csv')
)

Создание переменных

In [60]:
# Working copies of the training data: one for the full pipeline, one for the baseline.
df1, df_baseline = df.copy(), df.copy()

# Working copies of the validation data, split the same way.
df_valid1, df_valid_baseline = df_valid.copy(), df_valid.copy()

Сырая очистка датасетов для baseline-модели

In [None]:
# Columns to discard for the baseline: first those with more than 45% missing
# values, then those that hold a single distinct value.
miss = df_baseline.isna().mean()*100
dellist = [col_name for col_name, share in miss.items() if share > 45]
dellist.extend(df_baseline.loc[:, df_baseline.nunique() == 1].columns.tolist())

def baseclean(df: pd.DataFrame, dellist):
    """Reduce a raw frame to the numeric, zero-filled table used by the baseline model.

    Only numeric columns are kept; ``client_id`` and every column named in
    ``dellist`` are removed (names that are not present are skipped). The
    resulting shape is printed and remaining missing values become 0.

    Args:
        df: raw input frame; it is not modified.
        dellist: column names to remove.

    Returns:
        A new DataFrame with the surviving numeric columns and no missing values.
    """
    numeric = df.select_dtypes(include=np.number)
    # One boolean mask covers both the identifier and the blacklist.
    unwanted = numeric.columns.isin(['client_id', *dellist])
    numeric = numeric.loc[:, ~unwanted]
    print("Размер датасета после удаления колонок: ", numeric.shape)
    return numeric.fillna(0)

# Build the baseline train and validation tables with the same column blacklist.
df_train, df_val_base = (
    baseclean(raw_frame, dellist) for raw_frame in (df_baseline, df_valid_baseline)
)

Создание и обучение baseline-модели

In [None]:
# Separate the label from the features for the baseline train and validation tables.
y_train_base = df_train['target']
y_valid_base = df_val_base['target']
X_train_base = df_train.drop(columns='target')
X_valid_base = df_val_base.drop(columns='target')

# Hold out 20% of the training rows as an internal test set.
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(
    X_train_base,
    y_train_base,
    test_size=0.2,
    random_state=42,
)

# Fit the baseline random forest; `fit` returns the estimator itself.
rfc = RandomForestClassifier(random_state=42).fit(X_train_base, y_train_base)

# Positive-class probabilities on the internal test set and on validation.
y_proba_test_base = rfc.predict_proba(X_test_base)[:, 1]
y_proba_valid_base = rfc.predict_proba(X_valid_base)[:, 1]

roc_auc_test_base = roc_auc_score(y_test_base, y_proba_test_base)
print(roc_auc_test_base, "- rocauc test")

roc_auc_valid_base = roc_auc_score(y_valid_base, y_proba_valid_base)
print(roc_auc_valid_base, "- rocauc val")

Очистка train датасета

In [None]:
def clean_train(data):
    """Build one cleaned, numeric, imputed row per client from the raw training reports.

    Steps:
      1. Keep columns that are filled in more than 55% of the rows and drop
         columns with a single distinct value.
      2. Treat +/-inf and the marker value 101010 as missing.
      3. Collapse each client's reports into one "golden record": for every
         column, the value from the most recent report (ordered by
         ``report_date``) in which that column is present.
      4. Drop clients whose golden record is missing in 50% or more of the
         columns. Missingness is measured before numeric coercion, so text
         columns do not count as missing.
      5. Coerce every column to numeric (unparseable text becomes NaN) and
         store complete, whole-number columns as integers.
      6. Impute what is still missing: backfill for columns with at most three
         distinct values, the median when at most 10% is missing, otherwise
         ``KNNImputer``; a final median pass handles leftovers.

    Args:
        data: raw training frame. ``client_id`` and ``report_date`` must be
            present and survive the column filters of step 1.

    Returns:
        A new DataFrame with one row per remaining client, ordered by
        ``client_id``, with columns in the same order as the filtered input.
    """
    missing_markers = [np.inf, -np.inf, 101010]

    data = data.loc[:, data.notna().sum() > data.shape[0] * 0.55]
    print("Размер датасета после создания trashhold-а: ", data.shape)
    data = data.loc[:, data.nunique() != 1]
    print("Размер датасета после удаления одинаковых столбцов: ", data.shape)
    data = data.replace(missing_markers, np.nan)

    # Golden record per client. GroupBy.last() returns the last non-null value
    # of every column, so gaps in the newest report are filled from the
    # client's earlier reports. Undated rows are sorted first (treated as the
    # oldest) and the stable sort keeps ties in their original order.
    # NOTE(review): report_date is ordered as stored; if it is text in a
    # non-ISO format the chronological order may be wrong - confirm.
    column_order = list(data.columns)
    data = (
        data.sort_values(by='report_date', kind='mergesort', na_position='first')
        .groupby('client_id')
        .last()
        .reset_index()[column_order]
    )

    # Drop sparse clients while NaN still means "missing in the source data".
    data = data[data.isna().mean(axis=1) < 0.50].reset_index(drop=True)

    # Everything downstream is numeric; text that cannot be parsed becomes NaN.
    data = data.apply(pd.to_numeric, errors='coerce')
    data = data.replace(missing_markers, np.nan)

    # Complete columns that only hold whole numbers are stored as integers.
    for col in data.select_dtypes(include=np.number).columns:
        values = data[col]
        if values.notna().all() and (values % 1 == 0).all():
            data[col] = values.astype(int)

    for col in data.select_dtypes(include=[np.number]).columns:
        if not data[col].isna().any():
            continue  # nothing to impute
        if data[col].nunique() <= 3:
            # Flag-like column: take the next available value.
            data[col] = data[col].bfill()
        elif data[col].isna().mean() <= 0.1:
            data[col] = data[col].fillna(data[col].median())
        else:
            # NOTE(review): KNNImputer fitted on a single column has no other
            # feature to measure distances with, so per the scikit-learn docs
            # it falls back to the column mean - confirm whether a
            # multi-column KNN imputation was intended.
            data[col] = KNNImputer(n_neighbors=5).fit_transform(data[[col]])[:, 0]

    # Backfill can leave trailing gaps; close them with the column median.
    # Columns without any observed value stay NaN.
    for col in data.columns[data.isna().any()]:
        data[col] = data[col].fillna(data[col].median())

    return data

# Clean the training copy and report the shape and column summary of the result.
df_gold = clean_train(df1)
print("Информация о датасете: \n")
print(df_gold.shape, "\n")
# info() prints its report itself and returns None, so it must not be wrapped
# in print() - that would add a stray "None" line to the output.
df_gold.info()

In [None]:
# Preview the first ten rows of the cleaned training table.
df_gold.head(10)

Очистка valid датасета

In [None]:
def find_and_remove_missing_columns(df_train, df_valid):
    """Prepare the validation frame so that it lines up with the cleaned training frame.

    Despite its name the function does more than remove columns:
      1. drops every validation column that is absent from ``df_train``;
      2. keeps one row per ``client_id`` holding, for each column, the last
         non-null value after sorting by ``report_date``;
      3. draws a missingness bar chart of that table;
      4. treats the marker value 101010 as missing;
      5. drops rows missing in 50% or more of the columns;
      6. fills the remaining gaps of each column with that column's first
         non-null value;
      7. returns the numeric columns only.

    Args:
        df_train: cleaned training frame; only its column names are used.
        df_valid: raw validation frame with ``client_id`` and ``report_date``.

    Returns:
        A DataFrame of numeric columns with one row per remaining client.
    """
    train_columns = set(df_train.columns)
    extra_columns = [col for col in df_valid.columns if col not in train_columns]

    df_valid_cleaned = df_valid.drop(columns=extra_columns)
    df_valid_cleaned = df_valid_cleaned.sort_values(by='report_date').groupby('client_id').last().reset_index()
    print("Размер датасета после удаления лишних столбцов:", df_valid_cleaned.shape)

    msno.bar(df_valid_cleaned,color=(0.59,0.98,0.59), figsize=[15,25], fontsize=6)
    plt.xticks(np.arange(0,1.0,0.05))
    plt.grid(True,linestyle="-",alpha=1)
    plt.show()  # must be called; a bare `plt.show` does nothing

    df_valid_cleaned = df_valid_cleaned.replace(101010, np.nan)

    # Sparse rows have to be removed BEFORE imputation: once the gaps are
    # filled no row can reach the 50% threshold and the filter would be dead.
    # copy() lets the column assignments below work on an independent frame.
    df_valid_cleaned = df_valid_cleaned[df_valid_cleaned.isnull().mean(axis=1) < 0.50].copy()

    # NOTE(review): every gap receives the column's first observed value,
    # which depends on row order and differs from the median-based imputation
    # of the training data - confirm this is intended.
    for col in df_valid_cleaned.columns[df_valid_cleaned.isna().any()]:
        observed = df_valid_cleaned[col].dropna()
        if not observed.empty:
            df_valid_cleaned[col] = df_valid_cleaned[col].fillna(observed.iloc[0])
    print("Соотношение типов после обработки:", df_valid_cleaned.dtypes)
    print("Размер df_valid после удаления строк с >50% пропусков:", df_valid_cleaned.shape)

    return df_valid_cleaned.select_dtypes(include=np.number)

# Build the validation table restricted to the columns of the cleaned training table.
df_valid_gold = find_and_remove_missing_columns(df_gold, df_valid1)

Удаление последних лишних колонок

In [None]:
# Remove the last two columns that must not reach the model. errors='ignore'
# keeps the cell re-runnable: df_gold is reassigned from itself, so a second
# run would otherwise raise KeyError because the columns are already gone.
df_gold = df_gold.drop(columns=['col1454', 'report_date'], errors='ignore')
df_valid_gold.shape

In [None]:
# Preview the first ten rows of the prepared validation table.
df_valid_gold.head(10)

Вывод графиков пропусков

In [None]:
# Bar chart of per-column completeness for the cleaned training table.
msno.bar(df_gold,color=(0.59,0.98,0.59), figsize=[15,25], fontsize=6)
plt.xticks(np.arange(0,1.0,0.05))
plt.grid(True,linestyle="-",alpha=1)
# Call show(): a bare `plt.show` only echoes the function object as cell output.
plt.show()

In [None]:
# Bar chart of per-column completeness for the prepared validation table.
msno.bar(df_valid_gold,color=(0.59,0.98,0.59), figsize=[15,25], fontsize=6)
plt.xticks(np.arange(0,1.0,0.05))
plt.grid(True,linestyle="-",alpha=1)
# Call show(): a bare `plt.show` only echoes the function object as cell output.
plt.show()

In [None]:
# Print the column summary (dtypes and non-null counts) of both prepared tables.
df_gold.info()
df_valid_gold.info()

Обучение модели

In [None]:
# Feature matrices: numeric columns without the client identifier and the label.
X_train = (df_gold.select_dtypes(include=np.number)).drop(['client_id', 'target'], axis=1)
y_train = df_gold['target']

X_valid = (df_valid_gold.select_dtypes(include=np.number)).drop(['client_id', 'target'], axis=1)
y_valid = df_valid_gold['target']

# Give validation exactly the training feature set, in the same order;
# scikit-learn rejects frames whose feature names differ from those seen in fit.
X_valid = X_valid[X_train.columns]

# Hold out 20% of the training clients as an internal test set.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

rnd_frs = RandomForestClassifier(random_state=384)
rnd_frs.fit(X_train, y_train)

y_predict = rnd_frs.predict(X_valid)
print(classification_report(y_valid, y_predict))

y_proba_test = rnd_frs.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_proba_test)
print(roc_auc_test, "rocauc test")

# Score validation with the model trained in this cell. The previous code
# called the baseline model `rfc` here, which was fitted on a different
# feature set, so the reported validation ROC-AUC did not belong to `rnd_frs`.
y_proba_valid = rnd_frs.predict_proba(X_valid)[:, 1]
roc_auc_valid = roc_auc_score(y_valid, y_proba_valid)
print(roc_auc_valid, "rocauc val")

Построение ROC кривой

In [None]:
# ROC curve of the validation predictions, drawn on the current Axes.
fpr, tpr, _ = roc_curve(y_valid, y_proba_valid)

roc_axes = plt.gca()
roc_axes.plot(fpr, tpr)
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_xlabel('False Positive Rate')
plt.show()