# Курс Спортивный анализ данных. Платформа Kaggle

# Курсовой проект. Model

## Подключение библиотек и скриптов

In [57]:
import time
import numpy as np
import pandas as pd
import catboost as cb
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from scipy.stats import gmean

from pathlib import Path

pd.set_option('display.max_columns', None)

In [58]:
def get_input(dataset_path: str) -> pd.DataFrame:
    """
    Считывание данных и вывод основной информации о наборе данных.
    
    Parametrs
    ---------
    dataset_path: str
        Название файла
        
    Returns
    -------
    data: pandas.core.frame.DataFrame
        Загруженный набор данных в pandas.DataFrame
    """
    
    data_root = Path('D:/DS_materials/208_kaggle/data_comp/')
    dataset = pd.read_csv(f'{data_root}/{dataset_path}')
    dataset.columns = [col.lower() for col in dataset.columns]
    print(f"{dataset_path} shape: {dataset.shape[0]} rows, {dataset.shape[1]} cols")

    return dataset

In [60]:
def xgboost_cross_validation(params, fit_params, X, y, cv):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признаков для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            **fit_params
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [63]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе профиля клиентов.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходным профилем клиента.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с профилем клиентов.

    """
    if copy:
        X = X.copy()
    """
    # Create binary features to check if the example is has missing values for all features that have missing values
    for feature in X.columns:
        if np.any(np.isnan(X[feature])):
            df["is_" + feature + "_missing"] = np.isnan(X[feature]) * 1
    """
    
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3

    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = eval("np.{}".format(function_name))(
            X[["external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"]], axis=1
            )


    # Отношение между основными фин. показателями
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']

    X["ratio_salary_to_per_family_size"] = X["total_salary"] / X["family_size"]

    # Отношение фин. показателей к возрасту и временным фичам
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Отношение временных признаков
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Произведение фин. показателей кредита на вероятность дефолта
    # Такая штука называется математическим ожиданием дефолта или ожидаемыми потерями
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    """
    # Сделать конкатенацию признаков, рассматривая их как категориальные
    features_1 = ['childrens', 'family_size']
    for feature in features_1:
        X[feature] = X[feature].astype('str')
    
    X["gender_childrens"] = X['gender'] + " | " + X['childrens']
    X["gender_family_status"] = X['gender'] + " | " + X['family_status']    
    X["gender_family_size"] = X['gender'] + " | " + X['family_size'] 
    X["gender_childrens_family_status"] = X['gender_childrens'] + " | " + X['family_status']  
    X["gender_childrens_family_size"] = X['gender_childrens'] + " | " + X['family_size']   
    X["gender_family_status_family_size"] = X['gender_family_status'] + " | " + X['family_size']       
    X["family_status_family_size"] = X['family_status'] + " | " + X['family_size']  
    X["childrens_family_size"] = X['childrens'] + " | " + X['family_size']
    X["childrens_family_status"] = X['childrens'] + " | " + X['family_status']  
    X["childrens_family_status_family_size"] = X['childrens'] + " | " + X['family_status_family_size']  
    """
    
    # Сделать FrequencyEncoder для категориальных признаков
    features = ['gender', 'childrens', 'family_status', 'family_size']   
    for feature in features:
        freq_enc = X[feature].value_counts(normalize=True)
        X[feature + "_freq_enc"] = X[feature].map(freq_enc)    
          
    return X

In [64]:
def encode_cat_feats(train, test, categorial):
    for feature in categorial:
        le = LabelEncoder()
        # train[feature] = le.fit_transform(train[feature].fillna('null'))
        # test[feature] = le.transform(test[feature].fillna('null'))
        train[feature] = le.fit_transform(train[feature].astype("str").fillna("NA"))
        test[feature] = le.transform(test[feature].astype("str").fillna("NA"))
    return train, test

# Подготовка датасета

In [65]:
train = get_input("train.csv")
test = get_input("test.csv")

data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(n=2)

train.csv shape: 110093 rows, 3 cols
test.csv shape: 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


In [66]:
client_profile = get_input("client_profile.csv")
client_profile = create_client_profile_features(client_profile)
client_profile.head(n=2)

client_profile.csv shape: 250000 rows, 24 cols


  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_salary_to_per_family_size,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,gender_freq_enc,childrens_freq_enc,family_status_freq_enc,family_size_freq_enc
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.428571,0.052832,2.930959,0.236315,0.678568,0.414784,0.329471,0.036237,20.0,0.085714,1.714286,78750.0,1.577103,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797,0.65858,0.70004,0.097008,0.514472
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0,2.0,-1.875,,,0.442295,0.802745,0.62252,0.62252,0.032481,18.86105,0.105433,1.988583,135000.0,1.227714,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462,0.65858,0.70004,0.639384,0.514472


In [67]:
data = data.merge(
    client_profile, how="left", on="application_number"
)

In [68]:
def get_train_test(X: pd.DataFrame) -> pd.DataFrame:
    """
    Формирование обучающей выборки и тестовой выборки
    
    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходными данными.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    train: pandas.core.frame.DataFrame
        Обучающая выборка
        
    test: pandas.core.frame.DataFrame
        Тестовая выборка
        
    target: pd.Series
        Целевая переменная
        
    test_id: pd.Series
        id пользователей для тестовой выборки
    
    categorial: str
        Список категориальных признаков.
    """
    mask = X["target"].isnull()
    features_to_drop = ["application_number", "target"]

    train, test = X.loc[~mask], X.loc[mask]

    target, test_id = train["target"], test["application_number"]
    train = train.drop(features_to_drop, axis=1)
    test = test.drop(features_to_drop, axis=1)

    categorial = train.dtypes[train.dtypes == "object"].index
    numerical = list(set(train.columns) - set(categorial))

    train = train.replace(np.inf, np.nan)
    train = train.replace(-np.inf, np.nan)
    
#    test[numerical] = test[numerical].astype(float)
#    test[categorial] = test[categorial].astype(str)
    
    return train, test, target, test_id, categorial

In [69]:
train, test, target, test_id, categorial = get_train_test(data)

In [70]:
CAT_FEATS = categorial.tolist()
print(f"Categorical Features Count: {len(CAT_FEATS)}")
CAT_FEATS

Categorical Features Count: 4


['name_contract_type', 'gender', 'education_level', 'family_status']

In [71]:
train_encode, test_encode = encode_cat_feats(train, test, categorial)

## XGBoost model

In [72]:
seed = 42
np.random.seed(seed)

cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

In [73]:
%%time

xgb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_bin": 20,
    "max_depth": 6,
    "nthread": 6,
    "l2_leaf_reg": 20,    
    "seed": seed
}
xgb_fit_params = {
    "early_stopping_rounds": 50,
    "verbose":50,    
    "eval_metric": "auc"    
}

xgb_estimators, xgb_oof_preds = xgboost_cross_validation(
    params=xgb_params, fit_params=xgb_fit_params, X=train_encode, y=target, cv=cv
)

Mon Nov  1 10:36:05 2021, Cross-Validation, 110093 rows, 57 cols




Parameters: { "l2_leaf_reg" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.70718	validation_1-auc:0.69074
[50]	validation_0-auc:0.72979	validation_1-auc:0.70027
[100]	validation_0-auc:0.73670	validation_1-auc:0.70180
[150]	validation_0-auc:0.74275	validation_1-auc:0.70298
[200]	validation_0-auc:0.75051	validation_1-auc:0.70576
[250]	validation_0-auc:0.75551	validation_1-auc:0.70859
[300]	validation_0-auc:0.76198	validation_1-auc:0.71080
[350]	validation_0-auc:0.76922	validation_1-auc:0.71343
[400]	validation_0-auc:0.77623	validation_1-auc:0.71428
[450]	validation_0-auc:0.78350	validation_1-auc:0.71605
[500]	validation_0-auc:0.79011	validation_1-auc:0.71851
[550]	validation_0-auc:0.79577	validation_1-auc:0.72015
[600]	validat



Parameters: { "l2_leaf_reg" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.70677	validation_1-auc:0.69640
[50]	validation_0-auc:0.72887	validation_1-auc:0.70707
[100]	validation_0-auc:0.73381	validation_1-auc:0.70756
[150]	validation_0-auc:0.73873	validation_1-auc:0.71065
[200]	validation_0-auc:0.74658	validation_1-auc:0.71351
[250]	validation_0-auc:0.75191	validation_1-auc:0.71552
[300]	validation_0-auc:0.75979	validation_1-auc:0.71887
[350]	validation_0-auc:0.76782	validation_1-auc:0.72167
[400]	validation_0-auc:0.77524	validation_1-auc:0.72257
[450]	validation_0-auc:0.78238	validation_1-auc:0.72435
[500]	validation_0-auc:0.78765	validation_1-auc:0.72590
[550]	validation_0-auc:0.79231	validation_1-auc:0.72692
[600]	validat



Parameters: { "l2_leaf_reg" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.70523	validation_1-auc:0.69588
[50]	validation_0-auc:0.72659	validation_1-auc:0.70957
[100]	validation_0-auc:0.73378	validation_1-auc:0.71105
[150]	validation_0-auc:0.74013	validation_1-auc:0.71264
[200]	validation_0-auc:0.74623	validation_1-auc:0.71692
[250]	validation_0-auc:0.75292	validation_1-auc:0.72013
[300]	validation_0-auc:0.76050	validation_1-auc:0.72339
[350]	validation_0-auc:0.76735	validation_1-auc:0.72621
[400]	validation_0-auc:0.77428	validation_1-auc:0.72883
[450]	validation_0-auc:0.78101	validation_1-auc:0.73109
[500]	validation_0-auc:0.78714	validation_1-auc:0.73280
[550]	validation_0-auc:0.79260	validation_1-auc:0.73347
[600]	validat



Parameters: { "l2_leaf_reg" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.70948	validation_1-auc:0.68769
[50]	validation_0-auc:0.72882	validation_1-auc:0.70053
[100]	validation_0-auc:0.73365	validation_1-auc:0.70339
[150]	validation_0-auc:0.74125	validation_1-auc:0.70610
[200]	validation_0-auc:0.74932	validation_1-auc:0.70738
[250]	validation_0-auc:0.75545	validation_1-auc:0.70898
[300]	validation_0-auc:0.76028	validation_1-auc:0.71193
[350]	validation_0-auc:0.76748	validation_1-auc:0.71460
[400]	validation_0-auc:0.77399	validation_1-auc:0.71734
[450]	validation_0-auc:0.78144	validation_1-auc:0.71884
[500]	validation_0-auc:0.78752	validation_1-auc:0.71982
[550]	validation_0-auc:0.79309	validation_1-auc:0.72135
[600]	validat



Parameters: { "l2_leaf_reg" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.70842	validation_1-auc:0.69386
[50]	validation_0-auc:0.72713	validation_1-auc:0.70053
[100]	validation_0-auc:0.73517	validation_1-auc:0.70244
[150]	validation_0-auc:0.73988	validation_1-auc:0.70425
[200]	validation_0-auc:0.74681	validation_1-auc:0.70972
[250]	validation_0-auc:0.75297	validation_1-auc:0.71228
[300]	validation_0-auc:0.75939	validation_1-auc:0.71458
[350]	validation_0-auc:0.76757	validation_1-auc:0.71674
[400]	validation_0-auc:0.77517	validation_1-auc:0.71874
[450]	validation_0-auc:0.78240	validation_1-auc:0.72130
[500]	validation_0-auc:0.78817	validation_1-auc:0.72325
[550]	validation_0-auc:0.79380	validation_1-auc:0.72477
[600]	validat

In [74]:
# Score by each fold: [0.72219, 0.72763, 0.73625, 0.7254, 0.72702]

xgb_oof_score = roc_auc_score(
    target, xgb_oof_preds
)
print(f"OOF-score = {round(xgb_oof_score, 5)}")
# OOF-score = 0.72668

OOF-score = 0.72668


### Прогнозные значения

In [75]:
def predict_by_estimators(test, estimators, test_id=test_id):
    y_pred = np.zeros(test.shape[0])

    for estimator in estimators:
        y_pred += estimator.predict_proba(test)[:, 1]
        
    y_pred = pd.DataFrame({
        "APPLICATION_NUMBER": test_id,
        "TARGET": y_pred / len(estimators)
    })
    return y_pred

In [76]:
xgb_y_pred = predict_by_estimators(test_encode, xgb_estimators)

In [77]:
# Сохранение прогнозных значений 
xgb_y_pred.to_csv("baseline_submit_xgb.csv", index=False)