# Спортивный анализ данных. Платформа Kaggle

# Практическое задание урока 4
Домашнее задание - построить базовую модель для курсового проекта, зафиксировать прогнозы на лидерборде.

https://www.kaggle.com/c/geekbrains-competitive-data-analysis/leaderboard - соревнование, которое мы решаем на курсе.

Для получения оценки "Отлично" нужно превзойти baseline 2. Для получения оценки "Хорошо" нужно превзойти baseline 1.

## Описание источников данных:
1.	train.csv - пары "заявка - целевая переменная", для этой выборки нужно собрать признаки и обучить модель;
2.	test.csv - пары "заявка - прогнозное значение", для этой выборки нужно собрать признаки и построить прогнозы;
3.	bki.csv - данные БКИ о предыдущих кредитах клиента;
4.	client_profile.csv - клиентский профиль, некоторые знания, которые есть у компании о клиенте;
5.	payments.csv - история платежей клиента;
6.	applications_history.csv - история предыдущих заявок клиента.


## Подключение библиотек и скриптов

In [48]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from pathlib import Path

pd.set_option('display.max_columns', None)

In [49]:
def get_input(
    dataset_path: str,
    data_root: "str | Path" = "D:/DS_materials/208_kaggle/data_comp/",
) -> pd.DataFrame:
    """
    Read a CSV file and print its basic shape information.

    Parameters
    ----------
    dataset_path: str
        File name, resolved relative to ``data_root``.

    data_root: str or pathlib.Path, optional
        Directory that holds the data files. Defaults to the
        directory that used to be hard-coded in this function, so
        existing calls keep working.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Loaded data set with all column names lower-cased.
    """
    # Join with pathlib instead of string interpolation so the separator
    # is always correct, whatever form ``data_root`` is given in.
    dataset = pd.read_csv(Path(data_root) / dataset_path)
    dataset.columns = [col.lower() for col in dataset.columns]
    print(f"{dataset_path} shape: {dataset.shape[0]} rows, {dataset.shape[1]} cols")

    return dataset

In [50]:
def catboost_cross_validation(params, X, y, cv,
                                categorical = None):
    """
    Cross-validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Model hyper-parameters, passed to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used to fit the model. It is not modified:
        the string cast of categorical columns is done on a copy.

    y: pandas.core.frame.Series
        Target vector used to fit the model.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list of str, optional, default = None
        Names of categorical features. Names that are not columns of
        ``X`` are ignored. Not used by default.

    Returns
    -------
    estimators: list
        Fitted models, one per fold.

    oof_preds: np.array
        Out-of-fold predictions (probability of class 1), aligned
        positionally with the rows of ``X``.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    if categorical:
        requested = set(categorical)
        # Keep the column order of X so the feature list is deterministic.
        categorical = [col for col in X.columns if col in requested]
        # Work on a copy: the caller's frame must not change as a side effect.
        X = X.copy()
        X[categorical] = X[categorical].astype(str)

    # cv.split yields POSITIONAL indices, so rows are selected by position.
    # Label-based selection is only correct when the index is exactly 0..n-1.
    y_values = np.asarray(y)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, cat_features=categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)

    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = (0.7, 0.2, 0.1), categorical = None):
    """
    Hold-out validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Model hyper-parameters, passed to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used to fit the model. It is not modified.

    y: pandas.core.frame.Series
        Target vector used to fit the model.

    split_params: sequence of float, optional, default = (0.7, 0.2, 0.1)
        Split fractions. The first value is the train share. When
        three values are given, the third one is the share of ALL rows
        that is carved out of the validation part as a test set.

    categorical: list of str, optional, default = None
        Names of categorical features. Names that are not columns of
        ``X`` are ignored. Not used by default.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Fitted CatBoost classifier.

    test_prediction: np.array, optional
        Predictions (probability of class 1) for the test part.
        Returned only when ``split_params`` contains 3 values.

    """
    if categorical:
        requested = set(categorical)
        categorical = [col for col in X.columns if col in requested]
        # Same preparation as in the cross-validation helper: cast the
        # categorical columns to str on a copy of the frame.
        X = X.copy()
        X[categorical] = X[categorical].astype(str)

    # Split X and y in one call so both are guaranteed to share the same
    # partition instead of relying on two calls with an equal seed.
    x_train, x_valid, y_train, y_valid = train_test_split(
        X, y, train_size=split_params[0], random_state=27
    )

    with_test_part = len(split_params) == 3
    if with_test_part:
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test, y_valid, y_test = train_test_split(
            x_valid, y_valid, test_size=test_size, random_state=72
        )

    estimator = cb.CatBoostClassifier(**params)
    estimator.fit(
        x_train, y_train, cat_features=categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, estimator.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if not with_test_part:
        return estimator

    test_prediction = estimator.predict_proba(x_test)[:, 1]
    test_score = roc_auc_score(y_test, test_prediction)
    print(f"Test Score = {round(test_score, 4)}")

    return estimator, test_prediction

In [51]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Build additional features from the client profile table.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Client profile with the original columns.

    copy: bool, optional, default = True
        When True the features are added to a copy of ``X``;
        otherwise ``X`` itself is extended in place.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the engineered features.

    """
    if copy:
        X = X.copy()

    # 365243 is turned into NaN before any ratio uses the column.
    # NOTE(review): presumably a placeholder for "no value" - confirm.
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)

    # Row-wise aggregates over all credit bureau request counters.
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    # Combinations of the three external scoring ratings.
    rating_columns = [
        "external_scoring_rating_1",
        "external_scoring_rating_2",
        "external_scoring_rating_3",
    ]
    rating_1, rating_2, rating_3 = (X[column] for column in rating_columns)
    X["external_scoring_prod"] = rating_1 * rating_2 * rating_3
    X["external_scoring_weighted"] = rating_1 * 2 + rating_2 * 1 + rating_3 * 3

    # Explicit name -> function table instead of eval() on a built string.
    # The very same numpy callables are used, so the results do not change.
    # np.nanmedian warns about rows in which all three ratings are NaN.
    row_aggregations = {
        "min": np.min,
        "max": np.max,
        "mean": np.mean,
        "nanmedian": np.nanmedian,
        "var": np.var,
    }
    ratings = X[rating_columns]
    for function_name, function in row_aggregations.items():
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = function(ratings, axis=1)

    # Ratios between the main financial indicators.
    X["ratio_credit_to_annuity"] = X["amount_credit"] / X["amount_annuity"]
    X["ratio_annuity_to_salary"] = X["amount_annuity"] / X["total_salary"]
    X["ratio_credit_to_salary"] = X["amount_credit"] / X["total_salary"]

    X["ratio_salary_to_per_family_size"] = X["total_salary"] / X["family_size"]

    # Financial indicators relative to age and time-based features.
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Ratios of time-based features.
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    # NOTE: despite the "ratio_" prefix this feature is a PRODUCT. The name
    # and the formula are kept as they are because other code refers to
    # the column by this name.
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Credit amounts multiplied by the external scoring ratings: total
    # (credit amount) and monthly (annuity) variants per rating.
    for suffix, rating in (("1", rating_1), ("2", rating_2), ("3", rating_3)):
        X["expected_total_loss_" + suffix] = rating * X["amount_credit"]
    for suffix, rating in (("1", rating_1), ("2", rating_2), ("3", rating_3)):
        X["expected_monthly_loss_" + suffix] = rating * X["amount_annuity"]

    return X

## Загрузка данных

In [52]:
# Load the tables used by the baseline; each call prints the table shape.
train, test, client_profile = (
    get_input(file_name)
    for file_name in ("train.csv", "test.csv", "client_profile.csv")
)
# The remaining sources are not loaded for the baseline:
#history = get_input("applications_history.csv")
#payments = get_input("payments.csv")
#bki = get_input("bki.csv")

train.csv shape: 110093 rows, 3 cols
test.csv shape: 165141 rows, 2 cols
client_profile.csv shape: 250000 rows, 24 cols


In [53]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
data = pd.concat((train, test), axis=0, ignore_index=True)
data.head(2)

Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


In [54]:
# Extend the client profile with the engineered features (on a copy).
client_profile = create_client_profile_features(X=client_profile, copy=True)
client_profile.head(n=2)

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_salary_to_per_family_size,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.428571,0.052832,2.930959,0.236315,0.678568,0.414784,0.329471,0.036237,20.0,0.085714,1.714286,78750.0,1.577103,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0,2.0,-1.875,,,0.442295,0.802745,0.62252,0.62252,0.032481,18.86105,0.105433,1.988583,135000.0,1.227714,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [55]:
# Attach the profile features to every application; applications without
# a profile row are kept (left join).
data = pd.merge(data, client_profile, on="application_number", how="left")

## baseline

In [56]:
def get_train_test(X: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, list]":
    """
    Split a combined frame into the train and test samples.

    Rows whose ``target`` is missing form the test sample; all other
    rows form the train sample. ``X`` itself is not modified.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the source data. Must contain the
        ``application_number`` and ``target`` columns.

    Returns
    -------
    train: pandas.core.frame.DataFrame
        Train sample without the id and target columns.

    test: pandas.core.frame.DataFrame
        Test sample without the id and target columns.

    target: pd.Series
        Target variable of the train sample.

    test_id: pd.Series
        ``application_number`` values of the test sample.

    categorial: list of str
        Names of the categorical (object dtype) features.
    """
    mask = X["target"].isnull()
    features_to_drop = ["application_number", "target"]

    # Both parts must come from the argument X, not from a module-level frame.
    train, test = X.loc[~mask], X.loc[mask]

    target, test_id = train["target"], test["application_number"]
    train = train.drop(features_to_drop, axis=1)
    test = test.drop(features_to_drop, axis=1)

    categorial = train.dtypes[train.dtypes == "object"].index.tolist()
    categorial_set = set(categorial)
    numerical = [col for col in train.columns if col not in categorial_set]

    infinities = [np.inf, -np.inf]
    train = train.replace(infinities, np.nan)

    # Cast the test columns in one step: numeric features to float and
    # categorical features to str.
    dtype_map = {col: float for col in numerical}
    dtype_map.update({col: str for col in categorial})
    test = test.astype(dtype_map)
    if numerical:
        # Clean infinities in test exactly as in train so that both
        # samples are preprocessed consistently.
        test[numerical] = test[numerical].replace(infinities, np.nan)

    return train, test, target, test_id, categorial

In [57]:
# Split the merged frame back into model inputs: feature matrices for train
# and test, the train target, the test ids and the categorical feature names.
train, test, target, test_id, categorial = get_train_test(data)

## KFold

In [58]:
# Columns excluded from the model input: this list is passed to
# DataFrame.drop both before cross-validation and before prediction.
features_to_drop = [
    'region_population',
    'family_size',
    'amt_req_credit_bureau_week',
    'amt_req_credit_bureau_year',
    'flag_phone',
    'flag_email',
    'ratio_salary_to_region_population',
    'childrens',
    'amt_req_credit_bureau_day',
    'amt_req_credit_bureau_hour',
    'bki_requests_count',
    'amt_req_credit_bureau_mon'
]

In [59]:
# CatBoost hyper-parameters for the baseline model.
cb_params = dict(
    n_estimators=2000,
    # learning_rate=0.01,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=20,
    verbose=50,
    max_depth=6,
    l2_leaf_reg=10,
    early_stopping_rounds=50,
    thread_count=6,
    random_seed=42,
)

# Shuffled 5-fold split with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=1234123)

# Fit one model per fold on the train features without the excluded columns.
cv_features = train.drop(columns=features_to_drop)
estimators, oof_preds = catboost_cross_validation(
    cb_params, cv_features, target, cv, categorical=categorial
)

Fri Oct 22 20:26:08 2021, Cross-Validation, 110093 rows, 41 cols
0:	test: 0.6591096	test1: 0.6567798	best: 0.6567798 (0)	total: 223ms	remaining: 7m 25s
50:	test: 0.7132005	test1: 0.7060518	best: 0.7060518 (50)	total: 8.79s	remaining: 5m 36s
100:	test: 0.7230663	test1: 0.7132894	best: 0.7132894 (100)	total: 17.5s	remaining: 5m 28s
150:	test: 0.7305601	test1: 0.7181439	best: 0.7181439 (150)	total: 26s	remaining: 5m 18s
200:	test: 0.7358915	test1: 0.7202399	best: 0.7202399 (200)	total: 34.4s	remaining: 5m 8s
250:	test: 0.7403319	test1: 0.7218933	best: 0.7219677 (249)	total: 42.6s	remaining: 4m 56s
300:	test: 0.7436588	test1: 0.7229261	best: 0.7229261 (300)	total: 50.8s	remaining: 4m 46s
350:	test: 0.7465954	test1: 0.7228932	best: 0.7232622 (337)	total: 59.1s	remaining: 4m 37s
400:	test: 0.7488953	test1: 0.7237313	best: 0.7237355 (399)	total: 1m 6s	remaining: 4m 27s
450:	test: 0.7512605	test1: 0.7240448	best: 0.7241576 (435)	total: 1m 15s	remaining: 4m 17s
500:	test: 0.7536475	test1: 0.724

In [60]:
# Scores recorded from an earlier run, kept for reference:
#   Score by each fold: [0.72297, 0.7264, 0.73189, 0.71947, 0.72715]
#   OOF-score = 0.72473

# ROC AUC of the out-of-fold predictions over the whole train sample.
oof_score = roc_auc_score(y_true=target, y_score=oof_preds)
print(f'OOF-score = {round(oof_score, 5)}')

OOF-score = 0.72558


## Подготовка прогноза

In [61]:
# The excluded columns are the same for every model, so drop them once
# instead of rebuilding the frame on each loop iteration.
test_features = test.drop(features_to_drop, axis=1)

# Sum the class-1 probabilities predicted by each fold model.
test_proba = np.zeros(test_features.shape[0])
for estimator in estimators:
    test_proba += estimator.predict_proba(test_features)[:, 1]

# Average over the models actually fitted and build the submission table.
y_pred = pd.DataFrame({
    "APPLICATION_NUMBER": test_id,
    "TARGET": test_proba / len(estimators)
})

y_pred.to_csv("baseline_submit.csv", index=False)
y_pred.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET
110093,123724268,0.05865
110094,123456549,0.218495
110095,123428178,0.180643
110096,123619984,0.084428
110097,123671104,0.0211
