In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb, lightgbm as lgbm, catboost as catb

%matplotlib inline

ModuleNotFoundError: No module named 'lightgbm'

### Загрузка, обзор данных

In [None]:
# Load the raw train and test tables from CSV files under ./data (paths are
# relative to the notebook's working directory).
data_train = pd.read_csv('./data/train.csv')
data_test = pd.read_csv('./data/test.csv')

In [None]:
def save_prepared_data(df, name):
    """Write ``df`` to ``./data/prepared_<name>.csv`` without the index column.

    Parameters
    ----------
    df : object exposing ``to_csv`` (a DataFrame in this notebook)
        Data to persist.
    name : str
        Suffix inserted into the output file name.
    """
    target_path = f'./data/prepared_{name}.csv'
    df.to_csv(target_path, index=False)

In [None]:
# Print a summary of the raw training frame (columns, dtypes, non-null counts).
data_train.info()

**Глобальные переменные**

In [None]:
home_own = 'Home Ownership' # home ownership status
ann_income = 'Annual Income' # annual income
years_on_job = 'Years in current job' # number of years in the current job
tax_liens = 'Tax Liens' # tax liens
numb_accounts = 'Number of Open Accounts' # number of open accounts
years_cred_hist = 'Years of Credit History' # length of credit history in years
max_credit = 'Maximum Open Credit' # largest open credit
cred_problems = 'Number of Credit Problems' # number of credit problems
months_last_delinq = 'Months since last delinquent' # months since the last missed payment
bancrupcies = 'Bankruptcies' # bankruptcies
purpose = 'Purpose' # purpose of the loan
term = 'Term' # loan term
cur_loan_amount = 'Current Loan Amount' # current loan amount
cur_cred_bal = 'Current Credit Balance' # current credit balance
month_debt = 'Monthly Debt' # monthly debt
cred_score = 'Credit Score' # credit score (used as the regression target in this notebook)
cred_default = 'Credit Default' # whether credit obligations were not met (0 - repaid on time, 1 - overdue)

**Обработка пропусков**

In [None]:
def print_na(df):
    """Print, for every column of ``df``, how many values are missing.

    The count is the total number of rows minus the number of non-null
    entries in each column. Nothing is returned.
    """
    total_rows = len(df)
    missing_per_column = total_rows - df.count()
    print(missing_per_column)
    
# Show the per-column missing-value counts of the raw training frame.
print_na(data_train)

In [None]:
def fill_annual_income(df):
    """Replace missing annual-income values with the column mean.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    income = df[ann_income]
    mean_income = income.mean()
    df[ann_income] = income.fillna(mean_income)
    return df

In [None]:
def fill_years_cur_job(df):
    """Replace missing "years in current job" values with the most frequent one.

    The first entry of the column's mode is used as the fill value, so the
    column is assumed to hold at least one non-missing value. The column is
    overwritten on the frame that was passed in, and that same frame object
    is returned.
    """
    tenure = df[years_on_job]
    most_common = tenure.mode()[0]
    df[years_on_job] = tenure.fillna(most_common)
    return df

In [None]:
def fill_months_last_del(df):
    """Replace missing "months since last delinquent" values with the column mean.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    months = df[months_last_delinq]
    mean_months = months.mean()
    df[months_last_delinq] = months.fillna(mean_months)
    return df

In [None]:
def fill_bankruptcies(df):
    """Replace missing bankruptcy values with the column mean.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    # NOTE(review): if this column holds whole-number counts, a mean fill
    # introduces fractional values - confirm that is intended.
    bankruptcies = df[bancrupcies]
    mean_bankruptcies = bankruptcies.mean()
    df[bancrupcies] = bankruptcies.fillna(mean_bankruptcies)
    return df

In [None]:
def fill_na(df):
    """Run every column-specific imputer over ``df`` and return the result.

    The imputers are applied one after another in a fixed order, each
    receiving the value returned by the previous one: annual income, years in
    current job, months since last delinquency, bankruptcies.
    """
    imputers = (
        fill_annual_income,
        fill_years_cur_job,
        fill_months_last_del,
        fill_bankruptcies,
    )
    for impute in imputers:
        df = impute(df)
    return df

In [None]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
def prepare_data(df):
    """Return a cleaned, numeric-only copy of ``df``.

    Steps, all performed on a copy so the caller's frame is not modified:
    impute missing values with ``fill_na``, keep only the columns whose dtype
    is listed in ``numerics``, then drop every row that still has a missing
    value.
    """
    filled = fill_na(df.copy())
    numeric_only = filled.select_dtypes(include=numerics)
    return numeric_only.dropna()

In [None]:
# Build the imputed, numeric-only training frame used by the rest of the notebook.
train_prepared = prepare_data(data_train)

**Отбор признаков**

In [None]:
# The regression target is the credit score column.
TARGET_NAME = cred_score
# Features: every column of the prepared frame except the target and the
# credit-default column.
FEATURE_NAMES_SELECTED = train_prepared.columns.drop([TARGET_NAME, cred_default]).tolist()

In [None]:
def plot_corr(features):
    """Plot each feature's correlation with the target as a bar chart.

    Correlations are computed on the module-level ``train_prepared`` frame
    against the ``TARGET_NAME`` column and drawn in descending order, with the
    coefficient on the x-axis and the feature name on the y-axis.

    Parameters
    ----------
    features : list of str
        Column names to correlate with the target.
    """
    columns = features + [TARGET_NAME]
    corr_matrix = train_prepared[columns].corr()
    # The target is the last column, so the last matrix column without its
    # final row (the target's self-correlation) holds feature-vs-target values.
    corr_with_target = corr_matrix.iloc[:-1, -1].sort_values(ascending=False)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x=corr_with_target.values, y=corr_with_target.index, ax=ax)
    ax.set_title('Correlation with target variable')
    plt.show()

In [None]:
# Plot how every selected feature correlates with the target.
plot_corr(FEATURE_NAMES_SELECTED)

**Нормализация данных**

In [None]:
def normalize(df, scaler=None):
    """Return a copy of ``df`` with the selected feature columns rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``FEATURE_NAMES_SELECTED``.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler to apply. Pass the scaler fitted on the
        training data here to scale another frame with the same statistics.
        When omitted, a new ``StandardScaler`` is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is left unmodified. Columns outside
        ``FEATURE_NAMES_SELECTED`` are copied over unchanged.
    """
    df_norm = df.copy()
    features = df_norm[FEATURE_NAMES_SELECTED]

    if scaler is None:
        scaled = StandardScaler().fit_transform(features)
    else:
        # Reuse the caller's statistics instead of refitting on this frame.
        scaled = scaler.transform(features)

    df_norm[FEATURE_NAMES_SELECTED] = scaled

    # df_norm is already independent of the input, so no second copy is needed.
    return df_norm

In [None]:
# train_prepared = normalize(train_prepared)

**Разбиение test и train**

In [None]:
def split(df, test_size=0.25, random_state=211):
    """Split ``df`` into shuffled train and test feature/target sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``FEATURE_NAMES_SELECTED`` columns and the
        ``TARGET_NAME`` column.
    test_size : float or int, default 0.25
        Passed through to ``train_test_split``.
    random_state : int, default 211
        Seed passed through to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    The value returned by ``train_test_split(X, y, ...)``; the notebook
    unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    X = df[FEATURE_NAMES_SELECTED]
    y = df[TARGET_NAME]

    return train_test_split(
        X, y, shuffle=True, test_size=test_size, random_state=random_state
    )

In [None]:
# Split the prepared frame into training and held-out feature/target sets.
X_train, X_test, y_train, y_test = split(train_prepared)

### CatBoost

**Подбор гиперпараметров**

In [None]:
# Fit a CatBoost regressor on the training split. Only the seed and the
# silent flag are set; every other hyperparameter keeps its library default.
model_catb = catb.CatBoostRegressor(silent=True, random_state=21)
model_catb.fit(X_train, y_train)

In [None]:
def get_prediction(ds, y=None, r2=False, model=None):
    """Predict with a fitted model and optionally print the R2 score.

    Parameters
    ----------
    ds : feature data accepted by the model's ``predict`` method.
    y : true target values; required when ``r2`` is true.
    r2 : bool, default False
        When true, print ``R2: <score>`` for the predictions against ``y``.
    model : object with a ``predict`` method, optional
        Model to use. Defaults to the module-level ``model_catb``.

    Returns
    -------
    Whatever ``model.predict(ds)`` returns.

    Raises
    ------
    ValueError
        If ``r2`` is true but ``y`` was not supplied.
    """
    # Fail before predicting rather than deep inside the metric call.
    if r2 and y is None:
        raise ValueError('y must be provided when r2=True')

    estimator = model_catb if model is None else model
    pred = estimator.predict(ds)
    if r2:
        print('R2:', r2_score(y, pred))
    return pred

In [None]:
# Print R2 on the training split; the predictions are the cell output.
get_prediction(X_train, y_train, True)

In [None]:
# Print R2 on the held-out split; the predictions are the cell output.
get_prediction(X_test, y_test, True)

In [None]:
# Show the first few true target values of the held-out split.
y_test.head()

In [None]:
# Pair every feature name with the importance reported by the fitted model,
# then show the ten largest.
feat_imp = pd.DataFrame({'feat': FEATURE_NAMES_SELECTED, 'importance': model_catb.get_feature_importance()})
feat_imp.sort_values('importance', ascending=False).head(10)

In [None]:
# Persist the fitted model with pickle (binary mode, file has no extension).
# Only unpickle this file from a trusted location: loading a pickle can run
# arbitrary code.
with open('./data/credit_score_reg', 'wb') as file:
    pickle.dump(model_catb, file)