# ДЗ 5. Feature Engineering, Feature Selection, part I

Продолжим работу с данными, которые были использованы в ДЗ2 и 3, продолжим решать задачу обнаружения мошеннических транзакций, что позволит получить полное решение задачи / полный пайплайн.

In [99]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

pd.options.display.max_columns = 450
import warnings
warnings.simplefilter("ignore")

**Подготовим данные**

In [None]:
data = pd.read_csv('./data/assignment_2_train.csv')
lb = pd.read_csv('./data/assignment_2_test.csv')

In [None]:
X_data = data.drop('isFraud', axis=1)
y_data = data['isFraud']
X_lb = lb.drop('isFraud', axis=1)
y_lb = lb['isFraud']

In [None]:
result = pd.DataFrame(columns=['train_mean', 'train_std', 'valid_mean', 'valid_std', 'valid_conf_interval','auc_lb'])

## Обучить и провалидировать модель

**Задание 0:** выбрать любую модель машинного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

In [101]:
def evaluation_model(X, y, lb_X, lb_y, operation=None):
    
    # Преобразование категориальных признаков
    cat_features = X.select_dtypes(exclude=np.number).columns.to_list()
    X[cat_features] = X[cat_features].astype('category')
    lb_X[cat_features] = lb_X[cat_features].astype('category')
    
    # Обучение модели
    X_train, X_valid = train_test_split(X, train_size=0.7, shuffle=True, random_state=5)
    y_train, y_valid = train_test_split(y, train_size=0.7, shuffle=True, random_state=5)

    model = lgb.LGBMClassifier(objective="binary", n_estimators=1000, random_state=5)

    model.fit(X=X_train, y=y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                categorical_feature=cat_features, # "auto",
                early_stopping_rounds=25,
                eval_metric="auc",
                verbose=100)
    
    # Кросс-валидация
    fold_train_scores, fold_valid_scores = [], []
    
    cv_strategy = KFold(n_splits=5, random_state=1)
    
    for fold_number, (train_idx, valid_idx) in enumerate(cv_strategy.split(X, y)):
        X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]

        y_train_pred = model.predict(X_train)
        y_valid_pred = model.predict(X_valid)

        fold_train_scores.append(roc_auc_score(y_train, y_train_pred))
        fold_valid_scores.append(roc_auc_score(y_valid, y_valid_pred))
        
    # Доверительный интервал
    conf_interval = 0.95 
        
    left_bound = np.percentile(fold_valid_scores, ((1 - conf_interval) / 2) * 100)
    right_bound = np.percentile(fold_valid_scores, (conf_interval + ((1 - conf_interval) / 2)) * 100)
    
    # Статистика
    if operation != None:
        
        result.loc[f'{operation}', 'train_mean'] = round(np.mean(fold_train_scores), 4)
        result.loc[f'{operation}', 'valid_mean'] = round(np.mean(fold_valid_scores), 4)
        result.loc[f'{operation}', 'train_std'] = round(np.std(fold_train_scores), 3)
        result.loc[f'{operation}', 'valid_std'] = round(np.std(fold_valid_scores), 3)
        result.loc[f'{operation}', 'valid_conf_interval'] = f'{round(left_bound, 3)}/{round(right_bound, 3)}'

        auc_lb = round(roc_auc_score(lb_y, model.predict_proba(lb_X)[:, 1]), 4)
        result.loc[f'{operation}', 'auc_lb'] = auc_lb

        return result

In [102]:
X_data_base = X_data.copy()
X_lb_base = X_lb.copy()

result = evaluation_model(X_data_base, y_data, X_lb_base, y_lb, operation='baseline')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.975608	training's binary_logloss: 0.0432565	valid_1's auc: 0.940201	valid_1's binary_logloss: 0.0565885
[200]	training's auc: 0.991572	training's binary_logloss: 0.0310524	valid_1's auc: 0.94739	valid_1's binary_logloss: 0.0519806
[300]	training's auc: 0.996377	training's binary_logloss: 0.0233609	valid_1's auc: 0.949776	valid_1's binary_logloss: 0.0495208
[400]	training's auc: 0.998267	training's binary_logloss: 0.0180339	valid_1's auc: 0.951945	valid_1's binary_logloss: 0.0480682
Early stopping, best iteration is:
[455]	training's auc: 0.999023	training's binary_logloss: 0.0155787	valid_1's auc: 0.952796	valid_1's binary_logloss: 0.047524


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584


## Преобразование TransactionDT в datetime

**Задание 1:** признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [103]:
def transform_datetime(data):
    data = data.copy()
    data["DT"] = pd.to_datetime(data["TransactionDT"], unit='s', origin='2017-11-30')
    data["year"] = data["DT"].dt.year
    data["month"] = data["DT"].dt.month
    data["day"] = data["DT"].dt.day
    data["hour"] = data["DT"].dt.hour
    data["day_of_week"] = data["DT"].dt.weekday
    data = data.drop("DT", axis=1)
    return data

In [104]:
X_data_dt = transform_datetime(data)
X_lb_dt = transform_datetime(lb)

result = evaluation_model(X_data_dt, y_data, X_lb_dt, y_lb, operation='transform_datetime')
result

Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[1]	training's auc: 1	training's binary_logloss: 0.0471359	valid_1's auc: 1	valid_1's binary_logloss: 0.0458656


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0


**Вывод:** Преобразование признака TransactionDT в datetime увеличило разрыв между показателями кросс-валидации и результом на лидерборде.

## Конкатенация признаков

**Задание 2:** сделать конкатенацию признаков:  

* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2

Рассматривать их, как категориальных признаки.

In [106]:
def concatenation(data):
    data = data.copy()
    data['card_1_2'] = data['card1'].astype(np.str) + '_' + data['card2'].astype(np.str)
    data['card_1_2_3_5'] = data['card_1_2'] + '_' + data['card3'].astype(np.str) + '_' + data['card5'].astype(np.str)
    data['card_1_2_3_5_addr_1_2'] = data['card_1_2_3_5'] + '_' + data['addr1'].astype(np.str) + '_' + data['addr2'].astype(np.str)
    return data

In [107]:
X_data_concat = concatenation(X_data) 
X_lb_concat = concatenation(X_lb)

result = evaluation_model(X_data_concat, y_data, X_lb_concat, y_lb, operation='concatenation')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.993307	training's binary_logloss: 0.0227319	valid_1's auc: 0.947786	valid_1's binary_logloss: 0.0487738
[200]	training's auc: 0.998799	training's binary_logloss: 0.0123169	valid_1's auc: 0.952444	valid_1's binary_logloss: 0.0464544
Early stopping, best iteration is:
[220]	training's auc: 0.99915	training's binary_logloss: 0.0109781	valid_1's auc: 0.952891	valid_1's binary_logloss: 0.0463379


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0
concatenation,0.922,0.003,0.9205,0.01,0.908/0.933,0.8401


**Вывод:** Конкатинация признаков увеличила разрыв между показателями кросс-валидации и результом на лидерборде.

## FrequencyEncoder

**Задание 3:** Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [108]:
def FrequencyEncoder(data, features):
    data = data.copy()
    for feature in features:
        freq_encoder = data[feature].value_counts(normalize=True)
        data[f"{feature}_freq_enc"] = data[feature].map(freq_encoder)
    return data

In [109]:
features = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
X_data_freq = FrequencyEncoder(X_data, features)
X_lb_freq = FrequencyEncoder(X_lb, features)

result = evaluation_model(X_data_freq, y_data, X_lb_freq, y_lb, operation='frequency_encoder')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.977712	training's binary_logloss: 0.0423063	valid_1's auc: 0.941587	valid_1's binary_logloss: 0.0561101
[200]	training's auc: 0.993025	training's binary_logloss: 0.0296955	valid_1's auc: 0.951308	valid_1's binary_logloss: 0.0507267
[300]	training's auc: 0.997424	training's binary_logloss: 0.0220766	valid_1's auc: 0.954336	valid_1's binary_logloss: 0.0482006
Early stopping, best iteration is:
[337]	training's auc: 0.998232	training's binary_logloss: 0.0199437	valid_1's auc: 0.955214	valid_1's binary_logloss: 0.0474744


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0
concatenation,0.922,0.003,0.9205,0.01,0.908/0.933,0.8401
frequency_encoder,0.8696,0.005,0.8678,0.014,0.855/0.892,0.8522


**Вывод:** FrequencyEncoder признаков увеличил среднеквадратичное отклонение и доверительный интервал модели на валидационной выборке.

## TransactionAmt к вычисленной статистике

**Задание 4:** Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [110]:
def create_aggs(data, groupby_id, aggs=None, features=None):
    
    data = data.copy()
    
    if aggs != None:
        data_grouped_num = data.groupby(groupby_id)
        stats_num = data_grouped_num.agg(aggs)
        stats_num.columns = [f"{groupby_id}_{feature}_{stat}" for feature, stat in stats_num]
        stats_num = stats_num.reset_index()
        data = data.merge(stats_num, how='left', on=groupby_id)
    
    if features != None:
        categorical = data[features].copy()
        le = LabelEncoder()
        for feature in features:
            cat_value = list(categorical[feature].values.astype('str'))
            le.fit(cat_value)
            categorical[feature] = le.transform(cat_value)
        categorical[groupby_id] = data[groupby_id]
        data_grouped_cat = categorical.groupby(groupby_id)
        stats_cat = data_grouped_cat.agg({col: ["mean", "sum"] for col in features})
        stats_cat.columns = [f"{groupby_id}_{feature}_{stat}" for feature, stat in stats_cat]
        stats_cat = stats_cat.reset_index()
        data = data.merge(stats_cat, how='left', on=groupby_id)
    
    return data

In [111]:
aggs = {"card1": [np.mean, np.std],
        "card2": [np.mean, np.std],
        "card3": [np.mean, np.std],
        "card5": [np.mean, np.std],
        "addr1": [np.mean, np.std],
        "addr2": [np.mean, np.std]
        }

features = ["card4", "card6", "card_1_2", "card_1_2_3_5", "card_1_2_3_5_addr_1_2"]

groupby_id = "TransactionAmt"
    
X_data_agg_amt = create_aggs(X_data_concat, groupby_id, aggs, features)
X_lb_agg_amt = create_aggs(X_lb_concat, groupby_id, aggs, features)

result = evaluation_model(X_data_agg_amt, y_data, X_lb_agg_amt, y_lb, operation='aggregating_TransactionAmt')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.993559	training's binary_logloss: 0.0215903	valid_1's auc: 0.948447	valid_1's binary_logloss: 0.0480976
[200]	training's auc: 0.999148	training's binary_logloss: 0.0105727	valid_1's auc: 0.954545	valid_1's binary_logloss: 0.0455159
Early stopping, best iteration is:
[216]	training's auc: 0.999388	training's binary_logloss: 0.00942071	valid_1's auc: 0.955455	valid_1's binary_logloss: 0.0453587


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0
concatenation,0.922,0.003,0.9205,0.01,0.908/0.933,0.8401
frequency_encoder,0.8696,0.005,0.8678,0.014,0.855/0.892,0.8522
aggregating_TransactionAmt,0.9282,0.002,0.9276,0.007,0.92/0.937,0.8344


**Вывод:** Создание новых признаков TransactionAmt к вычисленной статистике увеличило разрыв между показателями кросс-валидации и результом на лидерборде.

## D15 к вычисленной статистике

**Задание 5:** Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [112]:
groupby_id = "D15"

X_data_agg_d15 = create_aggs(X_data_concat, groupby_id, aggs, features)
X_lb_agg_d15 = create_aggs(X_lb_concat, groupby_id, aggs, features)

result = evaluation_model(X_data_agg_d15, y_data, X_lb_agg_d15, y_lb, operation='aggregating_D15')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.993511	training's binary_logloss: 0.0222547	valid_1's auc: 0.949163	valid_1's binary_logloss: 0.0483855
[200]	training's auc: 0.998708	training's binary_logloss: 0.0120595	valid_1's auc: 0.954198	valid_1's binary_logloss: 0.0456436
Early stopping, best iteration is:
[210]	training's auc: 0.998811	training's binary_logloss: 0.0114171	valid_1's auc: 0.954524	valid_1's binary_logloss: 0.0454885


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0
concatenation,0.922,0.003,0.9205,0.01,0.908/0.933,0.8401
frequency_encoder,0.8696,0.005,0.8678,0.014,0.855/0.892,0.8522
aggregating_TransactionAmt,0.9282,0.002,0.9276,0.007,0.92/0.937,0.8344
aggregating_D15,0.9205,0.003,0.9191,0.01,0.908/0.932,0.8403


**Вывод:** Создание новых признаков D15 к вычисленной статистике увеличило разрыв между показателями кросс-валидации и результом на лидерборде.

## Логарифм от TransactionAmt

**Задание 6:** выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt.

In [113]:
def transform_TransactionAmt(data):
    data = data.copy()
    data['TransactionAmt_whole'] = data['TransactionAmt']//1
    data['TransactionAmt_frac'] = data['TransactionAmt']%1
    data['TransactionAmt_log'] = np.log2(data['TransactionAmt'])
    return data

In [114]:
X_data_trans_amt = transform_TransactionAmt(X_data)
X_lb_trans_amt = transform_TransactionAmt(X_lb)

result = evaluation_model(X_data_trans_amt, y_data, X_lb_trans_amt, y_lb, operation='transform_TransactionAmt')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.977391	training's binary_logloss: 0.0429455	valid_1's auc: 0.939139	valid_1's binary_logloss: 0.0567328
[200]	training's auc: 0.991944	training's binary_logloss: 0.0305835	valid_1's auc: 0.946859	valid_1's binary_logloss: 0.0519318
Early stopping, best iteration is:
[269]	training's auc: 0.995263	training's binary_logloss: 0.0254212	valid_1's auc: 0.948995	valid_1's binary_logloss: 0.0501956


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0
concatenation,0.922,0.003,0.9205,0.01,0.908/0.933,0.8401
frequency_encoder,0.8696,0.005,0.8678,0.014,0.855/0.892,0.8522
aggregating_TransactionAmt,0.9282,0.002,0.9276,0.007,0.92/0.937,0.8344
aggregating_D15,0.9205,0.003,0.9191,0.01,0.908/0.932,0.8403
transform_TransactionAmt,0.8453,0.005,0.8437,0.017,0.82/0.866,0.8585


**Вывод:** Трансформация признака TransactionAmt сократила разрыв между показателями кросс-валидации и результом на лидерборде.

## Frequency Encoding

**Задание 7 (опция):** выполнить предварительную подготовку / очистку признаков P_emaildomain и R_emaildomain (что и как делать - остается на ваше усмотрение) и сделать Frequency Encoding для очищенных признаков.

In [115]:
def transform_emaildomain(data):
    
    data = data.copy()
    
    # Отсутствующие значения P_emaildomain заполнить данными из R_emaildomain
    condition = (data['P_emaildomain'].isnull()) & (data['R_emaildomain'].notnull())
    data[condition]['P_emaildomain'] = data[condition]['R_emaildomain']

    # Разбиение домена на уровни
    new = data['P_emaildomain'].str.split(".", n = 1, expand = True)
    data['P_emaildomain_1'] = new[0]
    data['P_emaildomain_2'] = new[1]

    # R_emaildomain, P_emaildomain удалить
    data = data.drop(['R_emaildomain', 'P_emaildomain'], axis=1)

    return data

In [116]:
X_data_trans_email = transform_emaildomain(X_data)
X_lb_trans_email = transform_emaildomain(X_lb)

result = evaluation_model(X_data_trans_email, y_data, X_lb_trans_email, y_lb, operation='transform_emaildomain')
result

Training until validation scores don't improve for 25 rounds
[100]	training's auc: 0.974751	training's binary_logloss: 0.0438913	valid_1's auc: 0.939765	valid_1's binary_logloss: 0.056667
[200]	training's auc: 0.991163	training's binary_logloss: 0.0313335	valid_1's auc: 0.947528	valid_1's binary_logloss: 0.0518995
[300]	training's auc: 0.99611	training's binary_logloss: 0.0237589	valid_1's auc: 0.950779	valid_1's binary_logloss: 0.049486
[400]	training's auc: 0.998157	training's binary_logloss: 0.0186276	valid_1's auc: 0.952912	valid_1's binary_logloss: 0.0480926
[500]	training's auc: 0.999284	training's binary_logloss: 0.0144914	valid_1's auc: 0.954038	valid_1's binary_logloss: 0.0473572
Early stopping, best iteration is:
[476]	training's auc: 0.999083	training's binary_logloss: 0.015438	valid_1's auc: 0.95411	valid_1's binary_logloss: 0.0474054


Unnamed: 0,train_mean,train_std,valid_mean,valid_std,valid_conf_interval,auc_lb
baseline,0.8892,0.004,0.8877,0.012,0.872/0.907,0.8584
transform_datetime,0.5,0.0,0.5,0.0,0.5/0.5,1.0
concatenation,0.922,0.003,0.9205,0.01,0.908/0.933,0.8401
frequency_encoder,0.8696,0.005,0.8678,0.014,0.855/0.892,0.8522
aggregating_TransactionAmt,0.9282,0.002,0.9276,0.007,0.92/0.937,0.8344
aggregating_D15,0.9205,0.003,0.9191,0.01,0.908/0.932,0.8403
transform_TransactionAmt,0.8453,0.005,0.8437,0.017,0.82/0.866,0.8585
transform_emaildomain,0.8913,0.004,0.8891,0.013,0.878/0.912,0.8463


**Вывод:** Трансформация признаков emaildomain увеличила среднеквадратичное отклонение и доверительный интервал модели на валидационной выборке.