In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

# Создание новых признаков

In [2]:
# Directory that holds the input CSV files read below (train.csv, purchases.csv).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = 'E:/retailhero-uplift/data'

In [5]:
# Load the training table (client_id, issue/redeem dates, age, gender,
# treatment_flg, target) and preview the first rows.
# The date columns are read as plain strings (object dtype); no parsing happens here.
uplift = pd.read_csv(path + '/train.csv', encoding='utf-8')
uplift.head()

Unnamed: 0,client_id,first_issue_date,first_redeem_date,age,gender,treatment_flg,target
0,000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U,0,1
1,000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F,1,1
2,00010925a5,2018-07-24 16:21:29,2018-09-14 16:12:49,83,U,1,1
3,0001f552b0,2017-06-30 19:20:38,2018-08-28 12:59:45,33,F,1,1
4,00020e7b18,2017-11-27 11:41:45,2018-01-10 17:50:05,73,U,1,1


In [10]:
# Show dtypes and non-null counts; first_redeem_date is the only column
# with missing values in the output below.
uplift.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200039 entries, 0 to 200038
Data columns (total 7 columns):
client_id            200039 non-null object
first_issue_date     200039 non-null object
first_redeem_date    182493 non-null object
age                  200039 non-null int64
gender               200039 non-null object
treatment_flg        200039 non-null int64
target               200039 non-null int64
dtypes: int64(3), object(4)
memory usage: 10.7+ MB


In [6]:
# Load the purchase history and preview it. The preview shows one row per
# product inside a transaction, with transaction-level fields (points,
# purchase_sum, store_id) repeated on each of those rows.
purchases = pd.read_csv(path + '/purchases.csv', encoding='utf-8')
purchases.head()

Unnamed: 0,client_id,transaction_id,transaction_datetime,regular_points_received,express_points_received,regular_points_spent,express_points_spent,purchase_sum,store_id,product_id,product_quantity,trn_sum_from_iss,trn_sum_from_red
0,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,9a80204f78,2.0,80.0,
1,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,da89ebd374,1.0,65.0,
2,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,0a95e1151d,1.0,24.0,
3,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,4055b15e4a,2.0,50.0,
4,000012768d,7e3e2e3984,2018-12-01 07:12:45,10.0,0.0,0.0,0.0,1007.0,54a4a11a29,a685f1916b,1.0,22.0,


In [11]:
# Show dtypes and the memory footprint of the purchases table
# (the output below reports 4.4+ GB for ~45.8M rows).
purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45786568 entries, 0 to 45786567
Data columns (total 13 columns):
client_id                  object
transaction_id             object
transaction_datetime       object
regular_points_received    float64
express_points_received    float64
regular_points_spent       float64
express_points_spent       float64
purchase_sum               float64
store_id                   object
product_id                 object
product_quantity           float64
trn_sum_from_iss           float64
trn_sum_from_red           float64
dtypes: float64(8), object(5)
memory usage: 4.4+ GB


Извлечение признаков из временных меток:
 - месяц
 - день недели
 - час

Из файла с покупками сделаем такие признаки за всю историю и последние 30 дней: 
   - общее количество покупок (штуки и суммы)
   - количество накопленных баллов
   - количество потраченных баллов
   - баланс баллов
   - количество уникальных магазинов, где были совершены покупки
   - количество дней между первой и последней покупкой

Добавим описательные статистики:
 - среднее
 - стандартное отклонение
 - квартили
 - минимумы
 - максимумы

И добавим функциональные зависимости:
 - полиномы
 - логарифмы
 - экспоненты

## 0.Используемые классы и функции

Класс для выбора колонки.

In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks ``column`` out of the incoming data.

    ``column`` is passed straight to ``X[...]``, so whatever that indexing
    returns for the given container is what ``transform`` returns.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[self.column]``."""
        selected = X[self.column]
        return selected

Класс для преобразования возраста: значения вне диапазона [age_min, age_max] заменяются ближайшей границей.

In [15]:
class FeaturesTransformator(BaseEstimator, TransformerMixin):
    """Clip the values of one numeric column (age) to ``[age_min, age_max]``.

    Parameters
    ----------
    key : column label
        Column whose values are clipped.
    age_min, age_max : int
        Default lower / upper bounds used by ``transform``.
    """

    def __init__(self, key, age_min: int = 16, age_max: int = 80):
        self.key = key
        self.age_min = age_min
        self.age_max = age_max

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, age_min=None, age_max=None):
        """Return a copy of ``X`` with ``X[key]`` clipped to the age bounds.

        ``age_min`` / ``age_max`` are optional per-call overrides; when
        omitted, the bounds given to the constructor are used, so the method
        can be called as ``transform(X)`` from a ``Pipeline``.
        Missing values in the column are left untouched.
        """
        lower = self.age_min if age_min is None else age_min
        upper = self.age_max if age_max is None else age_max

        # Work on a copy so the caller's frame is not silently modified.
        X = X.copy()
        X[self.key] = X[self.key].clip(lower=lower, upper=upper)
        return X

Класс для извлечения признаков из временных меток.

In [14]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Derive month / weekday / hour columns from a timestamp column.

    Parameters
    ----------
    key : str
        Name of the timestamp column. The new columns are named
        ``<key>_month``, ``<key>_weekday`` and ``<key>_hour``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the three calendar columns added.

        The source column is parsed with ``pd.to_datetime`` first, so it may
        hold either datetimes or date strings; it is itself left unchanged.
        Missing timestamps yield missing values in the derived columns.
        """
        # Work on a copy so the caller's frame is not silently modified.
        X = X.copy()
        stamps = pd.to_datetime(X[self.key])
        X[self.key + '_' + 'month'] = stamps.dt.month
        X[self.key + '_' + 'weekday'] = stamps.dt.weekday
        X[self.key + '_' + 'hour'] = stamps.dt.hour
        return X

Класс для создания признаков: описательные статистики, функциональные зависимости.

In [7]:
class FeaturesGenerator:
    """Add derived columns (statistics, powers, log/exp) to a DataFrame.

    The frame passed to ``load_pd`` is modified in place by every method.
    """

    def load_pd(self, pd_dataframe):
        """Register the DataFrame that subsequent calls will extend."""
        self.df = pd_dataframe

    def base_statistics(self, column):
        """Add whole-column statistics of ``column`` as constant columns.

        Creates ``<column>_max``, ``_min``, ``_mean``, ``_std``, ``_q1``,
        ``_q2`` and ``_q3``; each holds one scalar repeated on every row.
        """
        values = self.df[column]
        self.df[column + '_' + 'max'] = values.max()
        self.df[column + '_' + 'min'] = values.min()
        self.df[column + '_' + 'mean'] = values.mean()
        self.df[column + '_' + 'std'] = values.std()
        # A scalar q yields a scalar. A list-like q yields a Series indexed by
        # the quantile, which would be index-aligned on assignment (all NaN).
        self.df[column + '_' + 'q1'] = values.quantile(0.25)
        self.df[column + '_' + 'q2'] = values.quantile(0.5)
        self.df[column + '_' + 'q3'] = values.quantile(0.75)

    def polynomizer(self, column, n=2):
        """Add powers 2..n of ``column`` as ``<column>_2`` ... ``<column>_n``."""
        for power in range(2, n + 1):
            self.df[column + '_' + str(power)] = self.df[column] ** power

    def naturalize(self, columns, tol=1.01):
        """Add ``<column>_log`` and ``<column>_exp`` for each given column.

        ``columns`` may be a single column name or an iterable of names.
        ``tol`` is added to the values before both ``np.log`` and ``np.exp``.
        """
        if isinstance(columns, str):
            columns = [columns]
        for column in columns:
            shifted = self.df[column] + tol
            self.df[column + '_' + 'log'] = np.log(shifted)
            # NOTE(review): the shift is also applied before exp, as in the
            # original expression — confirm this is intended.
            self.df[column + '_' + 'exp'] = np.exp(shifted)

Класс для создания признаков из таблицы purchases.

In [None]:
class PurchaseFeatures:
    """Add per-client aggregate columns to two purchase DataFrames.

    ``df1`` receives columns suffixed ``_total`` / ``_total_unique`` and
    ``df2`` receives columns suffixed ``_last`` / ``_last_unique``. Both
    frames must contain a ``client_id`` column and are modified in place.
    Each aggregate is broadcast back to every row of its client.
    """

    def load_pd(self, pd_dataframe1, pd_dataframe2):
        """Register the two DataFrames that subsequent calls will extend."""
        self.df1 = pd_dataframe1
        self.df2 = pd_dataframe2

    def get_sum(self, columns):
        """Add the per-client sum of each column in ``columns``.

        NOTE(review): rows are summed as they are; if a value is repeated on
        several rows of the same transaction it is counted once per row.
        """
        for column in columns:
            # transform() keeps the row index, so the assignment lines up with
            # the frame. A plain groupby().sum() is indexed by client_id and
            # would align to NaN.
            self.df1[column + '_' + 'total'] = (
                self.df1.groupby('client_id')[column].transform('sum'))
            self.df2[column + '_' + 'last'] = (
                self.df2.groupby('client_id')[column].transform('sum'))

    def get_unique(self, columns):
        """Add the per-client number of distinct values of each column."""
        for column in columns:
            self.df1[column + '_' + 'total_unique'] = (
                self.df1.groupby('client_id')[column].transform('nunique'))
            self.df2[column + '_' + 'last_unique'] = (
                self.df2.groupby('client_id')[column].transform('nunique'))

    def get_balance(self, columns):
        """Placeholder: the original definition had no body.

        TODO: define how ``columns`` maps to received/spent pairs and
        implement the balance features.
        """
        raise NotImplementedError('PurchaseFeatures.get_balance is not implemented yet')

In [12]:
def _client_purchase_stats(transactions, value_cols):
    """Aggregate a one-row-per-transaction frame to one row per client."""
    by_client = transactions.groupby('client_id')
    stats = by_client[value_cols].sum()
    stats['unique_stores'] = by_client['store_id'].nunique()
    first_seen = by_client['transaction_datetime'].min()
    last_seen = by_client['transaction_datetime'].max()
    # Whole days between the client's first and last transaction.
    stats['days_delay'] = (last_seen - first_seen).dt.days
    return stats


def purchase_features(df, last_period_start='2019-02-18'):
    """Add per-client purchase features to ``df`` (in place) and return it.

    ``df`` holds one row per product within a transaction and must contain
    ``client_id``, ``transaction_id``, ``transaction_datetime``, the four
    ``*_points_received`` / ``*_points_spent`` columns, ``purchase_sum``,
    ``store_id`` and ``product_quantity``.

    Features are computed twice: over the whole history (``total_*``,
    ``unique_stores``, ``days_delay``) and over transactions dated on or
    after ``last_period_start`` (``last_30_*``, ``last_days_delay``). Every
    row of a client receives that client's values.

    Parameters
    ----------
    df : pandas.DataFrame
        Product-level purchase rows; new columns are added to it.
    last_period_start : str or datetime-like, default '2019-02-18'
        Start of the "recent" window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the feature columns added. Clients with
        no transaction in the recent window get 0 for the recent sums and
        store counts, and NaN for ``last_days_delay``.
    """
    point_cols = ['regular_points_received', 'express_points_received',
                  'regular_points_spent', 'express_points_spent']
    value_cols = ['purchase_sum', 'product_quantity'] + point_cols

    # Collapse product rows to one row per transaction. Transaction-level
    # fields are taken once; product_quantity is a per-product value, so it
    # is summed across the transaction's rows.
    per_transaction = {col: 'last' for col in point_cols}
    per_transaction.update({
        'purchase_sum': 'last',
        'store_id': 'last',
        'transaction_datetime': 'last',
        'product_quantity': 'sum',
    })
    all_hist = (df.groupby(['client_id', 'transaction_id'], sort=False)
                  .agg(per_transaction)
                  .reset_index())
    # Parse timestamps on the (smaller) transaction-level frame only.
    all_hist['transaction_datetime'] = pd.to_datetime(all_hist['transaction_datetime'])
    last_month = all_hist[all_hist['transaction_datetime'] >= pd.Timestamp(last_period_start)]

    total = _client_purchase_stats(all_hist, value_cols)
    recent = _client_purchase_stats(last_month, value_cols).reindex(total.index)
    additive = value_cols + ['unique_stores']
    recent[additive] = recent[additive].fillna(0)

    # Output column names are kept exactly as in the original notebook.
    total_names = {col: 'total_' + col for col in value_cols}
    total_names.update({'unique_stores': 'unique_stores',
                        'days_delay': 'days_delay'})
    recent_names = {col: 'last_30_' + col for col in value_cols}
    recent_names.update({'product_quantity': 'last_30_product_quantity_sum',
                         'unique_stores': 'last_30_unique_stores',
                         'days_delay': 'last_days_delay'})

    # Broadcast client-level values back to the rows through client_id;
    # assigning the client-indexed Series directly would align to NaN.
    client_ids = df['client_id']
    for stats, names in ((total, total_names), (recent, recent_names)):
        for source, target in names.items():
            df[target] = client_ids.map(stats[source])

    # Balance = received - spent, within the same window.
    for window in ('total', 'last_30'):
        for kind in ('regular', 'express'):
            prefix = window + '_' + kind + '_points_'
            df[prefix + 'balance'] = df[prefix + 'received'] - df[prefix + 'spent']

    return df

Функция для mean target кодирования.

In [11]:
def mean_target_encoding(train_df, y_train, valid_df, skf):    
    glob_mean = y_train.mean()
    train_df = pd.concat([train_df, pd.Series(y_train, name='y')], axis=1)
    new_train_df = train_df.copy()
    
    cat_features = train_df.columns[train_df.dtypes == 'object'].tolist()
    
    for col in cat_features:
        new_train_df[col + '_mean_target'] = [glob_mean for _ in range(new_train_df.shape[0])]
        
    for train_idx, valid_idx in skf.split(train_df, y_train):
        train_df_cv, valid_df_cv = train_df.iloc[train_idx, :], train_df.iloc[valid_idx, :]
        
        for col in cat_features:
            
            means = valid_df_cv[col].map(train_df_cv.groupby(col)['y'].means())
            valid_df_cv[col + '_mean_target'] = means.fillna(glob_mean)
            
        new_train_df.iloc[valid_idx] = valid_df_cv
        
    new_train_df.drop(cat_features + ['y'], axis=1, inplace=True)
    
    for col in cat_features:
        means = valid_df[col].map(train_df.groupby(col)['y'].mean())
        valid_df[col + '_mean_target'] = means.fillna(glob_mean)
        
    valid_df.drop(train_df.columns[train_df.dtypes == 'object'], axis=1, inplace=True)
    
    return new_train_df, valid_df

Функция для визуализации ROC-AUC.

In [9]:
def roc_auc_plot(y_test, preds):
    """Plot the ROC curve of ``preds`` against ``y_test`` (positive label 1).

    Applies seaborn styling, opens a 10x8 figure, draws the curve together
    with the (0, 0)-(1, 1) reference line, and shows the figure.
    """
    # Seaborn styling is set first, before the figure is created.
    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    plt.figure(figsize=(10, 8))

    # Thresholds are returned by roc_curve but not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, preds, pos_label=1)
    plt.plot(false_pos_rate, true_pos_rate, lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1])

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc='best')

    plt.show()