In [0]:
# Colab only: mount Google Drive so the pickles read from /content/drive further down are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime, math

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import seaborn as sns
import matplotlib.pyplot as plt

########################### Model
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [0]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed the stdlib and NumPy global random generators and export PYTHONHASHSEED.

    :seed value handed to every generator           # type: int

    PYTHONHASHSEED is only written to ``os.environ`` here; the interpreter
    reads it at start-up, so it presumably affects child processes only.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
## Memory Reducer
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest dtype that fits.

    :df pandas dataframe to reduce size             # type: pd.DataFrame()
    :verbose print the resulting size and saving    # type: bool
    :return the same dataframe object, mutated

    Integer columns become the smallest of int8/int16/int32/int64 whose range
    contains the column's min and max. Bounds are inclusive, so a column whose
    maximum is exactly 127 still becomes int8. Float columns are chosen by
    range only, so float16/float32 results may lose precision. Columns of any
    other dtype are left untouched.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Empty columns give NaN bounds, fail every comparison and keep their dtype.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns stay at full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame reports 0% instead of NaN.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

In [0]:
########################### Vars
#################################################################################
# Single seed shared by the stdlib and NumPy generators (see seed_everything).
SEED = 42
seed_everything(SEED)
# Name of the binary label column in the training frame.
TARGET = 'isFraud'
# Reference timestamp that the TransactionDT second offsets are added to when building the 'DT' column.
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [0]:
def values_normalization(dt_df, periods, columns):
    """Append per-period min-max and z-score versions of ``columns`` to ``dt_df``.

    For every (period, col) pair two columns are added:
      * ``<col>_<period>_min_max``   = (x - min) / (max - min)
      * ``<col>_<period>_std_score`` = (x - mean) / std
    where min/max/mean/std are taken over the rows sharing the same value of
    ``period``.

    :dt_df frame holding ``periods`` and ``columns``; mutated in place   # type: pd.DataFrame
    :periods names of the grouping columns                               # type: list of str
    :columns names of the columns to normalize (each is cast to float)   # type: list of str
    :return ``dt_df`` itself

    Groups where max == min, or where std is 0 or undefined, yield NaN/inf.
    The statistics are kept in local Series, so columns of the caller's frame
    named ``temp_min`` / ``temp_max`` / ``temp_mean`` / ``temp_std`` are left
    alone.
    """
    for period in periods:
        for col in columns:
            new_col = col + '_' + period
            dt_df[col] = dt_df[col].astype(float)

            # One groupby pass produces all four statistics, indexed by period value.
            stats = dt_df.groupby(period)[col].agg(['min', 'max', 'mean', 'std'])

            # Broadcast each statistic back onto the rows through its period value.
            period_min = dt_df[period].map(stats['min'].to_dict())
            period_max = dt_df[period].map(stats['max'].to_dict())
            period_mean = dt_df[period].map(stats['mean'].to_dict())
            period_std = dt_df[period].map(stats['std'].to_dict())

            dt_df[new_col + '_min_max'] = (dt_df[col] - period_min) / (period_max - period_min)
            dt_df[new_col + '_std_score'] = (dt_df[col] - period_mean) / period_std
    return dt_df

In [0]:
def frequency_encoding(train_df, test_df, columns, self_encoding=False):
    """Encode each column by how often its value occurs in train and test combined.

    :train_df, :test_df frames holding ``columns``; both are mutated    # type: pd.DataFrame
    :columns names of the columns to encode                             # type: list of str
    :self_encoding True overwrites the column itself, False writes the
                   counts to a new ``<col>_fq_enc`` column              # type: bool
    :return (train_df, test_df)

    Missing values are counted as a category of their own (dropna=False).
    """
    for col in columns:
        pooled = pd.concat([train_df[[col]], test_df[[col]]])
        counts = pooled[col].value_counts(dropna=False).to_dict()
        destination = col if self_encoding else col + '_fq_enc'
        train_df[destination] = train_df[col].map(counts)
        test_df[destination] = test_df[col].map(counts)
    return train_df, test_df

In [0]:
def timeblock_frequency_encoding(train_df, test_df, periods, columns,
                                 with_proportions=True, only_proportions=False):
    """Count how often each value of ``columns`` occurs inside each time block.

    For every (period, col) pair the combined key ``"<value>_<period value>"``
    is counted over train and test together and written to ``<col>_<period>``.

    :train_df, :test_df frames holding ``columns``, ``periods`` and a
                        ``<period>_total`` column per period; mutated    # type: pd.DataFrame
    :periods names of the time-block columns                             # type: list of str
    :columns names of the columns to encode                              # type: list of str
    :with_proportions also add ``<col>_<period>_proportions`` =
                      count / ``<period>_total``                         # type: bool
    :only_proportions store count / ``<period>_total`` in
                      ``<col>_<period>`` instead of the raw count        # type: bool
    :return (train_df, test_df)

    Both proportion outputs are derived from the raw count, so enabling both
    flags never divides by the period total twice.
    """
    for period in periods:
        total_col = period + '_total'
        for col in columns:
            new_col = col + '_' + period

            # Key each row by "<value>_<period>"; astype(str) turns NaN into 'nan',
            # so missing values form their own key.
            train_key = train_df[col].astype(str) + '_' + train_df[period].astype(str)
            test_key = test_df[col].astype(str) + '_' + test_df[period].astype(str)

            fq_encode = pd.concat([train_key, test_key]).value_counts().to_dict()
            train_counts = train_key.map(fq_encode)
            test_counts = test_key.map(fq_encode)

            if only_proportions:
                train_df[new_col] = train_counts / train_df[total_col]
                test_df[new_col] = test_counts / test_df[total_col]
            else:
                train_df[new_col] = train_counts
                test_df[new_col] = test_counts

            if with_proportions:
                train_df[new_col + '_proportions'] = train_counts / train_df[total_col]
                test_df[new_col + '_proportions'] = test_counts / test_df[total_col]

    return train_df, test_df

In [0]:
def uid_aggregation(train_df, test_df, main_columns, uids, aggregations):
    """Attach group statistics of ``main_columns`` computed per ``uids`` value.

    For every (main_column, uid, aggregation) triple the statistic is computed
    over train and test combined and written to the new column
    ``<uid>_<main_column>_<aggregation>`` in both frames.

    :train_df, :test_df frames holding the uid and value columns; mutated   # type: pd.DataFrame
    :main_columns names of the columns being aggregated                     # type: list of str
    :uids names of the grouping columns                                     # type: list of str
    :aggregations pandas aggregation names such as 'mean' or 'std'          # type: list of str
    :return (train_df, test_df)
    """
    for main_column in main_columns:
        for col in uids:
            for agg_type in aggregations:
                new_col_name = '_'.join([col, main_column, agg_type])

                pooled = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
                per_uid = pooled.groupby([col])[main_column].agg([agg_type])
                # uid value -> statistic; uids missing from the lookup map to NaN.
                lookup = per_uid[agg_type].to_dict()

                train_df[new_col_name] = train_df[col].map(lookup)
                test_df[new_col_name] = test_df[col].map(lookup)

    return train_df, test_df

def uid_aggregation_and_normalization(train_df, test_df, main_columns, uids, aggregations):
    """Normalize ``main_columns`` by per-uid statistics and keep only the normalized column.

    For every (main_column, uid) pair each aggregation is computed over train
    and test combined, then ``<uid>_<main_column>_std_norm`` is written as
    ``(value - first aggregation) / second aggregation``. The first two
    aggregation columns are deleted again afterwards; any further ones remain.

    :train_df, :test_df frames holding the uid and value columns; mutated   # type: pd.DataFrame
    :main_columns names of the columns being normalized                     # type: list of str
    :uids names of the grouping columns                                     # type: list of str
    :aggregations at least two aggregation names; presumably
                  ['mean', 'std'] given the ``_std_norm`` suffix            # type: list of str
    :return (train_df, test_df)
    """
    for main_column in main_columns:
        for col in uids:
            new_norm_col_name = col + '_' + main_column + '_std_norm'
            norm_cols = []

            for agg_type in aggregations:
                new_col_name = '_'.join([col, main_column, agg_type])
                pooled = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
                per_uid = pooled.groupby([col])[main_column].agg([agg_type])
                lookup = per_uid[agg_type].to_dict()

                train_df[new_col_name] = train_df[col].map(lookup)
                test_df[new_col_name] = test_df[col].map(lookup)
                norm_cols.append(new_col_name)

            # First aggregation is the centre, second one the scale.
            center_col, scale_col = norm_cols[0], norm_cols[1]
            train_df[new_norm_col_name] = (train_df[main_column] - train_df[center_col]) / train_df[scale_col]
            test_df[new_norm_col_name] = (test_df[main_column] - test_df[center_col]) / test_df[scale_col]

            for frame in (train_df, test_df):
                del frame[center_col], frame[scale_col]

    return train_df, test_df

In [0]:
def check_cor_and_remove(train_df, test_df, i_cols, new_columns, remove=False):
    """Print each new column's correlation with TARGET and optionally prune the weaker ones.

    :train_df frame holding TARGET and ``new_columns``; mutated when remove=True   # type: pd.DataFrame
    :test_df frame holding ``new_columns``; mutated when remove=True               # type: pd.DataFrame
    :i_cols base column names; for each one the best derived column is kept        # type: list of str
    :new_columns derived columns to evaluate                                       # type: list of str
    :remove when True, delete every new column that is not the best option
            for some base column                                                   # type: bool
    :return (train_df, test_df)

    A derived column belongs to a base column when the base name is a
    substring of its name. NaNs are replaced by 0 before correlating.
    """
    def target_correlation(col):
        # Pearson correlation between the label and the zero-filled column.
        return np.corrcoef(train_df[TARGET], train_df[col].fillna(0))[0][1]

    # Check correlation
    print('Correlations','#'*10)
    for col in new_columns:
        print(col, target_correlation(col))

    if not remove:
        return train_df, test_df

    print('#'*10)
    print('Best options:')
    best_fe_columns = []
    for main_col in i_cols:
        best_option, best_cof = '', 0
        for col in (candidate for candidate in new_columns if main_col in candidate):
            # Magnitude of the correlation; a NaN never beats the current best.
            cor_cof = (target_correlation(col)**2)**0.5
            if cor_cof > best_cof:
                best_cof, best_option = cor_cof, col

        print(main_col, best_option, best_cof)
        best_fe_columns.append(best_option)

    for col in new_columns:
        if col not in best_fe_columns:
            del train_df[col], test_df[col]

    return train_df, test_df

In [0]:
def fill_pairs(train, test, pairs):
    """Fill a dependent column wherever its key column determines it unambiguously.

    For each ``(key, target)`` pair and each frame separately: whenever all
    non-missing ``target`` values sharing one ``key`` value are identical,
    every row with that key (including rows where ``target`` is NaN) receives
    that value. NaN counts are printed before and after.

    :train, :test frames holding the pair columns; mutated in place   # type: pd.DataFrame
    :pairs iterable of (key column, target column) tuples
    :return (train, test)
    """
    def report_nans(target):
        print(f'In train{[target]} there are {train[target].isna().sum()} NaNs' )
        print(f'In test{[target]} there are {test[target].isna().sum()} NaNs' )

    def single_valued_keys(frame, key, target):
        # Keys whose rows carry exactly one distinct non-missing target value.
        keys = frame[key].unique()
        distinct = [frame[target][frame[key] == value].value_counts().shape[0] for value in keys]
        counts = pd.Series(data=distinct, index=keys)
        return counts[counts == 1].index

    def fill(frame, key, target, keys):
        for value in keys:
            rows = frame[key] == value
            frame.loc[rows, target] = frame.loc[rows, target].value_counts().index[0]

    for pair in pairs:
        key, target = pair[0], pair[1]

        print(f'Pair: {pair}')
        report_nans(target)

        train_keys = single_valued_keys(train, key, target)
        test_keys = single_valued_keys(test, key, target)

        print('Filling train...')
        fill(train, key, target, train_keys)

        print('Filling test...')
        fill(test, key, target, test_keys)

        report_nans(target)

    return train, test

In [0]:
from sklearn.decomposition import PCA 

def pca_made_cols(train, test, pair):
    """Replace a run of V-columns by the principal components covering 98% of their variance.

    :train, :test frames holding the columns ``V<pair[0]>`` .. ``V<pair[1]>``.
                  The V-columns are dropped from the passed frames in place;
                  use the returned frames afterwards.           # type: pd.DataFrame
    :pair inclusive (first, last) V-column numbers              # type: tuple of int
    :return (train, test) with the group replaced by columns named
            ``V_<first>_<last>_<i>``; the inputs come back unchanged when the
            reduction would not need fewer components than there are columns

    The PCA is fitted on the rows of train+test that have no missing value in
    the group. Rows with a missing value get NaN in every component column.
    """
    group = ['V' + str(num) for num in range(pair[0], pair[1] + 1)]
    # Fit on the frames that were passed in, never on notebook-level globals.
    pca_df = pd.concat([train, test])[group].dropna()
    pca = PCA().fit(pca_df)

    # Number of leading components whose cumulative explained variance is still <= 98%.
    sums = np.cumsum(pca.explained_variance_ratio_)
    num = len([x for x in sums if x <= 0.98])

    if len(group) > num:
        sklearn_pca = PCA(n_components=num + 1)
        sklearn_pca.fit(pca_df[group])
        print(pair, num + 1)

        name_dict = {i: 'V_' + str(pair[0]) + '_' + str(pair[1]) + '_' + str(i)
                     for i in range(num + 1)}

        def project(frame):
            # Only complete rows can be transformed; the outer concat leaves the rest NaN.
            complete_rows = frame[group].dropna()
            if complete_rows.empty:
                # transform() rejects zero samples, so build the all-NaN columns directly.
                components = pd.DataFrame(index=complete_rows.index,
                                          columns=list(name_dict.values()), dtype=float)
            else:
                components = pd.DataFrame(sklearn_pca.transform(complete_rows),
                                          index=complete_rows.index).rename(name_dict, axis=1)
            frame.drop(group, axis=1, inplace=True)
            return pd.concat([frame, components], join='outer', axis=1)

        train = project(train)
        test = project(test)

    return train, test

# Inclusive (first, last) V-column number ranges; pca_made_cols compresses one range per call.
pairs = [(1,11), (12, 34), (35, 52), (53, 74), (75, 94), (95, 137), (138, 166), (167, 216), (217, 278), (279, 321), (322, 339)]

# Disabled: per-range PCA replacement of the V-columns (requires train_df / test_df to be loaded first).
#for pair in pairs:
#    train_df, test_df = pca_made_cols(train_df, test_df, pair)

----

In [0]:
########################### DATA LOAD
#################################################################################
print('Load Data')
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
# Transaction tables.
train_df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ieee/train_transaction.pkl')
test_df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ieee/test_transaction.pkl')
# Identity tables; joined onto the transaction tables by TransactionID in a later cell.
train_identity = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ieee/train_identity.pkl')
test_identity = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ieee/test_identity.pkl')

# Names of all raw columns (transaction + identity) before any feature engineering;
# used at the end to decide which original columns are dropped.
base_columns = list(train_df.columns) + list(train_identity.columns)

print('Shape control:', train_df.shape, test_df.shape)

Load Data
Shape control: (590540, 123) (506691, 123)


In [0]:
# Columns that must never be used as model features; later cells append to this list.
remove_features = [
    'TransactionID','TransactionDT', # These columns are pure noise right now
    TARGET,
    ]

# Drop the excluded names from the list of raw columns.
base_columns = [col for col in list(base_columns) if col not in remove_features]

In [0]:
########################### Device info and identity
# Split the free-text device / OS / browser strings into a letters-only "device" part
# and a digits-only "version" part. Missing values become the literal 'unknown_device'.
for df in [train_identity, test_identity]:
    ########################### Device info
    df['DeviceInfo'] = df['DeviceInfo'].fillna('unknown_device').str.lower()
    df['DeviceInfo_device'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['DeviceInfo_version'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Device info 2
    df['id_30'] = df['id_30'].fillna('unknown_device').str.lower()
    df['id_30_device'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['id_30_version'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Browser
    df['id_31'] = df['id_31'].fillna('unknown_device').str.lower()
    df['id_31_device'] = df['id_31'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    
########################### Merge Identity columns
# Left-join the identity columns onto the TransactionID column alone, then glue the result
# next to the transaction frame.
# NOTE(review): the axis=1 concat aligns on the index, so this assumes train_df / test_df
# carry a default 0..n-1 index and TransactionID is unique in the identity tables — confirm.
temp_df = train_df[['TransactionID']]
temp_df = temp_df.merge(train_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
train_df = pd.concat([train_df,temp_df], axis=1)
    
temp_df = test_df[['TransactionID']]
temp_df = temp_df.merge(test_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
test_df = pd.concat([test_df,temp_df], axis=1)

In [0]:
########################### TransactionDT
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())


def _week_of_year(dt_series):
    """Return the ISO week number of a datetime Series on both old and new pandas.

    ``Series.dt.weekofyear`` is gone in pandas 2.x; ``Series.dt.isocalendar().week``
    replaces it where available. The result is cast to int64 so later arithmetic
    and the int8 cast behave the same on either code path.
    """
    if hasattr(dt_series.dt, 'isocalendar'):
        return dt_series.dt.isocalendar().week.astype(np.int64)
    return dt_series.dt.weekofyear


# Let's add temporary "time variables" for aggregations
# and add normal "time variables"
for df in [train_df, test_df]:

    # Temporary variables for aggregation
    # TransactionDT is taken as a second offset from START_DATE; the conversion is
    # vectorised instead of building one timedelta per row with .apply.
    df['DT'] = START_DATE + pd.to_timedelta(df['TransactionDT'], unit='s')
    # Month / week / day counters that keep increasing across the year boundary.
    df['DT_M'] = ((df['DT'].dt.year-2017)*12 + df['DT'].dt.month).astype(np.int8)
    df['DT_W'] = ((df['DT'].dt.year-2017)*52 + _week_of_year(df['DT'])).astype(np.int8)
    df['DT_D'] = ((df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear).astype(np.int16)

    df['DT_hour'] = (df['DT'].dt.hour).astype(np.int8)
    df['DT_day_week'] = (df['DT'].dt.dayofweek).astype(np.int8)
    df['DT_day_month'] = (df['DT'].dt.day).astype(np.int8)

# Remove temporary features from final list
remove_features += ['DT','DT_M','DT_W','DT_D','DT_hour','DT_day_week','DT_day_month']

categorical_features = []

# Total transactions per timeblock (train and test counted together)
for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()

    train_df[col + '_total'] = train_df[col].map(fq_encode)
    test_df[col + '_total'] = test_df[col].map(fq_encode)

    # We can't use it as solo feature
    remove_features.append(col + '_total')

In [0]:
########################### Card columns "outliers"
# card1: set to NaN every value that does not occur in both train and test,
# then every value that occurs at most twice in train and test combined.
for col in ['card1']: 
    valid_card = pd.concat([train_df[[col]], test_df[[col]]])
    valid_card = valid_card[col].value_counts()
    # NOTE(review): valid_card_std is computed but not used anywhere in the visible code.
    valid_card_std = valid_card.values.std()

    invalid_cards = valid_card[valid_card<=2]
    print('Rare cards', len(invalid_cards))

    # Values seen more than twice are the only ones allowed to survive.
    valid_card = valid_card[valid_card>2]
    valid_card = list(valid_card.index)

    print('No intersection in Train', len(train_df[~train_df[col].isin(test_df[col])]))
    print('Intersection in Train', len(train_df[train_df[col].isin(test_df[col])]))
    
    # Train is filtered against test first; test is then filtered against the already-filtered train.
    train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)

    train_df[col] = np.where(train_df[col].isin(valid_card), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(valid_card), test_df[col], np.nan)
    print('#'*20)

# Remaining card columns: only the train/test intersection filter, no rarity filter.
for col in ['card2','card3','card4','card5','card6',]: 
    print('No intersection in Train', col, len(train_df[~train_df[col].isin(test_df[col])]))
    print('Intersection in Train', col, len(train_df[train_df[col].isin(test_df[col])]))
    
    train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)
    print('#'*20)

Rare cards 5993
No intersection in Train 10396
Intersection in Train 580144
####################
No intersection in Train card2 5012
Intersection in Train card2 585528
####################
No intersection in Train card3 47
Intersection in Train card3 590493
####################
No intersection in Train card4 0
Intersection in Train card4 590540
####################
No intersection in Train card5 7279
Intersection in Train card5 583261
####################
No intersection in Train card6 30
Intersection in Train card6 590510
####################


In [0]:
# Externally prepared per-transaction labels, both indexed by TransactionID.
# NOTE(review): the paths are relative to the working directory and the files are not
# produced by this notebook — they must be uploaded before this cell runs.
groups = pd.read_csv('groups (3).csv', index_col='TransactionID')
user_id = pd.read_csv('UserID (1).csv', index_col='TransactionID')

# Left joins keep every transaction; rows without a match get NaN in the new columns.
train_df = train_df.merge(groups, on='TransactionID', how='left')
test_df = test_df.merge(groups, on='TransactionID', how='left')


train_df = train_df.merge(user_id, on='TransactionID', how='left')
test_df = test_df.merge(user_id, on='TransactionID', how='left')

In [0]:
# Preview the first rows after the identity, time-block and group/user-id merges.
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_33_0,id_33_1,DeviceInfo_device,DeviceInfo_version,id_30_device,id_30_version,id_31_device,DT,DT_M,DT_W,DT_D,DT_hour,DT_day_week,DT_day_month,DT_M_total,DT_W_total,DT_D_total,groups,UserID_proxy
0,2987000,0,86400,68.5,800657,13926.0,327.0,150.0,9524.0,142.0,267648.0,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:00:00,12,48,335,0,4,1,137321,12093,5122,group0,
1,2987001,0,86401,29.0,800657,2755.0,404.0,150.0,347386.0,102.0,267648.0,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:00:01,12,48,335,0,4,1,137321,12093,5122,group1,
2,2987002,0,86469,59.0,800657,4663.0,490.0,150.0,719649.0,166.0,824959.0,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:01:09,12,48,335,0,4,1,137321,12093,5122,group2,
3,2987003,0,86499,50.0,800657,18132.0,567.0,150.0,347386.0,117.0,824959.0,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:01:39,12,48,335,0,4,1,137321,12093,5122,group9,
4,2987004,0,86506,50.0,62397,4497.0,514.0,150.0,347386.0,102.0,267648.0,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,...,144.0,,,,,,,,2.0,0.0,android 7.0,samsung browser 6.2,32.0,267.0,2.0,1.0,0.0,1.0,1.0,mobile,samsung sm-g892a build/nrd90m,2220.0,1080.0,samsungsmgabuildnrdm,89290.0,android,70.0,samsungbrowser,2017-12-01 00:01:46,12,48,335,0,4,1,137321,12093,5122,group14,Hgroup8511


In [0]:
# uid_DT: (D1 - DT_D) paired with card1.
# Presumably D1 counts days since a fixed per-account event, which would make the
# difference constant for one account — TODO confirm.
train_df['uid_DT'] = (train_df['D1'] - train_df['DT_D']).astype(str) + '__' + train_df['card1'].astype(str)
test_df['uid_DT'] = (test_df['D1'] - test_df['DT_D']).astype(str) + '__' + test_df['card1'].astype(str)

# uid5: uid_DT refined by the transaction amount.
train_df['uid5'] = train_df['uid_DT'].astype(str)+'_'+train_df['TransactionAmt'].astype(str)
test_df['uid5'] = test_df['uid_DT'].astype(str)+'_'+test_df['TransactionAmt'].astype(str)

remove_features += ['D1']

# Add values remove list
# (disabled below, so the uid columns themselves stay in the feature set)
new_columns = ['uid_DT', 'uid5', 'groups',	'UserID_proxy']
#remove_features += new_columns

print('#'*10)
print('Most common uIds:')
for col in new_columns:
    print('#'*10, col)
    print(train_df[col].value_counts()[:10])

# Do Global frequency encoding 
# (the encoding call is commented out; the columns are only registered as categorical)
i_cols = ['card1','card2','card3','card5', 'card4', 'card6'] + new_columns
#train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=False)

# NOTE: += makes this cell non-idempotent; re-running it appends the names again.
categorical_features += i_cols

##########
Most common uIds:
########## uid_DT
-463.0__15775.0    1414
-249.0__9500.0      482
-426.0__7919.0      439
-335.0__7919.0      410
-368.0__7919.0      333
-397.0__7919.0      325
-398.0__7919.0      286
-458.0__7919.0      285
-369.0__7919.0      252
157.0__12616.0      242
Name: uid_DT, dtype: int64
########## uid5
-463.0__15775.0_106.0    550
-463.0__15775.0_110.0    263
-463.0__15775.0_125.0    221
-175.0__8528.0_59.0      184
-463.0__15775.0_75.0     122
-463.0__15775.0_105.0    106
-358.0__6019.0_100.0      63
-495.0__16136.0_0.878     57
-463.0__15775.0_55.0      55
-242.0__9002.0_15.0       51
Name: uid5, dtype: int64
########## groups
group267171    1413
group670        466
group210688     344
group259665     241
group17323      231
group28599      215
group1759       194
group120497     179
group9057       173
group214512     145
Name: groups, dtype: int64
########## UserID_proxy
Sgroup33      1460
group238       577
group73        275
Hgroup143      207
group72   

In [0]:
# Clip Values
train_df['TransactionAmt'] = train_df['TransactionAmt'].clip(0,5000)
test_df['TransactionAmt']  = test_df['TransactionAmt'].clip(0,5000)

# Flag whether the (clipped) amount also occurs in the other data set (a frequency
# encoding could be used instead). The flag tells the model how far to trust the value.
train_df['TransactionAmt_check'] = np.where(train_df['TransactionAmt'].isin(test_df['TransactionAmt']), 1, 0)
test_df['TransactionAmt_check']  = np.where(test_df['TransactionAmt'].isin(train_df['TransactionAmt']), 1, 0)

# For our model the raw TransactionAmt is noise
# https://www.kaggle.com/kyakovlev/ieee-check-noise
# (even if feature importances suggest otherwise).
# There are many unique values and the model doesn't generalize well,
# so let's add some aggregations.
i_cols = ['TransactionAmt']
uids = ['card1','card2','card3','card5', 'uid_DT', 'groups',	'UserID_proxy']
aggregations = ['mean']

# uIDs aggregations: mean amount per uid value, computed over train and test together
train_df, test_df = uid_aggregation(train_df, test_df, i_cols, uids, aggregations)
 
# # TransactionAmt Normalization
# values_normalization mutates the frame it receives, so the rebinding of `df` is incidental.
# Statistics are computed separately for train and test.
periods = ['DT_D','DT_W','DT_M']
for df in [train_df, test_df]:
    df = values_normalization(df, periods, i_cols)

# Product type: product code combined with the clipped amount
train_df['product_type'] = train_df['ProductCD'].astype(str)+'_'+train_df['TransactionAmt'].astype(str)
test_df['product_type'] = test_df['ProductCD'].astype(str)+'_'+test_df['TransactionAmt'].astype(str)

# Share of each product_type within its day / week / month
# (only_proportions=True stores count / <period>_total in the new column).
i_cols = ['product_type']
periods = ['DT_D','DT_W','DT_M']
train_df, test_df = timeblock_frequency_encoding(train_df, test_df, periods, i_cols, 
                                                 with_proportions=False, only_proportions=True)
#train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=False)

categorical_features += i_cols

In [0]:
# D2 .. D15 (D1 was already consumed by uid_DT and added to remove_features).
i_cols = ['D'+str(i) for i in range(2,16)]
uids = ['uid_DT', 'groups',	'UserID_proxy']
aggregations = ['mean', 'std']

####### uIDs aggregations
# Runs before the clipping below, so the statistics still include negative values.
train_df, test_df = uid_aggregation(train_df, test_df, i_cols, uids, aggregations)

####### Cleaning Negative values and columns transformations
for df in [train_df, test_df]:

    for col in i_cols:
        df[col] = df[col].clip(0) 
    
    # Let's transform the D8 and D9 columns,
    # as we are almost sure they are connected with hours
    df['D9_not_na'] = np.where(df['D9'].isna(),0,1)
    df['D8_not_same_day'] = np.where(df['D8']>=1,1,0)
    # Fractional part of D8 (NaN treated as 0) ...
    df['D8_D9_decimal_dist'] = df['D8'].fillna(0)-df['D8'].fillna(0).astype(int)
    # ... and its absolute distance to D9; NaN wherever D9 is missing.
    df['D8_D9_decimal_dist'] = ((df['D8_D9_decimal_dist']-df['D9'])**2)**0.5
    # Keep only the integer part of D8, with -1 standing for "missing".
    df['D8'] = df['D8'].fillna(-1).astype(int)

# ####### Values Normalization
# D2 and D9 are excluded from the per-period normalization.
i_cols.remove('D2')
i_cols.remove('D9')
periods = ['DT_D','DT_W','DT_M']
for df in [train_df, test_df]:
    df = values_normalization(df, periods, i_cols)

# NOTE(review): each frame is divided by its own maximum, so train and test use
# different denominators — confirm this is intended.
for col in ['D2']:
    for df in [train_df, test_df]:
        df[col + '_scaled'] = df[col] / df[col].max()

# i_cols = ['D'+str(i) for i in range(1,16)]

# train_df.drop(i_cols, axis=1, inplace=True)
# test_df.drop(i_cols, axis=1, inplace=True)

In [0]:
# tonantheadholddict = {'C1':2000,'C2':2000,'C4':1000,'C6':1000,'C8':800,'C9':240,
# 'C14':240,'C10':520,'C11':520,'C12':520,'C13':1000}

# for col,thod in tonantheadholddict.items():
#     train_df.loc[train_df[col]>= thod, col] = np.nan
#     test_df.loc[test_df[col]>= thod, col] = np.nan

########################### C Columns
# C1 .. C14
i_cols = ['C'+str(i) for i in range(1,15)]
uids = ['uid_DT', 'groups',	'UserID_proxy']
aggregations = ['mean', 'std']

####### uIDs aggregations
train_df, test_df = uid_aggregation(train_df, test_df, i_cols, uids, aggregations)

categorical_features += i_cols

####### Clip max values
# Both frames are capped at the maximum seen in the last month (highest DT_M) of train.
# The cap is recomputed from train_df for each frame; clipping train first does not
# change it, because the last month's maximum is itself the cap.
for df in [train_df, test_df]:
    for col in i_cols:
        max_value = train_df[train_df['DT_M']==train_df['DT_M'].max()][col].max()
        df[col] = df[col].clip(None,max_value) 

In [0]:
# Pairwise interactions, named "<first column>__<second column>".
i_cols = ['id_02__id_20', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1',
                'M2__M3', 'D8__D9', 'D11__DeviceInfo', 'id_02__D8', 'id_19__id_20']

# Some arbitrary features interaction
for feature in i_cols:

    # Concatenate the string forms of both columns (NaN becomes the text 'nan') ...
    f1, f2 = feature.split('__')
    train_df[feature] = train_df[f1].astype(str) + '_' + train_df[f2].astype(str)
    test_df[feature] = test_df[f1].astype(str) + '_' + test_df[f2].astype(str)

    # ... and label-encode with an encoder fitted on train and test together,
    # so every combination found in either frame has a code.
    le = LabelEncoder()
    le.fit(list(train_df[feature].astype(str).values) + list(test_df[feature].astype(str).values))
    train_df[feature] = le.transform(list(train_df[feature].astype(str).values))
    test_df[feature] = le.transform(list(test_df[feature].astype(str).values))
    
#train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=True)
categorical_features += i_cols

In [0]:
# Device / OS / browser / resolution columns to be treated as categorical.
i_cols = [
          'DeviceInfo','DeviceInfo_device','DeviceInfo_version',
          'id_30','id_30_device','id_30_version',
          'id_31','id_31_device',
          'id_33',
         ]

####### Global Self frequency encoding
# Currently disabled: the call below is commented out, so the original values are
# kept and the columns are only registered as categorical features.
#train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=False)
categorical_features += i_cols

In [0]:
# Per uid5: number of transactions and spread of their TransactionDT values
# (adds uid5_TransactionDT_count and uid5_TransactionDT_std).
train_df, test_df = uid_aggregation(train_df, test_df, ['TransactionDT'], ['uid5'],['count', 'std'])

In [0]:
# Spot-check one identity column after the merges.
train_df['id_19'].head()

0      NaN
1      NaN
2      NaN
3      NaN
4    542.0
Name: id_19, dtype: float16

In [0]:
# Externally prepared per-transaction group statistics.
# NOTE: this rebinds `groups`, which an earlier cell used for the group-label table.
groups = pd.read_csv('/content/drive/My Drive/newroupingfeatures_train_test.csv')

# Join key plus the statistic columns to bring in.
addfeatures = ['TransactionID', 'count_group','skew_Amt_group','unique_Amt_group','var_Amt_group','unique_adr1_group','amplitude_dist1_group',
               'unique_Pemail_group','Amtisfirst_group','Amt_timeslowest_first_group','amplitude_Time_group']

# Left joins keep every transaction; rows without a match get NaN in the new columns.
train_df = train_df.merge(groups[addfeatures], on='TransactionID', how='left')
test_df = test_df.merge(groups[addfeatures], on='TransactionID', how='left')

In [0]:
# For each UserID_proxy, count how many distinct cards, devices, browsers and addresses it uses.
i_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'DeviceInfo', 'id_31', 'id_19__id_20', 'addr1', 'addr2']
uids = ['UserID_proxy']
aggregations = ['nunique']

####### uIDs aggregations
train_df, test_df = uid_aggregation(train_df, test_df, i_cols, uids, aggregations)

In [0]:
########################### Encode Str columns
# Every column that is still of object dtype in train is label-encoded to integers.
# Missing values first become the literal 'unseen_before_label'; the encoder is
# fitted on train and test together so both frames share the same codes.
for col in list(train_df):
    if train_df[col].dtype=='O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col]  = test_df[col].fillna('unseen_before_label')
        
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)
        
        le = LabelEncoder()
        le.fit(list(train_df[col])+list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col]  = le.transform(test_df[col])

P_emaildomain
R_emaildomain
id_30
id_31
DeviceType
DeviceInfo
DeviceInfo_device
DeviceInfo_version
id_30_device
id_30_version
id_31_device
groups
UserID_proxy
uid_DT
uid5
product_type


In [0]:
# Whitelist of raw columns: any raw column (see base_columns) missing from this list is
# dropped in the feature-selection cell below.
# The V_<first>_<last>_<i> names follow the naming used by pca_made_cols; that step is
# currently commented out, so these columns are presumably absent from train_df — verify.
useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1','D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V_1_11_0', 'V_1_11_1', 'V_1_11_2', 'V_1_11_3', 'V_1_11_4', 'V_1_11_5', 
                   'V_1_11_6', 'V_12_34_0', 'V_12_34_1', 'V_12_34_2', 'V_12_34_3', 'V_12_34_4', 'V_12_34_5', 'V_12_34_6', 
                   'V_12_34_7', 'V_12_34_8', 'V_12_34_9', 'V_12_34_10', 'V_12_34_11', 'V_35_52_0', 'V_35_52_1', 'V_35_52_2', 
                   'V_35_52_3', 'V_35_52_4', 'V_35_52_5', 'V_35_52_6', 'V_35_52_7', 'V_35_52_8', 'V_53_74_0', 'V_53_74_1', 
                   'V_53_74_2', 'V_53_74_3', 'V_53_74_4', 'V_53_74_5', 'V_53_74_6', 'V_53_74_7', 'V_53_74_8', 'V_53_74_9', 
                   'V_53_74_10', 'V_53_74_11', 'V_75_94_0', 'V_75_94_1', 'V_75_94_2', 'V_75_94_3', 'V_75_94_4', 'V_75_94_5', 
                   'V_75_94_6', 'V_75_94_7', 'V_75_94_8', 'V_75_94_9', 'V_75_94_10', 'V_95_137_0', 'V_95_137_1', 'V_95_137_2', 
                   'V_138_166_0', 'V_167_216_0', 'V_167_216_1', 'V_167_216_2', 'V_217_278_0', 'V_217_278_1', 'V_217_278_2', 
                    'V_322_339_0', 'V_322_339_1', 'V_322_339_2', 'V_279_321_0', 'V_279_321_1', 'V_279_321_2', 'V_279_321_3',
                   'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
#

# useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
#                    'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
#                    'C14', 'D1','D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'M2', 'M3',
#                    'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V_1_11_0', 'V_1_11_1', 'V_1_11_2', 'V_1_11_3', 'V_1_11_4', 'V_1_11_5', 
#                    'V_1_11_6', 'V_12_34_0', 'V_12_34_1', 'V_12_34_2', 'V_12_34_3', 'V_12_34_4', 'V_12_34_5', 'V_12_34_6', 
#                    'V_12_34_7', 'V_12_34_8', 'V_12_34_9', 'V_12_34_10', 'V_12_34_11', 'V_35_52_0', 'V_35_52_1', 'V_35_52_2', 
#                    'V_35_52_3', 'V_35_52_4', 'V_35_52_5', 'V_35_52_6', 'V_35_52_7', 'V_35_52_8', 'V_53_74_0', 'V_53_74_1', 
#                    'V_53_74_2', 'V_53_74_3', 'V_53_74_4', 'V_53_74_5', 'V_53_74_6', 'V_53_74_7', 'V_53_74_8', 'V_53_74_9', 
#                    'V_53_74_10', 'V_53_74_11', 'V_75_94_0', 'V_75_94_1', 'V_75_94_2', 'V_75_94_3', 'V_75_94_4', 'V_75_94_5', 
#                    'V_75_94_6', 'V_75_94_7', 'V_75_94_8', 'V_75_94_9', 'V_75_94_10', 'V_95_137_0', 'V_95_137_1', 'V_95_137_2', 
#                    'V_138_166_0', 'V_167_216_0', 'V_167_216_1', 'V_167_216_2', 'V_217_278_0', 'V_217_278_1', 'V_217_278_2', 
#                    'V_322_339_0', 'V_322_339_1', 'V_322_339_2' 
#                    'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
#                    'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_32', 'id_33',
#                    'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [0]:
# Raw columns that are categorical by nature: product code, card1..card6,
# addr1..addr2, the e-mail domains and the device descriptors.
categorical_features += (
    ['ProductCD']
    + [f"card{i}" for i in range(1, 7)]
    + [f"addr{i}" for i in range(1, 3)]
    + ["P_emaildomain", "R_emaildomain"]
    + ["DeviceType", "DeviceInfo"]
)

In [0]:
# Final feature list, built with order-preserving operations only.
# Going through set() would make the column order depend on string hashing, which
# varies between kernel sessions and so changes the column order of the saved matrices.

# Candidates: every train column that is not explicitly excluded, in train_df column order.
features_columns = [col for col in dict.fromkeys(train_df) if col not in remove_features]

# Raw columns that are not whitelisted and still exist in train_df.
train_columns = set(train_df.columns)
bad_features = list(dict.fromkeys(
    col for col in base_columns
    if col not in useful_features and col in train_columns
))

bad_lookup = set(bad_features)
features_columns = [col for col in features_columns if col not in bad_lookup]


# De-duplicated categorical names, restricted to the surviving features (first-seen order).
feature_lookup = set(features_columns)
categorical_features = [col for col in dict.fromkeys(categorical_features) if col in feature_lookup]

In [0]:
# Training matrix, label and month index, each taken from train_df ordered by TransactionDT.
# Missing values are replaced by the sentinel -10000.
X = train_df.sort_values('TransactionDT')[features_columns].fillna(-10000)
y = train_df.sort_values('TransactionDT')['isFraud']
dt_m = train_df.sort_values('TransactionDT')['DT_M']

# NOTE(review): the test matrix keeps test_df's existing row order (it is not sorted).
X_test = test_df[features_columns].fillna(-10000)

In [0]:
#print(categorical_features)

#categorical_features = ['id_02__id_20', 'card2', 'C13', 'id_30_device', 'C8', 'card3', 'id_30_version', 'C2', 'card5__P_emaildomain', 'card6', 'DeviceInfo', 'id_31', 'id_31_device', 'uid_DT', 'C6', 'C1', 'addr2', 'ProductCD', 'card4', 'D8__D9', 'P_emaildomain', 'R_emaildomain', 'C10', 'product_type', 'addr1__card1', 'card5', 'M2__M3', 'id_02__D8', 'id_30', 'addr1', 'uid5', 'C9', 'C12', 'C7', 'C5', 'DeviceInfo__P_emaildomain', 'groups', 'card1', 'card1__card5', 'C11', 'DeviceInfo_version', 'DeviceType', 'id_33', 'D11__DeviceInfo', 'card2__dist1', 'C14', 'card2__id_20', 'C4', 'P_emaildomain__C2', 'DeviceInfo_device']

In [0]:
#categorical_features = [feat for feat in categorical_features if feat not in ['id_31_device', 'addr1__card1', 'id_02__D8']]

In [0]:
# Display (rows, columns) of the train and test matrices side by side.
(X.shape, X_test.shape)

((590540, 464), (506691, 464))

In [0]:
# Run the memory reducer over both matrices: train first, then test.
X, X_test = [reduce_mem_usage(frame) for frame in (X, X_test)]

Mem. usage decreased to 799.16 Mb (57.3% reduction)
Mem. usage decreased to 686.65 Mb (57.2% reduction)


In [0]:
# Persist the prepared matrices, target and fold groups to the working directory.
for _frame, _target in ((X, 'X.pkl'), (y, 'y.pkl'), (X_test, 'X_test.pkl'), (dt_m, 'dt_m.pkl')):
    _frame.to_pickle(_target)
del _frame, _target  # do not leave loop temporaries in the notebook namespace

In [0]:
# train = train_df.sort_values('TransactionDT')[['TransactionID', 'isFraud', 'DT_M'] + features_columns].fillna(-10000)
# test = test_df.sort_values('TransactionDT')[['TransactionID'] + features_columns].fillna(-10000)

# train = reduce_mem_usage(train)
# test = reduce_mem_usage(test)

# train.to_csv('train.csv', index=False)
# test.to_csv('test.csv', index=False)

In [0]:
#del train_df, test_df, X_test, train_identity, test_identity, X, y
#gc.collect()

In [0]:
!pip install catboost



In [0]:
from catboost import CatBoostClassifier

# Reload the train matrix, target, test matrix and fold groups from the pickle
# files written earlier in this notebook (X.pkl, y.pkl, X_test.pkl, dt_m.pkl).
# NOTE(review): pickle executes code on load; only read files this notebook produced.
X = pd.read_pickle('X.pkl')
y = pd.read_pickle('y.pkl')
X_test = pd.read_pickle('X_test.pkl')
dt_m = pd.read_pickle('dt_m.pkl')

In [0]:
# Keyword arguments forwarded to CatBoostClassifier(**cat_params).
# NOTE(review): od_wait is set here while the fit() call in the training cell
# also passes early_stopping_rounds; confirm which setting is intended.
cat_params = dict(
    n_estimators=2500,
    learning_rate=0.07,
    eval_metric='AUC',
    loss_function='Logloss',
    random_seed=42,
    metric_period=500,
    od_wait=500,
    task_type='GPU',
    depth=8,
)

In [0]:
%%time

NFOLDS = 5
folds = GroupKFold(n_splits=NFOLDS)

#neptune.create_experiment(params=params)

columns = X.columns
splits = folds.split(X, y, groups=dt_m)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    # Run model for this fold
    
    estimator = CatBoostClassifier(**cat_params)        
    estimator.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        cat_features=categorical_features,
        use_best_model=False,
        early_stopping_rounds = 5000,
        verbose=True)
    
    y_pred_valid = estimator.predict_proba(X_valid)[:,1]
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += estimator.predict_proba(X_test)[:,1] / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")
#neptune.send_metric('local cv', score)
#neptune.send_metric('oof score', roc_auc_score(y, y_oof))

* 9458

In [0]:
#neptune.stop()

In [0]:
import os

# Write the test predictions and the out-of-fold train predictions into a
# directory named after the model and its CV score (score * 10000, truncated).
name = 'g_catboost' + '_' + str(int(score * 10000))
path = name

# exist_ok=True: re-running this cell with the same score must not crash with
# FileExistsError on the directory created by the previous run.
os.makedirs(path, exist_ok=True)

sample_submission = pd.read_csv('sample_submission.csv', index_col='TransactionID').reset_index()
# Predictions are assigned positionally.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test_df; confirm.
sample_submission["isFraud"] = y_preds
sample_submission.to_csv(name + '/test_' + name +'.csv', index=False)

# y_oof follows the TransactionDT-sorted row order used to build X, so the
# TransactionID column is taken from train_df sorted the same way.
train_oof = pd.DataFrame()
train_oof['TransactionID'] = train_df.sort_values('TransactionDT')['TransactionID']
train_oof['isFraud'] = y_oof
train_oof.to_csv(name + '/train_' + name + '.csv', index=False)

In [0]:
# Preview the first five out-of-fold rows.
train_oof.head(n=5)