In [None]:
# !pip3 install modin 
# !pip3 install --upgrade pandas

In [2]:
import os
import gc
import sys

import pandas as pd
# import modin.pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [3]:
from sklearn import preprocessing

In [4]:
# Widen pandas' display limits so wide frames print on one row block
# with up to 500 columns and 500 rows shown.
for option_name, option_value in (
    ('display.expand_frame_repr', False),
    ('display.max_columns', 500),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

In [9]:
# Load the sample submission table; in the cells shown here only its shape is inspected.
sample_submission = pd.read_csv('../../../Data/sample_submission.csv')

In [11]:
# Read the raw transaction and identity tables for the train and test splits,
# in the same order as the names on the left-hand side.
train_tr, train_id, test_tr, test_id = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../Data/train_transaction.csv',
        '../../../Data/train_identity.csv',
        '../../../Data/test_transaction.csv',
        '../../../Data/test_identity.csv',
    )
)

In [12]:
# Attach the identity columns to every transaction row. A left join keeps
# transactions that have no matching identity record (their identity columns are NaN).
train = pd.merge(train_tr, train_id, how='left', on='TransactionID')
test = pd.merge(test_tr, test_id, how='left', on='TransactionID')
# del train_tr, train_id, test_tr, test_id   (left disabled: the next cell still prints these frames' shapes)

In [13]:
# Report the shape of the merged frames first, then of each raw input table.
for label, frame in (('train.shape: ', train), ('test.shape: ', test)):
    print(label, frame.shape)
print('----------------')
for label, frame in (
    ('train_tr.shape: ', train_tr),
    ('train_id.shape: ', train_id),
    ('test_tr.shape: ', test_tr),
    ('test_id.shape: ', test_id),
    ('sample_submission.shape: ', sample_submission),
):
    print(label, frame.shape)


train.shape:  (590540, 434)
test.shape:  (506691, 433)
----------------
train_tr.shape:  (590540, 394)
train_id.shape:  (144233, 41)
test_tr.shape:  (506691, 393)
test_id.shape:  (141907, 41)
sample_submission.shape:  (506691, 2)


In [14]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column (object, bool, int8, unsigned, ...) is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Mutated in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The target dtype is chosen from the column's min/max only. For float columns
    this means precision is not checked: float16 keeps roughly three significant
    decimal digits, so values within range may still be rounded by the downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the dtype's own min/max are representable,
                # so a column spanning exactly [-128, 127] fits in int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If min/max are NaN (e.g. an empty column) nothing matches and the
            # column keeps its dtype.
        else:
            # Fall back to float64 when no narrower type covers the range
            # (this also covers all-NaN columns, whose comparisons are all False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [15]:
# Downcast the numeric columns of both frames (train first, then test),
# then print the resulting shapes.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)
for label, frame in (('training set shape: ', train), ('test set shape: ', test)):
    print(label, frame.shape)

Mem. usage decreased to 650.48 Mb (66.8% reduction)
Mem. usage decreased to 565.37 Mb (66.3% reduction)
training set shape:  (590540, 434)
test set shape:  (506691, 433)


# 전처리
### 맡은 부분: Transaction 앞부분 (~M까지)

In [16]:
# Frequency-encode each card/address column: every row receives the number of
# times its value occurs across train and test combined (NaN is counted as its
# own value because of dropna=False).
for column in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']:
    # Build the combined counts once per column; the same table serves both frames.
    full_counts = pd.concat([train[column], test[column]], ignore_index=True).value_counts(dropna=False)
    train[column + '_count_full'] = train[column].map(full_counts)
    test[column + '_count_full'] = test[column].map(full_counts)


In [17]:
# Derive calendar-style features from TransactionDT for both frames.
# NOTE(review): the divisors (3600, 3600 * 24) imply TransactionDT is an
# elapsed-seconds counter; the reference weekday behind the "- 1" day shift is
# not visible here — confirm.
for frame in (train, test):
    elapsed = frame['TransactionDT']
    # Day index shifted by one day, wrapped to a 7-day cycle, then floored.
    frame['Transaction_day_of_week'] = np.floor((elapsed / (3600 * 24) - 1) % 7)
    # Whole hours elapsed, wrapped to a 24-hour cycle.
    frame['Transaction_hour_of_day'] = np.floor(elapsed / 3600) % 24

In [18]:
# Ratio of each transaction amount to a per-group statistic of TransactionAmt.
# Each entry is (new column, grouping column, statistic). The statistics are
# computed separately inside each frame (train uses train groups, test uses test groups).
amount_ratio_specs = (
    ('TransactionAmt_to_mean_card1', 'card1', 'mean'),
    ('TransactionAmt_to_mean_card4', 'card4', 'mean'),
    ('TransactionAmt_to_std_card1', 'card1', 'std'),
    ('TransactionAmt_to_std_card4', 'card4', 'std'),
)
for frame in (train, test):
    for feature_name, group_key, statistic in amount_ratio_specs:
        group_stat = frame.groupby([group_key])['TransactionAmt'].transform(statistic)
        frame[feature_name] = frame['TransactionAmt'] / group_stat

In [19]:
# Preview of the combined e-mail domain feature.
# A row-wise `x[0] if x[0] == np.nan else x[1]` over (R_emaildomain, P_emaildomain)
# always yields P_emaildomain: NaN never compares equal to anything (itself
# included), so the condition is False for every row. The column is therefore
# read directly, which gives the same values without a Python-level apply.
# NOTE(review): if the intent was "R_emaildomain, falling back to P_emaildomain",
# use train['R_emaildomain'].fillna(train['P_emaildomain']) instead — confirm.
train['P_emaildomain'].rename(None).head()

0            NaN
1      gmail.com
2    outlook.com
3      yahoo.com
4      gmail.com
dtype: object

In [20]:
# Build the single `emaildomain` feature for both frames.
# A row-wise `x[0] if x[0] == np.nan else x[1]` over (R_emaildomain, P_emaildomain)
# always yields P_emaildomain: `== np.nan` is False for every value, NaN included.
# Copying the column directly gives the same result without iterating over
# every row in Python.
# NOTE(review): if a real coalesce was intended (R_emaildomain, falling back to
# P_emaildomain), replace the right-hand sides with
# frame['R_emaildomain'].fillna(frame['P_emaildomain']) — confirm before changing,
# since that alters the exported feature.
train['emaildomain'] = train['P_emaildomain']
test['emaildomain'] = test['P_emaildomain']

In [21]:
# The two source e-mail columns are no longer needed once `emaildomain` exists.
email_source_cols = ['R_emaildomain', 'P_emaildomain']
train = train.drop(email_source_cols, axis=1)
test = test.drop(email_source_cols, axis=1)

In [22]:
# Show train's column labels after the feature-engineering cells above.
train.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'card6_count_full', 'addr1_count_full', 'addr2_count_full',
       'Transaction_day_of_week', 'Transaction_hour_of_day',
       'TransactionAmt_to_mean_card1', 'TransactionAmt_to_mean_card4',
       'TransactionAmt_to_std_card1', 'TransactionAmt_to_std_card4',
       'emaildomain'],
      dtype='object', length=447)

In [23]:
def print_columns(df):
    """Print every column label of ``df`` as one line of quoted, comma-separated names.

    Each label is rendered as ``"<label>", `` (the trailing comma and space
    are kept after the last label too). A second line prints the length of that
    rendered line in characters — note this is NOT the number of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are listed.
    """
    quoted_labels = ['"%s", ' % str(label) for label in df.columns]
    rendered = ''.join(quoted_labels)
    print(rendered)
    print(len(rendered))

In [24]:
# List every train column as one quoted, comma-separated line, followed by that line's character count.
print_columns(train)

"TransactionID", "isFraud", "TransactionDT", "TransactionAmt", "ProductCD", "card1", "card2", "card3", "card4", "card5", "card6", "addr1", "addr2", "dist1", "dist2", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14", "D15", "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9", "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "V29", "V30", "V31", "V32", "V33", "V34", "V35", "V36", "V37", "V38", "V39", "V40", "V41", "V42", "V43", "V44", "V45", "V46", "V47", "V48", "V49", "V50", "V51", "V52", "V53", "V54", "V55", "V56", "V57", "V58", "V59", "V60", "V61", "V62", "V63", "V64", "V65", "V66", "V67", "V68", "V69", "V70", "V71", "V72", "V73", "V74", "V75", "V76", "V77", "V78", "V79", "V80", "V81", "V82", "V83", "V84", "V85", "V86", "V

In [25]:
# Same listing for test, for comparison with the train columns printed above.
print_columns(test)

"TransactionID", "TransactionDT", "TransactionAmt", "ProductCD", "card1", "card2", "card3", "card4", "card5", "card6", "addr1", "addr2", "dist1", "dist2", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14", "D15", "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9", "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "V29", "V30", "V31", "V32", "V33", "V34", "V35", "V36", "V37", "V38", "V39", "V40", "V41", "V42", "V43", "V44", "V45", "V46", "V47", "V48", "V49", "V50", "V51", "V52", "V53", "V54", "V55", "V56", "V57", "V58", "V59", "V60", "V61", "V62", "V63", "V64", "V65", "V66", "V67", "V68", "V69", "V70", "V71", "V72", "V73", "V74", "V75", "V76", "V77", "V78", "V79", "V80", "V81", "V82", "V83", "V84", "V85", "V86", "V87", "V88",

# 나머지 columns 모두 drop

In [26]:
def check_and_drop_column(df, column):
    """Remove ``column`` from ``df`` in place; do nothing if it is not present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : hashable
        Label of the column to remove.

    Returns
    -------
    None
    """
    if column not in df.columns:
        return
    df.drop(labels=[column], axis=1, inplace=True)

# Explicitly named columns to remove, split by who owns them.
my_cols_to_drop = ['dist1', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'D2', 'D11']
other_cols_to_drop = ['DeviceType', 'DeviceInfo']

cols_to_drop = my_cols_to_drop + other_cols_to_drop

# id_01 .. id_38 (two-digit, zero-padded) and V1 .. V339.
id_cols_to_drop = ['id_%02d' % num for num in range(1, 39)]
v_cols_to_drop = ['V%d' % num for num in range(1, 340)]

# Remove every listed column from both frames; columns that are absent are skipped.
for col in cols_to_drop + id_cols_to_drop + v_cols_to_drop:
    check_and_drop_column(train, col)
    check_and_drop_column(test, col)

In [27]:
# Columns exported from this notebook: the join key plus the engineered features.
# NOTE(review): 'isFraud' is not in this list, so the label column is removed
# from train as well — presumably it is re-joined via TransactionID elsewhere; confirm.
my_final_cols = [
    "TransactionID", "card1_count_full", "card2_count_full",
    "card3_count_full", "card4_count_full",
    "card5_count_full", "card6_count_full",
    "addr1_count_full", "addr2_count_full",
    "Transaction_day_of_week", "Transaction_hour_of_day",
    "TransactionAmt_to_mean_card1", "TransactionAmt_to_mean_card4",
    "TransactionAmt_to_std_card1", "TransactionAmt_to_std_card4", "emaildomain",
]

# Drop, in place, every column of each frame that is not in the keep-list.
for frame in (train, test):
    surplus_cols = [col for col in frame.columns if col not in my_final_cols]
    frame.drop(columns=surplus_cols, inplace=True)

## 최종 Columns

In [28]:
# List the columns that remain in train after pruning to the keep-list.
print_columns(train)

"TransactionID", "card1_count_full", "card2_count_full", "card3_count_full", "card4_count_full", "card5_count_full", "card6_count_full", "addr1_count_full", "addr2_count_full", "Transaction_day_of_week", "Transaction_hour_of_day", "TransactionAmt_to_mean_card1", "TransactionAmt_to_mean_card4", "TransactionAmt_to_std_card1", "TransactionAmt_to_std_card4", "emaildomain", 
372


In [29]:
# List the columns that remain in test after pruning to the keep-list.
print_columns(test)

"TransactionID", "card1_count_full", "card2_count_full", "card3_count_full", "card4_count_full", "card5_count_full", "card6_count_full", "addr1_count_full", "addr2_count_full", "Transaction_day_of_week", "Transaction_hour_of_day", "TransactionAmt_to_mean_card1", "TransactionAmt_to_mean_card4", "TransactionAmt_to_std_card1", "TransactionAmt_to_std_card4", "emaildomain", 
372


In [32]:
# Write the pruned test features to a CSV in the working directory, without the row index.
test.to_csv('jinoo_test.csv', index=False)

In [33]:
# Write the pruned train features to a CSV in the working directory, without the row index.
train.to_csv('jinoo_train.csv', index=False)