In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import pandas_profiling

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

import xgboost as xgb

from datetime import datetime
import lightgbm as lgb
#import multiprocessing
#import gc

In [2]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    Only columns backed by a plain numpy signed-integer or floating dtype are
    touched. Boolean, unsigned, object, categorical, datetime and pandas
    extension columns are left exactly as they are (the previous string-prefix
    check routed bools and unsigned ints into the float branch and turned them
    into float16).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object, so the caller's
        frame is modified in place.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits in
        float16 are stored as float16. float16 keeps only ~3 significant
        decimal digits, so pass False to stop at float32 when values must
        survive a round trip (e.g. monetary amounts written back to CSV).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Side effects
    ------------
    Prints the post-optimisation memory footprint and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate dtypes, narrowest first; the first one whose range contains
    # the column's [min, max] wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, string, nullable ints, ...) are not
        # numpy dtypes; comparing their min/max against numeric limits is
        # either meaningless or raises, so skip them.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            targets, limits = int_targets, np.iinfo
        elif col_type.kind == 'f':
            targets, limits = float_targets, np.finfo
        else:
            # bool ('b'), unsigned ('u'), object ('O'), datetime ('M'), ...
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for target in targets:
            bounds = limits(target)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # An all-NaN (or empty) column yields NaN here, every comparison
            # is False, and the column keeps its dtype.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage after optimization is: {:.2f} MB, {:.1f}% reduction'.\
          format(end_mem, reduction))

    return df

In [3]:
# Load the train and test transaction tables, keyed by TransactionID.
trans, test_trans = (
    pd.read_csv(csv_name, index_col='TransactionID')
    for csv_name in ('train_transaction.csv', 'test_transaction.csv')
)

In [4]:
# Load the train and test identity tables, keyed by TransactionID.
ident, test_ident = (
    pd.read_csv(csv_name, index_col='TransactionID')
    for csv_name in ('train_identity.csv', 'test_identity.csv')
)

In [5]:
# Left-join identity columns onto every transaction row via the shared index;
# transactions without an identity record keep NaN in the identity columns.
train = pd.merge(trans, ident, how='left', left_index=True, right_index=True)
test = pd.merge(test_trans, test_ident, how='left', left_index=True, right_index=True)

In [6]:
# Show (rows, columns) of the merged train and test frames, one per line.
print(train.shape, test.shape, sep='\n')

(590540, 433)
(506691, 432)


In [8]:
import os
import gc
import sys
# Add the directory two levels above the working directory to the module
# search path so that packages living there can be imported from this notebook.
sys.path.append("../..")
from tqdm import tqdm_notebook

In [10]:

# Force a garbage-collection pass; the return value shown as the cell output
# is the number of unreachable objects found.
gc.collect()

22

In [11]:
# Downcast numeric columns of both frames (train first, then test) to cut
# memory use; each call prints its own summary line.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

Memory usage after optimization is: 668.22 MB, 66.2% reduction
Memory usage after optimization is: 583.43 MB, 65.6% reduction


In [13]:
# Feature columns to keep. Every column of `train` that is not listed here is
# collected into the drop list by a later cell, and that cell also checks that
# the kept columns appear in exactly this order, so do not reorder the entries.
# NOTE(review): presumably the selection produced by an earlier RFECV run (the
# output files are named rfecv_1_*) — confirm against the source of this list.
final_cols = ['TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V2',
 'V3',
 'V4',
 'V5',
 'V7',
 'V12',
 'V13',
 'V15',
 'V19',
 'V20',
 'V23',
 'V24',
 'V25',
 'V26',
 'V29',
 'V30',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V43',
 'V44',
 'V45',
 'V47',
 'V48',
 'V49',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V66',
 'V67',
 'V69',
 'V70',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V85',
 'V86',
 'V87',
 'V90',
 'V91',
 'V93',
 'V94',
 'V96',
 'V97',
 'V98',
 'V99',
 'V100',
 'V101',
 'V102',
 'V103',
 'V105',
 'V106',
 'V109',
 'V115',
 'V124',
 'V126',
 'V127',
 'V128',
 'V129',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V139',
 'V140',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V149',
 'V150',
 'V151',
 'V152',
 'V156',
 'V158',
 'V159',
 'V160',
 'V161',
 'V162',
 'V163',
 'V164',
 'V165',
 'V166',
 'V167',
 'V168',
 'V169',
 'V170',
 'V171',
 'V172',
 'V173',
 'V174',
 'V175',
 'V178',
 'V180',
 'V184',
 'V187',
 'V188',
 'V189',
 'V192',
 'V197',
 'V198',
 'V200',
 'V201',
 'V202',
 'V203',
 'V204',
 'V205',
 'V206',
 'V207',
 'V208',
 'V209',
 'V210',
 'V211',
 'V212',
 'V213',
 'V214',
 'V215',
 'V216',
 'V217',
 'V218',
 'V219',
 'V220',
 'V221',
 'V222',
 'V223',
 'V224',
 'V225',
 'V226',
 'V228',
 'V229',
 'V230',
 'V231',
 'V232',
 'V233',
 'V234',
 'V238',
 'V239',
 'V243',
 'V244',
 'V245',
 'V246',
 'V248',
 'V249',
 'V251',
 'V256',
 'V257',
 'V258',
 'V259',
 'V261',
 'V262',
 'V263',
 'V264',
 'V265',
 'V266',
 'V267',
 'V268',
 'V270',
 'V271',
 'V272',
 'V273',
 'V274',
 'V275',
 'V276',
 'V277',
 'V278',
 'V279',
 'V280',
 'V281',
 'V282',
 'V283',
 'V284',
 'V285',
 'V286',
 'V287',
 'V288',
 'V289',
 'V290',
 'V291',
 'V292',
 'V293',
 'V294',
 'V295',
 'V296',
 'V298',
 'V299',
 'V300',
 'V301',
 'V303',
 'V304',
 'V306',
 'V307',
 'V308',
 'V309',
 'V310',
 'V311',
 'V312',
 'V313',
 'V314',
 'V315',
 'V316',
 'V317',
 'V318',
 'V319',
 'V320',
 'V321',
 'V322',
 'V323',
 'V324',
 'V326',
 'V331',
 'V332',
 'V333',
 'V335',
 'V336',
 'id_01',
 'id_02',
 'id_03',
 'id_05',
 'id_06',
 'id_09',
 'id_11',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_25',
 'id_26',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']
# Sanity check: number of features selected.
print(len(final_cols))

321


In [15]:
# Walk the train columns once, routing each name either to the keep list
# (present in final_cols) or to the drop list (everything else).
no_cols, del_cols = [], []
for col in train.columns:
    bucket = no_cols if col in final_cols else del_cols
    bucket.append(col)

# Report how many columns are kept and how many are dropped.
print(len(no_cols))
print(len(del_cols))
# True only when the kept columns match final_cols exactly, order included.
print(no_cols == final_cols)

321
112
True


In [16]:
# Inspect the drop list; at this point it still contains the target 'isFraud'.
del_cols

['isFraud',
 'TransactionDT',
 'C3',
 'M1',
 'V1',
 'V6',
 'V8',
 'V9',
 'V10',
 'V11',
 'V14',
 'V16',
 'V17',
 'V18',
 'V21',
 'V22',
 'V27',
 'V28',
 'V31',
 'V32',
 'V41',
 'V42',
 'V46',
 'V50',
 'V65',
 'V68',
 'V71',
 'V72',
 'V84',
 'V88',
 'V89',
 'V92',
 'V95',
 'V104',
 'V107',
 'V108',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',
 'V123',
 'V125',
 'V138',
 'V141',
 'V142',
 'V148',
 'V153',
 'V154',
 'V155',
 'V157',
 'V176',
 'V177',
 'V179',
 'V181',
 'V182',
 'V183',
 'V185',
 'V186',
 'V190',
 'V191',
 'V193',
 'V194',
 'V195',
 'V196',
 'V199',
 'V227',
 'V235',
 'V236',
 'V237',
 'V240',
 'V241',
 'V242',
 'V247',
 'V250',
 'V252',
 'V253',
 'V254',
 'V255',
 'V260',
 'V269',
 'V297',
 'V302',
 'V305',
 'V325',
 'V327',
 'V328',
 'V329',
 'V330',
 'V334',
 'V337',
 'V338',
 'V339',
 'id_04',
 'id_07',
 'id_08',
 'id_10',
 'id_12',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_27',
 'id_28',
 'id_29']

In [17]:
# Keep the label out of the drop list: it must stay in `train`, and `test` has
# no such column to drop. The label is removed by name rather than by
# position, so the cell does not depend on it being the first entry and is safe
# to re-run (a second positional pop would silently take 'TransactionDT' out
# of the drop list instead).
TARGET_COL = 'isFraud'
del_cols = [name for name in del_cols if name != TARGET_COL]
# Echo the excluded column name as the cell output.
TARGET_COL

'isFraud'

In [18]:
# Number of columns left in the drop list once the target has been taken out.
len(del_cols)

111

In [19]:
# Display the drop list again to confirm 'isFraud' is no longer in it.
del_cols

['TransactionDT',
 'C3',
 'M1',
 'V1',
 'V6',
 'V8',
 'V9',
 'V10',
 'V11',
 'V14',
 'V16',
 'V17',
 'V18',
 'V21',
 'V22',
 'V27',
 'V28',
 'V31',
 'V32',
 'V41',
 'V42',
 'V46',
 'V50',
 'V65',
 'V68',
 'V71',
 'V72',
 'V84',
 'V88',
 'V89',
 'V92',
 'V95',
 'V104',
 'V107',
 'V108',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',
 'V123',
 'V125',
 'V138',
 'V141',
 'V142',
 'V148',
 'V153',
 'V154',
 'V155',
 'V157',
 'V176',
 'V177',
 'V179',
 'V181',
 'V182',
 'V183',
 'V185',
 'V186',
 'V190',
 'V191',
 'V193',
 'V194',
 'V195',
 'V196',
 'V199',
 'V227',
 'V235',
 'V236',
 'V237',
 'V240',
 'V241',
 'V242',
 'V247',
 'V250',
 'V252',
 'V253',
 'V254',
 'V255',
 'V260',
 'V269',
 'V297',
 'V302',
 'V305',
 'V325',
 'V327',
 'V328',
 'V329',
 'V330',
 'V334',
 'V337',
 'V338',
 'V339',
 'id_04',
 'id_07',
 'id_08',
 'id_10',
 'id_12',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_27',
 'id_28',
 'id_29']

In [20]:
# Remove the unselected columns from both frames. The names are rebound to
# the trimmed copies, so this cell cannot be re-run without rebuilding them.
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

In [21]:
# Shape after the drop: the selected feature columns plus the 'isFraud' target.
train.shape

(590540, 322)

In [22]:
# Shape after the drop: only the selected feature columns (test has no target).
test.shape

(506691, 321)

In [23]:
# Persist the reduced frames; the TransactionID index is written as the first
# CSV column.
train.to_csv(path_or_buf='rfecv_1_train.csv')
test.to_csv(path_or_buf='rfecv_1_test.csv')