In [1]:
# Scientific libraries
              
import numpy as np              
import scipy as sp              
import pandas as pd              
import json              
import missingno as msno              

# Loading Plotting Modules
              
import matplotlib              
import matplotlib.pyplot as plt              
import seaborn as sns              
%matplotlib inline              
import chart_studio.plotly as py              
import plotly.figure_factory as ff              
import plotly.graph_objects as go              
import plotly.express as px              
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot              
init_notebook_mode(connected=True)              

# Setting Data Frame Options
              
pd.set_option('display.max_rows', 40)              
pd.set_option('display.max_columns', 50)              
pd.set_option('display.width', 100)              
pd.set_option('display.max_colwidth', 100)              

    

import katana.nbutil as ktn
              
import os
import gc

In [2]:
# Setting Plot Configuration
              
sns.set(rc={'figure.figsize':(19,11)}, style = 'white')          

In [42]:
import configparser

wd = os.getcwd()
pardir = os.path.join(wd, os.pardir, )

_CFG_FILE = os.path.join(pardir, os.pardir, 'features.cfg')

cfg = configparser.ConfigParser()
cfg.read(_CFG_FILE)
categoric_trs = json.loads(cfg['categoric']['transaction'])
categoric_id = json.loads(cfg['categoric']['identity'])
categoric_all = categoric_id + categoric_trs

In [13]:
train_trs = pd.read_csv('../../rsc/train_transaction.csv.zip')
train_id = pd.read_csv('../../rsc/train_identity.csv.zip')
train_trs.set_index('TransactionID', inplace=True)
train_id.set_index('TransactionID', inplace=True)
test_trs = pd.read_csv('../../rsc/test_transaction.csv.zip')
test_id = pd.read_csv('../../rsc/test_identity.csv.zip')
test_trs.set_index('TransactionID', inplace=True)
test_id.set_index('TransactionID', inplace=True)

In [15]:
train_rj = train_trs.join(train_id, how='right')
train_lj = train_trs.join(train_id, how='left')
test_lj = test_trs.join(test_id, how='left')
print('Train Cat (RJ): %d x %d' % train_rj.shape)
print('Train Cat (LJ): %d x %d' % train_lj.shape)
print('Test Cat (LJ): %d x %d' % test_lj.shape)

Train Cat (RJ): 144233 x 433
Train Cat (LJ): 590540 x 433
Test Cat (LJ): 506691 x 432


In [16]:
train_set_id = train_rj.index.tolist()
dev_set_id = list(set(train_lj.index.tolist()) -  set(train_set_id))
test_set_id = test_lj.index.tolist()

In [27]:
import gc
del train_id, train_trs, test_id, test_trs
gc.collect()

5191

In [23]:
indexes = pd.concat([pd.Series(train_set_id) ,
                     pd.Series(dev_set_id) ,
                     pd.Series(test_set_id)], axis=1)
indexes.columns = ['train_set_id', 'dev_set_id', 'test_set_id']

In [26]:
indexes.to_csv('./output/transaction_ids.csv.gz')

In [1]:
all_cat_data = pd.read_parquet('./output/pre-modeling/all_cat.pqt.gz')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cossim = cosine_similarity(all_cat_data.T)
cossim.columns = all_cat_data.columns
cossim.index = all_cat_data.columns

In [None]:
high_sim = cossim.sum(axis=1).sort_values(ascending=False)
high_sim

In [8]:
colsums = 100*(all_cat_data.sum(axis=0)/all_cat_data.shape[0])

In [29]:
rm_cols = colsums[colsums<.0001].index.tolist()

In [33]:
all_cat_data.drop(rm_cols, axis=1, inplace=True)

In [35]:
all_cat_data.to_parquet('./output/pre-modeling/reduced_all_cat.pqt.gz', compression='gzip')

In [36]:
all_data = train_lj.append(test_lj)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [37]:
all_data.head()

Unnamed: 0_level_0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,0.0


In [47]:
all_data.drop(categoric_all, axis=1, inplace=True)

In [52]:
all_data.to_parquet('./output/pre-modeling/all_num_data.pqt.gz', compression='gzip')

In [57]:
any(['Trans' in x for x in all_data.columns.tolist()])

True

In [61]:
all_data.shape, all_cat_data.shape

((1097231, 384), (1097231, 6202))

In [63]:
del all_data, train_lj, train_rj, test_lj, indexes
gc.collect()

13989

In [1]:
reduced_cat_data = pd.read_parquet('./output/pre-modeling/reduced_all_cat.pqt.gz')
all_num_data = pd.read_parquet('./output/pre-modeling/all_num_data.pqt.gz')

In [2]:
df = all_num_data.join(reduced_cat_data, how='left')

In [3]:
indexes = pd.read_csv('./output/transaction_ids.csv.gz')

In [4]:
os.makedirs('../modeling/model_data', exist_ok=True)

In [5]:
import gc
del reduced_cat_data, all_num_data
gc.collect()

8010

In [6]:
train = df.loc[indexes[indexes.train_set_id.notna()].train_set_id]
train.to_parquet('../modeling/model_data/train_set.pqt.gz', compression='gzip')
del train
gc.collect()
dev = df.loc[indexes[indexes.dev_set_id.notna()].dev_set_id]
dev.to_parquet('../modeling/model_data/dev_set.pqt.gz', compression='gzip')
del dev
gc.collect()
test = df.loc[indexes[indexes.test_set_id.notna()].test_set_id]
test.to_parquet('../modeling/model_data/test_set.pqt.gz', compression='gzip')
del test
gc.collect()

0