In [1]:
from datetime import datetime, timedelta
import gc
import numpy as np
from scipy.spatial.distance import pdist, squareform
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import warnings
warnings.filterwarnings("ignore")

CAT_FCOLS = ['card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2']
C_FCOLS = [f'C{i}' for i in range(1, 15)]
D_FCOLS = [f'D{i}' for i in range(1, 16)]
V_FCOLS = [f'V{i}' for i in range(1, 340)] 
FLOAT64_TCOLS = CAT_FCOLS + C_FCOLS + D_FCOLS + V_FCOLS
FLOAT64_ICOLS = [f'id_0{i}' for i in range(1, 10)] + ['id_10', 'id_11', 'id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'id_32']

In [2]:
%%time

df_train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv', dtype=dict.fromkeys(FLOAT64_ICOLS, np.float32))
df_test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv', dtype=dict.fromkeys(FLOAT64_ICOLS, np.float32))
df_train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv', dtype=dict.fromkeys(FLOAT64_TCOLS, np.float32))
df_test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv', dtype=dict.fromkeys(FLOAT64_TCOLS, np.float32))
X_train = pd.merge(df_train_transaction, df_train_identity, how='left', on='TransactionID')
X_test = pd.merge(df_test_transaction, df_test_identity, how='left', on='TransactionID')

print('Number of Training Examples = {}'.format(df_train_transaction.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test_transaction.shape[0]))
print('Number of Training Examples with Identity = {}'.format(df_train_identity.shape[0]))
print('Number of Test Examples with Identity = {}\n'.format(df_test_identity.shape[0]))
print('Training X Shape = {}'.format(X_train.shape))
print('Training y Shape = {}'.format(X_train['isFraud'].shape))
print('Test X Shape = {}\n'.format(X_test.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(X_train.memory_usage().sum() / 1024**2))
print('Test Set Memory Usage = {:.2f} MB\n'.format(X_test.memory_usage().sum() / 1024**2))

del df_train_identity, df_test_identity, df_train_transaction, df_test_transaction
gc.collect()

Number of Training Examples = 590540
Number of Test Examples = 506691

Number of Training Examples with Identity = 144233
Number of Test Examples with Identity = 141907

Training X Shape = (590540, 434)
Training y Shape = (590540,)
Test X Shape = (506691, 433)

Training Set Memory Usage = 1063.29 MB
Test Set Memory Usage = 908.45 MB

CPU times: user 43.8 s, sys: 12.6 s, total: 56.4 s
Wall time: 55.7 s


11

Grazder's [Filling card NaNs](https://www.kaggle.com/grazder/filling-card-nans) kernel inspired me to create this helper function. It basically checks the value counts of two given variables and outputs how many different values can dependent_var get for every independent variable value. This is one way to understand causality between two vectors which can't be seen by pearson correlation.

This function can be used to reveal connection between features and for imputation. There isn't any standard threshold for deciding dependent/not dependent, so if you have a hunch just use that information. There are some examples below.

In [3]:
def check_dependency(independent_var, dependent_var):
    
    independent_uniques = []
    temp_df = pd.concat([X_train[[independent_var, dependent_var]], X_test[[independent_var, dependent_var]]])
    
    for value in temp_df[independent_var].unique():
        independent_uniques.append(temp_df[temp_df[independent_var] == value][dependent_var].value_counts().shape[0])

    values = pd.Series(data=independent_uniques, index=temp_df[independent_var].unique())
    
    N = len(values)
    N_dependent = len(values[values == 1])
    N_notdependent = len(values[values > 1])
    N_null = len(values[values == 0])
        
    print(f'In {independent_var}, there are {N} unique values')
    print(f'{N_dependent}/{N} have one unique {dependent_var} value')
    print(f'{N_notdependent}/{N} have more than one unique {dependent_var} values')
    print(f'{N_null}/{N} have only missing {dependent_var} values\n')

In [4]:
check_dependency('card1', 'card2')

In card1, there are 17091 unique values
16274/17091 have one unique card2 value
296/17091 have more than one unique card2 values
521/17091 have only missing card2 values



In [5]:
check_dependency('card1', 'card3')

In card1, there are 17091 unique values
17028/17091 have one unique card3 value
29/17091 have more than one unique card3 values
34/17091 have only missing card3 values



In [6]:
check_dependency('card1', 'card4')

In card1, there are 17091 unique values
17037/17091 have one unique card4 value
0/17091 have more than one unique card4 values
54/17091 have only missing card4 values



In [7]:
check_dependency('card1', 'card5')

In card1, there are 17091 unique values
16521/17091 have one unique card5 value
307/17091 have more than one unique card5 values
263/17091 have only missing card5 values



In [8]:
check_dependency('card1', 'card6')

In card1, there are 17091 unique values
16898/17091 have one unique card6 value
158/17091 have more than one unique card6 values
35/17091 have only missing card6 values



In [9]:
check_dependency('card1', 'addr2')

In card1, there are 17091 unique values
14560/17091 have one unique addr2 value
366/17091 have more than one unique addr2 values
2165/17091 have only missing addr2 values



In [10]:
check_dependency('card1', 'P_emaildomain')

In card1, there are 17091 unique values
6754/17091 have one unique P_emaildomain value
9887/17091 have more than one unique P_emaildomain values
450/17091 have only missing P_emaildomain values



In [11]:
check_dependency('card1', 'R_emaildomain')

In card1, there are 17091 unique values
5439/17091 have one unique R_emaildomain value
5569/17091 have more than one unique R_emaildomain values
6083/17091 have only missing R_emaildomain values



In [12]:
check_dependency('P_emaildomain', 'R_emaildomain')

In P_emaildomain, there are 61 unique values
7/61 have one unique R_emaildomain value
53/61 have more than one unique R_emaildomain values
1/61 have only missing R_emaildomain values



In [13]:
check_dependency('addr1', 'P_emaildomain')

In addr1, there are 442 unique values
243/442 have one unique P_emaildomain value
189/442 have more than one unique P_emaildomain values
10/442 have only missing P_emaildomain values



In [14]:
check_dependency('dist1', 'C3')

In dist1, there are 2739 unique values
2738/2739 have one unique C3 value
0/2739 have more than one unique C3 values
1/2739 have only missing C3 values



How to use this function?
* Found connection between **R_emaildomain** and **C5**
* Checking what are the values can C5 take
* Filling the NaNs

In [15]:
check_dependency('R_emaildomain', 'C5')

In R_emaildomain, there are 61 unique values
60/61 have one unique C5 value
0/61 have more than one unique C5 values
1/61 have only missing C5 values



In [16]:
X_test[~X_test['R_emaildomain'].isnull()]['C5'].value_counts()

0.0    135867
Name: C5, dtype: int64

In [17]:
X_test = X_test['C5'].fillna(0)