## Load raw data with initial feature selection; save sample for prototyping.

In [1]:
import pandas as pd

In [3]:
# function for loading and cleaning one part of the raw training data
def load_data(set_number):
    """Load one part of the raw training data and return the cleaned feature subset.

    The part file ``data/training_part_XX_of_10.txt`` (pipe-delimited, no header
    row) is read, reduced to the classifier and adversary feature columns,
    stripped of rows containing missing values, and the two ``Y``/``N``
    indicator columns are converted to integer ``1``/``0``.

    Parameters
    ----------
    set_number
        Number of the part file to load; it is zero-padded to two digits
        to build the file name (``1`` -> ``01``, ``10`` -> ``10``).

    Returns
    -------
    pandas.DataFrame
        Columns are ``classifier_columns`` followed by ``adversary_columns``.
        Rows whose ``FRD_IND`` or ``RCURG_AUTHZN_IND`` is neither ``Y``/``N``
        nor already ``1``/``0`` are dropped. The original row labels are kept.
    """
    # get file path (zero-pad the part number to two digits)
    path = 'data/training_part_' + str(set_number).zfill(2) + '_of_10.txt'

    # raw data column names
    colnames = ['AUTH_ID', 'ACCT_ID_TOKEN', 'FRD_IND', 'ACCT_ACTVN_DT', 'ACCT_AVL_CASH_BEFORE_AMT',
                'ACCT_AVL_MONEY_BEFORE_AMT', 'ACCT_CL_AMT', 'ACCT_CURR_BAL', 'ACCT_MULTICARD_IND',
                'ACCT_OPEN_DT', 'ACCT_PROD_CD', 'ACCT_TYPE_CD', 'ADR_VFCN_FRMT_CD', 'ADR_VFCN_RESPNS_CD',
                'APPRD_AUTHZN_CNT', 'APPRD_CASH_AUTHZN_CNT', 'ARQC_RSLT_CD', 'AUTHZN_ACCT_STAT_CD', 'AUTHZN_AMT',
                'AUTHZN_CATG_CD', 'AUTHZN_CHAR_CD', 'AUTHZN_OPSET_ID', 'AUTHZN_ORIG_SRC_ID', 'AUTHZN_OUTSTD_AMT',
                'AUTHZN_OUTSTD_CASH_AMT', 'AUTHZN_RQST_PROC_CD', 'AUTHZN_RQST_PROC_DT', 'AUTHZN_RQST_PROC_TM',
                'AUTHZN_RQST_TYPE_CD', 'AUTHZN_TRMNL_PIN_CAPBLT_NUM', 'AVG_DLY_AUTHZN_AMT', 'CARD_VFCN_2_RESPNS_CD',
                'CARD_VFCN_2_VLDTN_DUR', 'CARD_VFCN_MSMT_REAS_CD', 'CARD_VFCN_PRESNC_CD', 'CARD_VFCN_RESPNS_CD',
                'CARD_VFCN2_VLDTN_CD', 'CDHLDR_PRES_CD', 'CRCY_CNVRSN_RT', 'ELCTR_CMRC_IND_CD', 'HOME_PHN_NUM_CHNG_DUR',
                'HOTEL_STAY_CAR_RENTL_DUR', 'LAST_ADR_CHNG_DUR', 'LAST_PLSTC_RQST_REAS_CD', 'MRCH_CATG_CD',
                'MRCH_CNTRY_CD', 'NEW_USER_ADDED_DUR', 'PHN_CHNG_SNC_APPN_IND', 'PIN_BLK_CD', 'PIN_VLDTN_IND',
                'PLSTC_ACTVN_DT', 'PLSTC_ACTVN_REQD_IND', 'PLSTC_FRST_USE_TS', 'PLSTC_ISU_DUR', 'PLSTC_PREV_CURR_CD',
                'PLSTC_RQST_TS', 'POS_COND_CD', 'POS_ENTRY_MTHD_CD', 'RCURG_AUTHZN_IND', 'RVRSL_IND',
                'SENDR_RSIDNL_CNTRY_CD', 'SRC_CRCY_CD', 'SRC_CRCY_DCML_PSN_NUM', 'TRMNL_ATTNDNC_CD',
                'TRMNL_CAPBLT_CD', 'TRMNL_CLASFN_CD', 'TRMNL_ID', 'TRMNL_PIN_CAPBLT_CD', 'DISTANCE_FROM_HOME']

    # features to be used for classifier
    classifier_columns = ['FRD_IND', 'APPRD_AUTHZN_CNT', 'AVG_DLY_AUTHZN_AMT', 'MRCH_CATG_CD',
                          'POS_ENTRY_MTHD_CD', 'RCURG_AUTHZN_IND', 'DISTANCE_FROM_HOME']

    # features to be used for clustering strategies
    adversary_columns = ['ACCT_CURR_BAL', 'AUTHZN_AMT', 'AUTHZN_OUTSTD_AMT', 'PLSTC_ISU_DUR']

    # load the requested part (previously part 01 was always loaded,
    # regardless of set_number)
    data = pd.read_table(path, delimiter='|', header=None, names=colnames)

    # subset data to relevant columns and drop rows with NA values;
    # reassigning (instead of inplace=True) avoids mutating a derived frame
    data = data[classifier_columns + adversary_columns].dropna(axis=0)

    # convert the Y/N indicator columns to numeric 1/0, discarding rows that
    # hold any other value
    for column in ('FRD_IND', 'RCURG_AUTHZN_IND'):
        flags = data[column].replace({'Y': 1, 'N': 0})
        valid = flags.isin([0, 1])
        # assign() returns a fresh frame, so no chained/in-place assignment
        # on a filtered selection is needed
        data = data.loc[valid].assign(**{column: pd.to_numeric(flags[valid])})

    # return
    return data

In [6]:
# load and clean the first of the ten training parts
df = load_data(set_number=1)

In [7]:
# preview the first five cleaned rows
df.head(n=5)

Unnamed: 0,FRD_IND,APPRD_AUTHZN_CNT,AVG_DLY_AUTHZN_AMT,MRCH_CATG_CD,POS_ENTRY_MTHD_CD,RCURG_AUTHZN_IND,DISTANCE_FROM_HOME,ACCT_CURR_BAL,AUTHZN_AMT,AUTHZN_OUTSTD_AMT,PLSTC_ISU_DUR
0,0,2,0,5983,1,0,10.198805,50.0,366.0,423.9,438
1,0,1,5,5621,90,0,21.638697,364.9,232.38,322.37,415
2,0,1,3,5331,90,0,21.638697,87.71,51.34,51.34,131
3,0,1,6,5511,90,0,21.638697,1732.48,500.0,500.0,343
4,0,1,3,3389,90,0,1132.0712,234.75,1.0,1.0,114


In [30]:
# Draw a 25% random sample of the cleaned data for fast prototyping.
# A fixed seed keeps the saved sample identical across notebook re-runs.
df_25 = df.sample(frac=0.25, random_state=0)
# index=False: the original row labels are not written to the CSV
df_25.to_csv('sample-1-25.csv', index=False)