(+) This notebook serves to compute aggregated time series features look back 4, 12 weeks before the week of transaction
(++) So far there are 3 groups of time series features


In [2]:
import numpy as np
import pickle 
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import datetime
WORK_PATH = "/Users/welcome/Google Drive (cuong.tranus@gmail.com)/research/internship/fraud-detection"
feat_dict = pickle.load(open(WORK_PATH + "/results/features_category.pkl" ,"rb"))

numeric_cols = feat_dict['numeric_cols']
cat_cols = feat_dict['cat_cols']

print(cat_cols)
print(numeric_cols)

['merchantName', 'posEntryMode', 'posConditionCode', 'merchantCategoryCode']
['accountNumber', 'customerId', 'creditLimit', 'availableMoney', 'transactionAmount', 'cardCVV', 'enteredCVV', 'cardLast4Digits', 'currentBalance', 'cardPresent', 'isFraud']


In [3]:
pd00 = pd.read_csv(WORK_PATH + "/data/processed_data.csv")
print(pd00.shape)

(786363, 26)


In [4]:
pd00['merchantCategoryCode'].head(4)

0        rideshare
1    entertainment
2       mobileapps
3       mobileapps
Name: merchantCategoryCode, dtype: object

In [5]:
## Extract date info
def get_week_in_year(date):
    """
    get week number based on date
    return an integer number between 0 and 51
    """
    y,m, d = date.split("-")
    week = datetime.date(int(y), int(m), int(d)).isocalendar()[1]
    if week >52:
        # for some reasons get_week('2016-01-01') return 53 but not zero 
        return 0
    else:
        return week
    

def get_hour_in_year(trans_date):
    """
    get hour number based on date
    return an integer number between 0 and 24*365
    
    """
    ymd = trans_date.split("T")[0]
    hour = trans_date.split("T")[1].split(":")[0]
    y,m,d  = ymd.split("-")
    start_time = datetime.datetime(2016,1,1,0)
    end_time = datetime.datetime(int(y), int(m),int(d),int(hour))
    
    return abs(start_time - end_time).total_seconds() / 3600.0
    


print(get_week_in_year('2016-01-01'))
print(get_week_in_year('2016-12-31'))
print(get_week_in_year('2016-01-07'))


print("--------")

print(get_hour_in_year('2016-08-13T14:27:32'))
print(get_hour_in_year('2016-10-11T05:05:54'))

0
52
1
--------
5414.0
6821.0


In [6]:
pd00['date'] = pd00['transactionDateTime'].apply(lambda x: x.split("T")[0]).astype(str)
pd00['week_in_year'] = pd00['date'].apply(get_week_in_year)
pd00['hour_in_year'] = pd00['transactionDateTime'].apply(get_hour_in_year)

### I. Aggregate  previous n weeks (before the week of transaction) for numetric columns

Time series feature group 1: Compute basic aggregated function mean/max/...of numeric columns over n weeks before
the week that customer make the transaction.

In [88]:
agg_num_func = {col :[np.mean, np.std, np.min, np.max, pd.DataFrame.kurt, pd.DataFrame.skew] for col in ['creditLimit','availableMoney','transactionAmount','currentBalance']}
agg_num_func['customerId']= np.size

def get_stats_numeric_feats_nweeks(pd00, nweek):
    """
    Compute some aggegate stats (mean/max/std/skewness/kurtosis) over previous n weeks before the
    week when customer make a transaction. 
    """
    
    pd_list = []
    for week in range(nweek,52):
        temp_pd = pd00[(pd00['week_in_year']>=week-nweek) & (pd00['week_in_year']<=week-1)]
        group_pd = temp_pd.groupby('accountNumber').agg(agg_num_fun).reset_index()
        group_pd.columns = ['accountNumber'] + ["_".join(col) +"_lb_{}weeks".format(nweek) for col in group_pd.columns.tolist()[1:]]
        group_pd['week_in_year'] = week
        pd_list.append(group_pd)

    
    pd02  = pd.concat(pd_list)
    
    return pd02

In [89]:
%%time
lb4wk_pd02 = get_stats_numeric_feats_nweeks(pd00, nweek = 4)

CPU times: user 3min 18s, sys: 1.97 s, total: 3min 20s
Wall time: 3min 24s


In [90]:
lb4wk_pd02.head(3)

Unnamed: 0,accountNumber,creditLimit_mean_lb_4weeks,creditLimit_std_lb_4weeks,creditLimit_amin_lb_4weeks,creditLimit_amax_lb_4weeks,creditLimit_kurt_lb_4weeks,creditLimit_skew_lb_4weeks,availableMoney_mean_lb_4weeks,availableMoney_std_lb_4weeks,availableMoney_amin_lb_4weeks,...,transactionAmount_kurt_lb_4weeks,transactionAmount_skew_lb_4weeks,currentBalance_mean_lb_4weeks,currentBalance_std_lb_4weeks,currentBalance_amin_lb_4weeks,currentBalance_amax_lb_4weeks,currentBalance_kurt_lb_4weeks,currentBalance_skew_lb_4weeks,customerId_size_lb_4weeks,week_in_year
0,100088067,50000,0.0,50000,50000,,0.0,49847.77,133.863423,49748.44,...,,0.133836,152.23,133.863423,0.0,251.56,,-1.500594,3,4
1,100328049,5000,0.0,5000,5000,0.0,0.0,4828.962,190.327826,4605.1,...,1.872156,1.394735,171.038,190.327826,0.0,394.9,-2.960551,0.41205,5,4
2,100663626,20000,0.0,20000,20000,,0.0,19935.51,97.90706,19822.85,...,,1.426171,64.49,97.90706,0.0,177.15,,1.678067,3,4


In [91]:
%%time
lb12wk_pd02 = get_stats_numeric_feats_nweeks(pd00, nweek = 12)

CPU times: user 3min 9s, sys: 2.9 s, total: 3min 12s
Wall time: 3min 16s


In [92]:

lb4wk_pd02.to_csv(WORK_PATH + "/results/basic_num_ts_feats_lb4wk.csv", index = False)

In [93]:
lb12wk_pd02.head(3)

Unnamed: 0,accountNumber,creditLimit_mean_lb_12weeks,creditLimit_std_lb_12weeks,creditLimit_amin_lb_12weeks,creditLimit_amax_lb_12weeks,creditLimit_kurt_lb_12weeks,creditLimit_skew_lb_12weeks,availableMoney_mean_lb_12weeks,availableMoney_std_lb_12weeks,availableMoney_amin_lb_12weeks,...,transactionAmount_kurt_lb_12weeks,transactionAmount_skew_lb_12weeks,currentBalance_mean_lb_12weeks,currentBalance_std_lb_12weeks,currentBalance_amin_lb_12weeks,currentBalance_amax_lb_12weeks,currentBalance_kurt_lb_12weeks,currentBalance_skew_lb_12weeks,customerId_size_lb_12weeks,week_in_year
0,100088067,50000,0.0,50000,50000,0.0,0.0,49746.739286,241.724595,49349.93,...,1.870305,1.487139,253.260714,241.724595,0.0,650.07,-1.527795,0.429791,14,12
1,100328049,5000,0.0,5000,5000,0.0,0.0,4700.565556,234.323312,4334.74,...,-1.088848,0.409309,299.434444,234.323312,0.0,665.26,-1.368334,0.11847,18,12
2,100663626,20000,0.0,20000,20000,0.0,0.0,19732.161429,224.999672,19471.38,...,1.558473,1.457675,267.838571,224.999672,0.0,528.62,-2.163977,0.000487,7,12


In [94]:
lb12wk_pd02.to_csv(WORK_PATH + "/results/basic_num_ts_feats_lb12wk.csv", index = False)

### II. Aggregate previous n weeks for Categorical features

In [20]:

agg_cat_func = {'merchantName': [pd.Series.nunique, lambda x: list(x)], 'merchantCategoryCode': [pd.Series.nunique,lambda x: list(x)]}

def in_prev_mchan_list(new_mchant_name, prev_mchant_list):
    """
    Check if the merchant name in current transaction belongs to previous list of merchant list formed
    by aggregation over n weeks
    
    """
    try:
        if new_mchant_name in list(prev_mchant_list):
            return 1
        else:
            return 0
    
    except:
        return None

def get_stats_cat_feats_nweeks(pd00, nweek):
    """
    Compute several stats, e.g how many unique merchant names, 
    and collect all merchant names into a list for later purpose
    
    """
    pd_list = []
    for week in range(nweek,52):
        temp_pd = pd00[(pd00['week_in_year']>=week-nweek) & (pd00['week_in_year']<=week-1)]
        group_pd = temp_pd.groupby('accountNumber').agg(agg_cat_func).reset_index()
        group_pd.columns = ["accountNumber", "dcount_mchant_name", "list_mchant_name", "dcount_mchant_code","list_mchant_code"] 
        group_pd['week_in_year'] = week
        pd_list.append(group_pd)
    
    pd02  = pd.concat(pd_list)
    #pd03 = pd.merge(pd02, pd00[['accountNumber','merchantName', 'merchantCategoryCode', 'week_in_year']], on =['accountNumber','week_in_year'], how ='left')
    pd03 = pd02
    
    pd03 = pd03.rename(columns = {'dcount_mchant_name':"dcount_mchant_name_lb_{}weels".format(week)})
    pd03 = pd03.rename(columns = {'dcount_mchant_code':"dcount_mchant_code_lb_{}weels".format(week)})
    
    new_cols = ["dcount_mchant_name_lb_{}weels".format(week),"dcount_mchant_code_lb_{}weels".format(week) ]
    
#     pd03['in_prev_mchant_list_lb_{}weeks'.format(nweek)] = pd03.apply(lambda x: \
#                                                           in_prev_mchan_list( x['merchantName'], x['list_mchant_name']) , axis=1)
    
#     pd03['in_prev_mchant_codes_lb_{}weeks'.format(nweek)] = pd03.apply(lambda x: \
#                                                           in_prev_mchan_list( x['merchantCategoryCode'], x['list_mchant_code']) , axis=1)
    
    
#     new_cols = new_cols  + ['in_prev_mchant_list_lb_{}weeks'.format(nweek),'in_prev_mchant_codes_lb_{}weeks'.format(nweek) ]
    

    return pd03[['accountNumber','week_in_year'] + new_cols  ]

In [26]:
%%time
lb12wk_pd03 = get_stats_cat_feats_nweeks(pd00, nweek = 12)

CPU times: user 54.1 s, sys: 1.44 s, total: 55.6 s
Wall time: 56.7 s


In [34]:
lb12wk_pd03.shape

(190826, 4)

In [35]:
lb12wk_pd03.drop_duplicates(subset =['accountNumber','week_in_year']).shape

(190826, 4)

In [29]:
lb12wk_pd03.head(3)

Unnamed: 0,accountNumber,week_in_year,dcount_mchant_name_lb_51weels,dcount_mchant_code_lb_51weels
0,100088067,12,14,3
1,100328049,12,16,4
2,100663626,12,7,2


In [36]:
lb12wk_pd03.to_csv(WORK_PATH + "/results/basic_cat_ts_feats_lb12wk.csv", index = False)

In [37]:
%%time
lb14wk_pd03 = get_stats_cat_feats_nweeks(pd00, nweek = 4)

CPU times: user 1min 20s, sys: 967 ms, total: 1min 21s
Wall time: 1min 45s


In [38]:
lb14wk_pd03.head(3)

Unnamed: 0,accountNumber,week_in_year,dcount_mchant_name_lb_51weels,dcount_mchant_code_lb_51weels
0,100088067,4,3,3
1,100328049,4,4,2
2,100663626,4,3,1


In [39]:
lb14wk_pd03.to_csv(WORK_PATH + "/results/basic_cat_ts_feats_lb4wk.csv", index = False)

### Inter-event of prevous n weeks time series features

In [6]:
def get_diff_stats(x):
    x = np.asarray(x)
    diff_x = np.abs( x[1:] - x[:-1])
    
    return np.mean(diff_x), np.std(diff_x), np.max(x), np.min(x)


def get_stats_diff_time_nweeks(pd00, nweek):
    """
    Compute several stats, e.g how many unique merchant names, 
    and collect all merchant names into a list for later purpose
    
    """
    pd_list = []
    for week in range(nweek,52):
        temp_pd = pd00[(pd00['week_in_year']>=week-nweek) & (pd00['week_in_year']<=week-1)]
        temp_pd = temp_pd.sort_values(by=['accountNumber','hour_in_year'])
        group_pd = temp_pd.groupby('accountNumber')['hour_in_year'].apply(list).reset_index()
        group_pd.columns = ['accountNumber', "list_of_hours"]
        group_pd['diff_time_stat'] = group_pd['list_of_hours'].apply(get_diff_stats)
        for i in range(4):
            group_pd['diff_time_stat_{}_lb_{}weeks'.format(i, nweek)] =  group_pd['diff_time_stat'].apply(lambda x: x[i])
        
        group_pd['week_in_year'] = week
        
        cols = ['accountNumber','week_in_year'] + [col for col in group_pd.columns.tolist() if 'lb' in col]
            
        pd_list.append(group_pd[cols])
    
    return pd.concat(pd_list)
        

In [7]:
%%time
lb12wk_pd04 = get_stats_diff_time_nweeks(pd00, nweek = 12)

CPU times: user 24.8 s, sys: 2.21 s, total: 27 s
Wall time: 29 s


In [8]:
lb12wk_pd04.head(4)

Unnamed: 0,accountNumber,week_in_year,diff_time_stat_0_lb_12weeks,diff_time_stat_1_lb_12weeks,diff_time_stat_2_lb_12weeks,diff_time_stat_3_lb_12weeks
0,100088067,12,111.692308,98.747841,1716.0,264.0
1,100328049,12,106.764706,80.734517,1860.0,45.0
2,100663626,12,213.833333,153.061444,1588.0,305.0
3,100737756,12,48.166667,52.0862,1887.0,153.0


In [9]:
lb12wk_pd04.to_csv(WORK_PATH + "/results/stats_diff_time_feats_lb12wk.csv", index = False)

In [10]:
lb14wk_pd04 = get_stats_diff_time_nweeks(pd00, nweek = 4)

In [285]:
lb14wk_pd04.head(4)

Unnamed: 0,accountNumber,week_in_year,diff_time_stat_0_lb_4weeks,diff_time_stat_1_lb_4weeks,diff_time_stat_2_lb_4weeks,diff_time_stat_3_lb_4weeks
0,100088067,4,67.5,48.5,399.0,264.0
1,100328049,4,77.75,71.541509,356.0,45.0
2,100663626,4,128.5,98.5,562.0,305.0
3,100737756,4,28.692308,26.104478,526.0,153.0


In [287]:
lb14wk_pd04.to_csv(WORK_PATH + "/results/stats_diff_time_feats_lb4wk.csv", index = False)