#### Import Libraries, configure settings

In [None]:
import os
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
# Pandas Profiling used to create the exploratory data analysis reports
pd.set_option("display.max_columns", None)
# Set the option to display all the columns in dataframes
pd.options.display.max_colwidth = 500
# Set the option to widen the column width due to the identity hash field being length 65
pd.set_option("display.max_rows", None)
# Set the option to display all the rows in dataframes

from optbinning import OptimalBinning
from optbinning import BinningProcess

from datetime import datetime

#### Define the Data Types and Column Names for the input files

In [None]:
# Data Types

data_types_C03 = {
    'C03_ProductIndicator' : 'object',
    'C03_ReleaseNumber' : 'object',
    'C03_Status' : 'object',
    'C03_OrigCode' : 'object',
    'C03_RespCode' : 'object',
    'C03_MTI' : 'object',
    'C03_PrimaryBitMap' : 'object',
    'C03_SecondaryBitMap' : 'object',
    'C03_Card_No' : 'object',
    'C03_ProcessingCode' : 'object',
    'C03_Transaction_Amount' : 'float',
    'C03_CardBillingAmount' : 'float',
    'C03_TransmissionDate' : 'object',
    'C03_STAN' : 'object',
    'C03_Transaction_Datetime' : 'object',
    'C03_Expiry_Date' : 'object',
    'C03_SettlementDate' : 'object',
    'C03_CaptureDate' : 'object',
    'C03_MCC' : 'object',
    'C03_CountryCode' : 'object',
    'C03_POSEntryMode' : 'object',
    'C03_POSCondCode' : 'object',
    'C03_AuthIDRespLength' : 'object',
    'C03_Merchant_Id' : 'object',
    'C03_Track2' : 'object',
    'C03_RRN' : 'object',
    'C03_AuthIDResp' : 'object',
    'C03_ResponseCode' : 'object',
    'C03_CardAcceptorTerminalID' : 'object',
    'C03_CardAcceptorID' : 'object',
    'C03_TerminalOwner' : 'object',
    'C03_TerminalCIty' : 'object',
    'C03_TerminalState' : 'object',
    'C03_TerminalCountry' : 'object',
    'C03_AdditionalData' : 'object',
    'C03_CurrencyCode' : 'object',
    'C03_TerminalData' : 'object',
    'C03_POSData' : 'object',
    'C03_BanknetData' : 'object',
    'C03_RcvInstID' : 'object',
    'C03_AccountIdentification1' : 'object',
    'C03_AccountIdentification2' : 'object',
    'C03_AuthAgentID' : 'object',
    'C03_SettlementRecord' : 'object',
    'C03_BatchData' : 'object',
    'C03_SettlementData' : 'object',
    'C03_AccountIndicator' : 'object',
    'C03_Preauthorization' : 'object',
    'C03_ATMAdditionalData' : 'object'
}

data_types_C06 = {
    'C06_MTI' : 'object',
    'C06_PrimaryBitMap' : 'object',
    'C06_SecondaryBitMap' : 'object',
    'C06_PANNumber' : 'object',
    'C06_ProcessingCode' : 'object',
    'C06_TransactionAmount' : 'object',
    'C06_TransamisionDateTime' : 'object',
    'C06_STAN' : 'object',
    'C06_Transaction_Datetime' : 'object',
    'C06_SetllementDate' : 'object',
    'C06_CaptureDate' : 'object',
    'C06_MCC' : 'object',
    'C06_CardDataInputCapability' : 'object',
    'C06_CardholderAuthCapability' : 'object',
    'C06_CardCaptureCapability' : 'object',
    'C06_OS' : 'object',
    'C06_CardholderPresent' : 'object',
    'C06_CardPresent' : 'object',
    'C06_CardInputMode' : 'object',
    'C06_CardholderAuthMethod' : 'object',
    'C06_CardholderAuthEntity' : 'object',
    'C06_CardDataOutputCapability' : 'object',
    'C06_TerminalOutputCapability' : 'object',
    'C06_PINCaptureCapability' : 'object',
    'C06_AcquiringID' : 'object',
    'C06_ForwardingICC' : 'object',
    'C06_RRN' : 'object',
    'C06_CardAcceptorTerminalID' : 'object',
    'C06_CardAcceptorName' : 'object',
    'C06_CardAcceptorStreet' : 'object',
    'C06_CardAcceptorCity' : 'object',
    'C06_CardAcceptorPostalCode' : 'object',
    'C06_CardAcceptorRegionCode' : 'object',
    'C06_CardAcceptorCountryCode' : 'object',
    'C06_AdditionalData' : 'object',
    'C06_TransactionCurrencyCode' : 'object',
    'C06_OriginalMessageType' : 'object',
    'C06_OriginalSTAN' : 'object',
    'C06_OriginalTransmissionDatetime' : 'object',
    'C06_OriginalAcquiringID' : 'object',
    'C06_BillCompanyID' : 'object',
    'C06_BIllNumber' : 'object',
    'C06_BillConsumerNumber' : 'object',
    'C06_BillRefNo1' : 'object',
    'C06_BillRefNo2' : 'object',
    'C06_AccountIdentification1' : 'object',
    'C06_AccountIdentification2' : 'object',
    'C06_ORFTContraBankID' : 'object',
    'C06_ORFTContraBankAccNo' : 'object',
    'C06_Preauthhold' : 'object',
    'C06_PreAuthSeqNumber' : 'object',
    'C06_ReferalPhoneNumber' : 'object',
    'C06_MemberNumber' : 'object',
    'C06_POSTerminalID' : 'object',
    'C06_SICCode' : 'object'
}

data_types_C09 = {
    'C09_MTI': 'object',
    'C09_PrimaryBitMap': 'object',
    'C09_SecondaryBitMap': 'object',
    'C09_Card_No': 'object',
    'C09_ProcessingCode': 'object',
    'C09_Transaction_Amount': 'object',
    'C09_ReconAmount': 'object',
    'C09_TransmisionDateTime': 'object',
    'C09_STAN': 'object',
    'C09_Transaction_Datetime': 'object',
    'C09_Expiry_Date': 'object',
    'C09_CaptureDate': 'object',
    'C09_CountryCode': 'object',
    'C09_CardDataInputCapability': 'object',
    'C09_CardholderAuthCapability': 'object',
    'C09_CardCaptureCapability': 'object',
    'C09_OS': 'object',
    'C09_CardholderPresent': 'object',
    'C09_CardPresent': 'object',
    'C09_CardInputMode': 'object',
    'C09_CardholderAuthMethod': 'object',
    'C09_CardholderAuthEntity': 'object',
    'C09_CardDataOutputCapability': 'object',
    'C09_TerminalOutputCapability': 'object',
    'C09_PINCaptureCapability': 'object',
    'C09_CardSequenceNo': 'object',
    'C09_FunctionCode': 'object',
    'C09_MessageReasonCode': 'object',
    'C09_Merchant_Id': 'object',
    'C09_AcquirerID': 'object',
    'C09_ForwardingID': 'object',
    'C09_Track2': 'object',
    'C09_RRN': 'object',
    'C09_ApprovalCode': 'object',
    'C09_ActionCode': 'object',
    'C09_CardAcceptorTerminalID': 'object',
    'C09_CardAcceptorIdCode': 'object',
    'C09_CardAcceptorName': 'object',
    'C09_CardAcceptorStreet': 'object',
    'C09_CardAcceptorCity': 'object',
    'C09_CardAcceptorPostalCode': 'object',
    'C09_CardAcceptorRegionCode': 'object',
    'C09_CardAcceptorCountryCode': 'object',
    'C09_TransactionCurrencyCode': 'object',
    'C09_ReconCurrencyCode': 'object',
    'C09_OriginalMTI': 'object',
    'C09_OriginalSTAN': 'object',
    'C09_OriginalTrxDatetime': 'object',
    'C09_OriginalAcquirerID': 'object',
    'C09_ReservedPrivate1': 'object',
    'C09_AccountIdentification1': 'object',
    'C09_AccountIdentification2': 'object',
    'C09_ReservedPrivate2': 'object',
    'C09_TransactionAmountIDR': 'object',
    'C09_ECI': 'object',
}

data_types_C10 = {
    'C10_MTI': 'object',
    'C10_PrimaryBitMap': 'object',
    'C10_SecondaryBitMap': 'object',
    'C10_PANNumber': 'object',
    'C10_ProcessingCode': 'object',
    'C10_TransactionAmount': 'object',
    'C10_TransmissionDateTime': 'object',
    'C10_STAN': 'object',
    'C10_Transaction_Datetime': 'object',
    'C10_SettlementDate': 'object',
    'C10_CaptureDate': 'object',
    'C10_MCC': 'object',
    'C10_CardInputMode': 'object',
    'C10_PINCaptureCapability': 'object',
    'C10_AcquiringID': 'object',
    'C10_RRN': 'object',
    'C10_CardAcceptorTerminalID': 'object',
    'C10_CardAcceptorName': 'object',
    'C10_CardAcceptorStreet': 'object',
    'C10_CardAcceptorCity': 'object',
    'C10_CardAcceptorPostalCode': 'object',
    'C10_CardAcceptorRegionCode': 'object',
    'C10_CardAcceptorCountryCode': 'object',
    'C10_AdditionalData': 'object',
    'C10_TransactionCurrencyCode': 'object',
    'C10_OriginalMessageType': 'object',
    'C10_OriginalSTAN': 'object',
    'C10_OriginalTransmissionDatetime': 'object',
    'C10_BillCompanyID': 'object',
    'C10_BIllNumber': 'object',
    'C10_BillConsumerNumber': 'object',
    'C10_BillRefNo1': 'object',
    'C10_BillRefNo2': 'object',
    'C10_AccountIdentification1': 'object',
    'C10_AccountIdentification2': 'object',
    'C10_ORFTContraBankID': 'object',
    'C10_ORFTContraBankAccNo': 'object',
    'C10_ReferalPhoneNumber': 'object',
    'C10_MemberNumber': 'object',
    'C10_POSTerminalID': 'object',
    'C10_SICCode': 'object',
}

data_types_TSF = {}

data_types_TSCF = {}
                     


In [None]:
# Column Names

names_C03 = [
    'C03_ProductIndicator',
    'C03_ReleaseNumber',
    'C03_Status',
    'C03_OrigCode',
    'C03_RespCode',
    'C03_MTI',
    'C03_PrimaryBitMap',
    'C03_SecondaryBitMap',
    'C03_Card_No',
    'C03_ProcessingCode',
    'Transaction_Amount',
    'C03_CardBillingAmount',
    'C03_TransmissionDate',
    'C03_STAN',
    'Transaction_Datetime',
    'C03_Expiry_Date',
    'C03_SettlementDate',
    'C03_CaptureDate',
    'C03_MCC',
    'C03_CountryCode',
    'C03_POSEntryMode',
    'C03_POSCondCode',
    'C03_AuthIDRespLength',
    'C03_Merchant_Id',
    'C03_Track2',
    'C03_RRN',
    'C03_AuthIDResp',
    'C03_ResponseCode',
    'C03_CardAcceptorTerminalID',
    'C03_CardAcceptorID',
    'C03_TerminalOwner',
    'C03_TerminalCIty',
    'C03_TerminalState',
    'C03_TerminalCountry',
    'C03_AdditionalData',
    'C03_CurrencyCode',
    'C03_TerminalData',
    'C03_POSData',
    'C03_BanknetData',
    'C03_RcvInstID',
    'C03_AccountIdentification1',
    'C03_AccountIdentification2',
    'C03_AuthAgentID',
    'C03_SettlementRecord',
    'C03_BatchData',
    'C03_SettlementData',
    'C03_AccountIndicator',
    'C03_Preauthorization',
    'C03_ATMAdditionalData'
]

names_C06 = [
    'C06_MTI',
    'C06_PrimaryBitMap',
    'C06_SecondaryBitMap',
    'C06_PANNumber',
    'C06_ProcessingCode',
    'C06_TransactionAmount',
    'C06_TransamisionDateTime',
    'C06_STAN',
    'C06_Transaction_Datetime',
    'C06_SetllementDate',
    'C06_CaptureDate',
    'C06_MCC',
    'C06_CardDataInputCapability',
    'C06_CardholderAuthCapability',
    'C06_CardCaptureCapability',
    'C06_OS',
    'C06_CardholderPresent',
    'C06_CardPresent',
    'C06_CardInputMode',
    'C06_CardholderAuthMethod',
    'C06_CardholderAuthEntity',
    'C06_CardDataOutputCapability',
    'C06_TerminalOutputCapability',
    'C06_PINCaptureCapability',
    'C06_AcquiringID',
    'C06_ForwardingICC',
    'C06_RRN',
    'C06_CardAcceptorTerminalID',
    'C06_CardAcceptorName',
    'C06_CardAcceptorStreet',
    'C06_CardAcceptorCity',
    'C06_CardAcceptorPostalCode',
    'C06_CardAcceptorRegionCode',
    'C06_CardAcceptorCountryCode',
    'C06_AdditionalData',
    'C06_TransactionCurrencyCode',
    'C06_OriginalMessageType',
    'C06_OriginalSTAN',
    'C06_OriginalTransmissionDatetime',
    'C06_OriginalAcquiringID',
    'C06_BillCompanyID',
    'C06_BIllNumber',
    'C06_BillConsumerNumber',
    'C06_BillRefNo1',
    'C06_BillRefNo2',
    'C06_AccountIdentification1',
    'C06_AccountIdentification2',
    'C06_ORFTContraBankID',
    'C06_ORFTContraBankAccNo',
    'C06_Preauthhold',
    'C06_PreAuthSeqNumber',
    'C06_ReferalPhoneNumber',
    'C06_MemberNumber',
    'C06_POSTerminalID',
    'C06_SICCode',
]
names_C09 = []
names_C10 = []
names_TSF = []
names_TSCF = []





#### Read in each file

##### C03_Details - Credit Cards

In [None]:
C03_Details = pd.read_csv("path-to-C03_Details.csv",
                           dtype = data_types_C03,
                           parse_dates = ['TransmissionDate','Transaction_Datetime','SettlementDate','CaptureDate'],
                           header = 0,
                           names = names_C03)

# Check the shape of the dataframe
C03_Details.shape

In [None]:
# Check the transaction datetimes
C03_Details['Transaction_Datetime'].describe()

In [None]:
# Check the number of accounts/card numbers
print(C03_Details['Card_No'].nunique())

# Check for null card numbers
print(C03_Details['Card_No'].isna().sum())


##### C06_Details - Debit Cards

In [None]:
C06_Details = pd.read_csv('path-to-C06_Details.csv',
                           dtype = data_types_C06,
                           parse_dates = [],
                           header = 0,
                           names = names_C06)

C06_Details.shape

(4096478, 18)

##### C09_Details - AMEX Cards

In [None]:
C09_Details = pd.read_csv('path-to-C09_Details.csv',
                           dtype = data_types_C09,
                           parse_dates = [],
                           header = 0,
                           names = names_C09)

C09_Details.shape

##### C10_Details - DBank Pro

##### Transaction_Summary_Fraud

##### Transaction_Summary_Calculations_Fraud

In [None]:
# Remove the records with NaT transaction_datetime
print('Records before removing blank transaction_datetime records: ', C03_Details.shape[0])
print('Records removed: ', C03_Details['Transaction_Datetime'].isna().sum())
C03_Details = C03_Details.loc[C03_Details['Transaction_Datetime'].isna() == False]
print('Records after removing blank transaction_datetime records: ', C03_Details.shape[0])


In [None]:
# Remove the duplicated records
print('Records before removing duplicated records: ', C03_Details.shape[0])
print('Records removed: ', C03_Details.duplicated().sum())
C03_Details = C03_Details.loc[C03_Details.duplicated() == False]
print('Records after removing duplicated records: ', C03_Details.shape[0])


#### View the data in each of the files

In [None]:
C03_Details.head()

In [None]:
C06_Details.head()

In [None]:
C09_Details.head()

In [None]:
C10_Details.head()

In [None]:
Transaction_Summary_Fraud.head()

In [None]:
Transaction_Summary_Calculations_Fraud.head()

#### Check what is where in the files

In [None]:
# Are the Mobile Financial Transactions contained in the Deposit Transactions?

test_mf_deposit = deposit_transaction.merge(mobile_financial, left_on = ['customer_id','account_number','transaction_datetime'],
                               right_on = ['customer_id','from_account_no','transaction_datetime'],
                               how = 'outer',
                               indicator = 'merge_deposit_mf')

In [None]:
test_mf_deposit.groupby(by = ['merge_deposit_mf'], dropna=False).size()

In [None]:
print(deposit_transaction.shape[0])
print(mobile_financial.shape[0])
print(test_mf_deposit.shape[0])

In [None]:
test_mf_deposit.loc[test_mf_deposit['transaction_id'].isna() == False].groupby(['merge_deposit_mf'], dropna=False).size()

In [None]:
test_mf_deposit.loc[test_mf_deposit['transaction_id'].isna() == True].groupby(['merge_deposit_mf'], dropna=False).size()

#### 3270348 out of 4096478 mobile financial trnasactions are in the deposit transaction file (80%)

### Combine Tables

In [None]:
# Concatenate C03 + C09 (Credit Cards + Amex) to create a single dataframe for credit cards

credit = pd.concat([C03_Details, C09_Details], axis = 0, ignore_index = True)

print(credit.shape)
print(credit.head())
print(credit['Card_No'].isna().sum())
print(credit['Card_No'].nunique())


In [None]:
# Concatenate C06 + C10 Debit Cards

debit = pd.concat([C06_Details, C10_Details[C10_Details['AccountIdentification1'].isna()==False]], axis = 0, ignore_index = True)

print(debit.shape)
print(debit.head())
print(debit['Card_No'].isna().sum())
print(debit['Card_No'].nunique())



In [None]:
# Merge to TSF and TSCF
credit = credit.merge(Transaction_Summary_Fraud, on = ['Transaction_Serial_No'],
                      how = 'left',
                      indicator = 'merge_credit_tsf')

print(credit.shape)
print(credit.groupby(['merge_credit_tsf'], dropna=False).size())

credit = credit.merge(Transaction_Summary_Calculations_Fraud, on = ['Transaction_Serial_No'],
                      how = 'left',
                      indicator = 'merge_credit_tscf')

print(credit.shape)
print(credit.groupby(['merge_credit_tscf'], dropna=False).size())


In [None]:
# Merge to TSF and TSCF
debit = debit.merge(Transaction_Summary_Fraud, on = ['Transaction_Serial_No'],
                      how = 'left',
                      indicator = 'merge_debit_tsf')

print(debit.shape)
print(debit.groupby(['merge_debit_tsf'], dropna=False).size())

debit = debit.merge(Transaction_Summary_Calculations_Fraud, on = ['Transaction_Serial_No'],
                        how = 'left',
                        indicator = 'merge_debit_tscf')

print(debit.shape)
print(debit.groupby(['merge_debit_tscf'], dropna=False).size())


### Apply Fraud Indicator

In [None]:
credit['fraud'] = credit['confirmed'] == True

credit.groupby(['fraud'], dropna=False).size()


In [None]:
# separate card numbers which have >= 1 fraud and those with 0 fraud

credit_cardno = pd.DataFrame(credit.groupby(['Card_No'])['fraud'].sum()).reset_index()
credit_cardno_fraud = credit_cardno[credit_cardno['fraud'] > 0][['Card_No']]
credit_cardno_clean = credit_cardno[credit_cardno['fraud'] == 0][['Card_No']]


In [None]:
debit['fraud'] = debit['confirmed'] == True

debit.groupby(['fraud'], dropna=False).size()

In [None]:
# separate card numbers which have >= 1 fraud and those with 0 fraud

debit_cardno = pd.DataFrame(debit.groupby(['Card_No'])['fraud'].sum()).reset_index()
debit_cardno_fraud = debit_cardno[debit_cardno['fraud'] > 0][['Card_No']]
debit_cardno_clean = debit_cardno[debit_cardno['fraud'] == 0][['Card_No']]



### Date Analysis


In [None]:
credit['Transaction_Datetime_Day'] = pd.to_datetime(credit['Transaction_Datetime']).dt.date

In [None]:
# Transaction Date Trend Analysis

df = pd.DataFrame(credit.groupby(['Transaction_Datetime_Day']).size()).rename(columns = {0:'Transactions'})
# Group By Transaction Date, get the total transactions

df['Transactions %'] = 100*df['Transactions']/(df['Transactions'].sum())
# Calculate Transactions %
                                            
df1 = pd.DataFrame(credit.groupby(['Transaction_Datetime_Day'])['fraud'].sum())
# Group By Transaction Date, get the sum of frauds

df1['Fraud %'] = 100*df1['fraud']/(df1['fraud'].sum())
# Calculate the Fraud %
                                            
df = pd.concat([df, df1], axis = 1)
# Concatenate the dataframes

df['Fraud Rate %'] = 100*df['Fraud']/df['Transactions']
# Add the frauds rates in the table

In [None]:
df.to_csv('path-to-credit_transaction_date_analysis.csv', index = False)

### Sample Down Clean Transactions

In [None]:

# Get a random sample of card numbers where fraud is False
sample_card_numbers = credit_cardno_clean['Card_No'].sample(frac = 0.1, random_state = 42).tolist()



In [None]:

credit = credit[(credit['Card_No'].isin(sample_card_numbers) == True)|(credit['Card_No'].isin(credit_cardno_fraud.tolist()) == True)]


### Feature Engineering

In [None]:
# Calculation Functions
# These functions are used to calculate the frequency, monetary and unique count of transactions

def Frequency_1(dataset=None, datetime_col=None, Key=None, groupby=None, amount_col=None ,groupby_type='No',groupby_col=None, window=None,NA=None, out_col=None):
    dataset=dataset.sort_values(by='{datetime_col}'.format(datetime_col=datetime_col), ascending=True)
    if groupby_type == "No":
        df_num_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='right').count().fillna(NA)})
        df_num_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_num_trnx_TJ=df_num_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_num_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)])
    elif groupby_type == "Yes":
        df_num_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='right').count().fillna(NA)})
        df_num_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_num_trnx_TJ=df_num_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_num_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)])
    dataset_output=join_data[['{Key}'.format(Key=Key),'{out_col}'.format(out_col=out_col)]]
    return dataset_output

def Frequency(dataset=None, datetime_col=None, Key=None, groupby=None, amount_col=None ,groupby_type='No',groupby_col=None, window=None,NA=None, out_col=None):
    dataset=dataset.sort_values(by='{datetime_col}'.format(datetime_col=datetime_col), ascending=True)
    if groupby_type == "No":
        df_num_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='left').count().fillna(NA)})
        df_num_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_num_trnx_TJ=df_num_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_num_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)])
    elif groupby_type == "Yes":
        df_num_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='left').count().fillna(NA)})
        df_num_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_num_trnx_TJ=df_num_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_num_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)])
    dataset_output=join_data[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col),'{out_col}'.format(out_col=out_col)]]
    return dataset_output

def Monetary(dataset=None, datetime_col=None, Key=None, groupby=None, amount_col=None ,groupby_type='No',groupby_col=None, window=None,NA=None, out_col=None):
    dataset=dataset.sort_values(by='{datetime_col}'.format(datetime_col=datetime_col), ascending=True)
    if groupby_type == "No":
        df_amt_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='left').mean().fillna(NA)})
        df_amt_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_amt_trnx_TJ=df_amt_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_amt_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)])
    elif groupby_type == "Yes":
        df_amt_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='left').mean().fillna(NA)})
        df_amt_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_amt_trnx_TJ=df_amt_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_amt_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)])
    dataset_output=join_data[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col),'{out_col}'.format(out_col=out_col)]]
    return dataset_output

def Unique_Count(dataset=None, datetime_col=None, count_col=None, groupby=None, window=None, NA=None, out_col=None):

    dataset = dataset.sort_values(by=datetime_col, ascending=True)

    df_num = (pd.DataFrame({out_col: dataset.set_index(datetime_col)
    .sort_index()
    .groupby(groupby)[count_col]
    .rolling(window=window, closed='left', min_periods=1)
    .apply(lambda x: np.unique(x[~np.isnan(x)]).size, raw=True)
    .fillna(NA)}))

    df_num.reset_index([groupby, datetime_col], inplace=True)

    df_num_TJ = df_num.drop_duplicates(subset=[groupby, datetime_col], keep='last')

    df_output = dataset.merge(df_num_TJ, on=[groupby, datetime_col], how='left')

    return df_output

#### Time Since Last Transaction

In [None]:

df=credit.sort_values(by=['Transaction_Datetime'], ascending=True)

df['time_row_before']=df.groupby(by = ['Card_No'])['Transaction_Datetime'].shift(1)
df['time_row_before_transaction_type']=df.groupby(by = ['Card_No','transaction_type'])['Transaction_Datetime'].shift(1)
df['time_row_before_from_account_to_account']=df.groupby(by = ['AccountIdentification1','AccountIdentification2'])['Transaction_Datetime'].shift(1)
df['time_row_before_to_account']=df.groupby(by = ['AccountIdentification2'])['Transaction_Datetime'].shift(1)
df['time_row_before_merchant']=df.groupby(by = ['Card_No','Merchant_Id'])['Transaction_Datetime'].shift(1)
df['time_row_before_mcc']=df.groupby(by = ['Card_No','MCC'])['Transaction_Datetime'].shift(1)
df['time_row_before_country_code']=df.groupby(by = ['Card_No','CountryCode'])['Transaction_Datetime'].shift(1)


df['TSLastTxn_mins'] = np.where(df['time_row_before'].isnull(),-1, (df['Transaction_Datetime']-df['time_row_before']).dt.total_seconds()/60)
df['TSLastTxn_TransactionType_mins'] = np.where(df['time_row_before_transaction_type'].isnull(),-1, (df['Transaction_Datetime']-df['time_row_before_transaction_type']).dt.total_seconds()/60)
df['TSLastTxn_fromacct_to_acct_mins'] = np.where(df['time_row_before_from_account_to_account'].isnull(),-1, (df['Transaction_Datetime']-df['time_row_before_from_account_to_account']).dt.total_seconds()/60)
df['TSLastTxn_toacct_mins'] = np.where(df['time_row_before_to_account'].isnull(),-1, (df['Transaction_Datetime']-df['time_row_before_to_account']).dt.total_seconds()/60)
df['TSLastTxn_tobankcode_mins'] = np.where(df['time_row_before_to_bank_code'].isnull(),-1, (df['Transaction_Datetime']-df['time_row_before_to_bank_code']).dt.total_seconds()/60)
df['TSLastTxn_countrycode_mins'] = np.where(df['time_row_before_any_id_type'].isnull(),-1, (df['Transaction_Datetime']-df['time_row_before_any_id_type']).dt.total_seconds()/60)



In [None]:
print(df['TSLastMFTxn_mins'].describe())
print(df['TSLastMFTxn_TransactionType_mins'].describe())
print(df['TSLastMFTxn_fromacct_to_acct_mins'].describe())
print(df['TSLastMFTxn_toacct_mins'].describe())
print(df['TSLastMFTxn_tobankcode_mins'].describe())
print(df['TSLastMFTxn_anyidtype_mins'].describe())


#### Transaction Count Same From Account No

In [None]:


# Create tables for each of the date intervals
Txn_Count_L15M=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='MFTxnCount_L15M')
Txn_Count_L1H=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='MFTxnCount_L1H')
Txn_Count_L1D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='MFTxnCount_L1D')
Txn_Count_L7D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='MFTxnCount_L7D')
Txn_Count_L14D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='MFTxnCount_L14D')
Txn_Count_L30D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='MFTxnCount_L30D')
Txn_Count_L90D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='account_number', amount_col='transaction_id' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='MFTxnCount_L90D')



In [None]:
Txn_Count_L1H['MFTxnCount_L1H'].describe()

In [None]:


# Merge the date intervals tables together with the main table
df = df.merge(Txn_Count_L15M, how='left', on=['transaction_id','account_number','transaction_datetime'])
df = df.merge(Txn_Count_L1H, how='left', on=['transaction_id','account_number','transaction_datetime'])
df = df.merge(Txn_Count_L1D, how='left', on=['transaction_id','account_number','transaction_datetime'])
df = df.merge(Txn_Count_L7D, how='left', on=['transaction_id','account_number','transaction_datetime'])
df = df.merge(Txn_Count_L14D, how='left', on=['transaction_id','account_number','transaction_datetime'])
df = df.merge(Txn_Count_L30D, how='left', on=['transaction_id','account_number','transaction_datetime'])
df = df.merge(Txn_Count_L90D, how='left', on=['transaction_id','account_number','transaction_datetime'])

df.head()

In [None]:

# Define the time windows and corresponding output column names
time_windows = {
    '900S': 'MFTxnCount_L15M',
    '1H': 'MFTxnCount_L1H',
    '1D': 'MFTxnCount_L1D',
    '7D': 'MFTxnCount_L7D',
    '14D': 'MFTxnCount_L14D',
    '30D': 'MFTxnCount_L30D',
    '90D': 'MFTxnCount_L90D'
}

# Calculate frequency for each window and store in a list
frequency_dfs = []
for window, out_col in time_windows.items():
    freq_df = calculate_frequency(
        dataset=df,
        datetime_col='transaction_datetime',
        key='transaction_id',
        groupby='account_number',
        amount_col='transaction_id',
        groupby_type='No',
        groupby_col=None,
        window=window,
        na_value=0,
        out_col=out_col
    )
    frequency_dfs.append(freq_df)

# Merge all frequency DataFrames into the original df
from functools import reduce

df = reduce(
    lambda left, right: pd.merge(
        left, right, on=['transaction_id', 'account_number', 'transaction_datetime'], how='left'
    ),
    [df] + frequency_dfs
)


#### Unique To Account No (Same From Account No)

In [None]:
df['to_account_no_num'],uniques = df['to_account_no'].factorize()

In [None]:
%%time

df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='900S', NA=0, out_col='Unique_To_Account_No_L15M')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='1H', NA=0, out_col='Unique_To_Account_No_L1H')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='1D', NA=0, out_col='Unique_To_Account_No_L1D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='7D', NA=0, out_col='Unique_To_Account_No_L7D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='14D', NA=0, out_col='Unique_To_Account_No_L14D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='30D', NA=0, out_col='Unique_To_Account_No_L30D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='to_account_no_num', groupby='from_account_no', window='90D', NA=0, out_col='Unique_To_Account_No_L90D')




#### Unique From Account No (Same To Account No)

In [None]:
df['from_account_no_num'],uniques = df['from_account_no'].factorize()

In [None]:
%%time

df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='900S', NA=0, out_col='Unique_From_Account_No_L15M')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='1H', NA=0, out_col='Unique_From_Account_No_L1H')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='1D', NA=0, out_col='Unique_From_Account_No_L1D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='7D', NA=0, out_col='Unique_From_Account_No_L7D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='14D', NA=0, out_col='Unique_From_Account_No_L14D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='30D', NA=0, out_col='Unique_From_Account_No_L30D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='from_account_no_num', groupby='to_account_no', window='90D', NA=0, out_col='Unique_From_Account_No_L90D')



#### Unique Transaction Amount Same To Account

In [None]:
%%time

df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='900S', NA=0, out_col='Unique_AmountTo_L15M')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='1H', NA=0, out_col='Unique_AmountTo_L1H')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='1D', NA=0, out_col='Unique_AmountTo_L1D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='7D', NA=0, out_col='Unique_AmountTo_L7D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='14D', NA=0, out_col='Unique_AmountTo_L14D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='30D', NA=0, out_col='Unique_AmountTo_L30D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='to_account_no', window='90D', NA=0, out_col='Unique_AmountTo_L90D')


#### Unique Transaction Amount Same From Account

In [None]:
%%time

df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='900S', NA=0, out_col='Unique_AmountFrom_L15M')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='1H', NA=0, out_col='Unique_AmountFrom_L1H')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='1D', NA=0, out_col='Unique_AmountFrom_L1D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='7D', NA=0, out_col='Unique_AmountFrom_L7D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='14D', NA=0, out_col='Unique_AmountFrom_L14D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='30D', NA=0, out_col='Unique_AmountFrom_L30D')
df = Unique_Count(dataset=df, datetime_col='transaction_datetime', count_col='amount', groupby='from_account_no', window='90D', NA=0, out_col='Unique_AmountFrom_L90D')


#### Transaction Count Same Transaction Type

In [None]:
%%time

# Create tables for each of the date intervals
Txn_Count_SameType_L15M=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='900S',NA=0, out_col='MFTxnCountSameType_L15M')
Txn_Count_SameType_L1H=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='1H',NA=0, out_col='MFTxnCountSameType_L1H')
Txn_Count_SameType_L1D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='1D',NA=0, out_col='MFTxnCountSameType_L1D')
Txn_Count_SameType_L7D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='7D',NA=0, out_col='MFTxnCountSameType_L7D')
Txn_Count_SameType_L14D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='14D',NA=0, out_col='MFTxnCountSameType_L14D')
Txn_Count_SameType_L30D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='30D',NA=0, out_col='MFTxnCountSameType_L30D')
Txn_Count_SameType_L90D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='transaction_type', window='90D',NA=0, out_col='MFTxnCountSameType_L90D')



In [None]:
df.groupby(['transaction_type','Fraud']).size()

In [None]:
%%time

# Merge the date intervals tables together with the main table
df = df.merge(Txn_Count_SameType_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameType_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameType_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameType_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameType_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameType_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameType_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df.head()

#### Transaction Count Same To Account No

In [None]:
%%time

# Create tables for each of the date intervals
Txn_Count_SameToAcc_L15M=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='900S',NA=0, out_col='MFTxnCountSameToAcc_L15M')
Txn_Count_SameToAcc_L1H=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='1H',NA=0, out_col='MFTxnCountSameToAcc_L1H')
Txn_Count_SameToAcc_L1D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='1D',NA=0, out_col='MFTxnCountSameToAcc_L1D')
Txn_Count_SameToAcc_L7D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='7D',NA=0, out_col='MFTxnCountSameToAcc_L7D')
Txn_Count_SameToAcc_L14D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='14D',NA=0, out_col='MFTxnCountSameToAcc_L14D')
Txn_Count_SameToAcc_L30D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='30D',NA=0, out_col='MFTxnCountSameToAcc_L30D')
Txn_Count_SameToAcc_L90D=Frequency(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='transaction_id' ,groupby_type='Yes',groupby_col='to_account_no', window='90D',NA=0, out_col='MFTxnCountSameToAcc_L90D')




In [None]:
%%time

# Merge the date intervals tables together with the main table
df = df.merge(Txn_Count_SameToAcc_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameToAcc_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameToAcc_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameToAcc_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameToAcc_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameToAcc_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Txn_Count_SameToAcc_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df.head()

#### Average Transaction Amount

In [None]:
%%time

# Apply the function
Avg_Amt_L15M=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='Avg_Amt_L15M')
Avg_Amt_L1H=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='Avg_Amt_L1H')
Avg_Amt_L1D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='Avg_Amt_L1D')
Avg_Amt_L7D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='Avg_Amt_L7D')
Avg_Amt_L14D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='Avg_Amt_L14D')
Avg_Amt_L30D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='Avg_Amt_L30D')
Avg_Amt_L90D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='Avg_Amt_L90D')


In [None]:
Avg_Amt_L15M.head()

#### Average Transaction Amount Same To Account

In [None]:
%%time

# Apply the function
Avg_Amt_SameToAcc_L15M=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='900S',NA=0, out_col='Avg_Amt_SameToAcc_L15M')
Avg_Amt_SameToAcc_L1H=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='1H',NA=0, out_col='Avg_Amt_SameToAcc_L1H')
Avg_Amt_SameToAcc_L1D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='1D',NA=0, out_col='Avg_Amt_SameToAcc_L1D')
Avg_Amt_SameToAcc_L7D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='7D',NA=0, out_col='Avg_Amt_SameToAcc_L7D')
Avg_Amt_SameToAcc_L14D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='14D',NA=0, out_col='Avg_Amt_SameToAcc_L14D')
Avg_Amt_SameToAcc_L30D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='30D',NA=0, out_col='Avg_Amt_SameToAcc_L30D')
Avg_Amt_SameToAcc_L90D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='to_account_no', window='90D',NA=0, out_col='Avg_Amt_SameToAcc_L90D')


In [None]:
Avg_Amt_SameToAcc_L1D.head()

#### Average Transaction Amount Same Transaction Type

In [None]:
%%time

# Apply the function

Avg_Amt_SameType_L15M=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='900S',NA=0, out_col='Avg_Amt_SameType_L15M')
Avg_Amt_SameType_L1H=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='1H',NA=0, out_col='Avg_Amt_SameType_L1H')
Avg_Amt_SameType_L1D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='1D',NA=0, out_col='Avg_Amt_SameType_L1D')
Avg_Amt_SameType_L7D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='7D',NA=0, out_col='Avg_Amt_SameType_L7D')
Avg_Amt_SameType_L14D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='14D',NA=0, out_col='Avg_Amt_SameType_L14D')
Avg_Amt_SameType_L30D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='30D',NA=0, out_col='Avg_Amt_SameType_L30D')
Avg_Amt_SameType_L90D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='Yes',groupby_col='transaction_type', window='90D',NA=0, out_col='Avg_Amt_SameType_L90D')


#### Average Available Balance

In [None]:
%%time

# Apply the function
Avg_Bal_L15M=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='Avg_Bal_L15M')
Avg_Bal_L1H=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='Avg_Bal_L1H')
Avg_Bal_L1D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='Avg_Bal_L1D')
Avg_Bal_L7D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='Avg_Bal_L7D')
Avg_Bal_L14D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='Avg_Bal_L14D')
Avg_Bal_L30D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='Avg_Bal_L30D')
Avg_Bal_L90D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='Avg_Bal_L90D')


#### Average Available Balance Same To Account

In [None]:
%%time

# Apply the function
Avg_Bal_SameToAcc_L15M=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='900S',NA=0, out_col='Avg_Bal_SameToAcc_L15M')
Avg_Bal_SameToAcc_L1H=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='1H',NA=0, out_col='Avg_Bal_SameToAcc_L1H')
Avg_Bal_SameToAcc_L1D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='1D',NA=0, out_col='Avg_Bal_SameToAcc_L1D')
Avg_Bal_SameToAcc_L7D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='7D',NA=0, out_col='Avg_Bal_SameToAcc_L7D')
Avg_Bal_SameToAcc_L14D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='14D',NA=0, out_col='Avg_Bal_SameToAcc_L14D')
Avg_Bal_SameToAcc_L30D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='30D',NA=0, out_col='Avg_Bal_SameToAcc_L30D')
Avg_Bal_SameToAcc_L90D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='Yes',groupby_col='to_account_no', window='90D',NA=0, out_col='Avg_Bal_SameToAcc_L90D')


#### Average Amount to Balance Ratio

In [None]:
%%time

# Set the ratio value to 0 when avaliable_balance is 0 (there are no Null values in amount or available balance)
df['amount_balance_ratio'] = np.where(df['available_balance'] == 0, 0, df['amount']/df['available_balance']).astype('float64')

# Apply the function
Avg_AmtBalRatio_L15M=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='Avg_AmtBalRatio_L15M')
Avg_AmtBalRatio_L1H=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='Avg_AmtBalRatio_L1H')
Avg_AmtBalRatio_L1D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='Avg_AmtBalRatio_L1D')
Avg_AmtBalRatio_L7D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='Avg_AmtBalRatio_L7D')
Avg_AmtBalRatio_L14D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='Avg_AmtBalRatio_L14D')
Avg_AmtBalRatio_L30D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='Avg_AmtBalRatio_L30D')
Avg_AmtBalRatio_L90D=Monetary(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='Avg_AmtBalRatio_L90D')


In [None]:
%%time

# Merge the date intervals tables together with the main table
df = df.merge(Avg_Amt_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df = df.merge(Avg_Amt_SameToAcc_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameToAcc_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameToAcc_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameToAcc_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameToAcc_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameToAcc_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameToAcc_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df = df.merge(Avg_Amt_SameType_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameType_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameType_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameType_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameType_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameType_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Amt_SameType_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df = df.merge(Avg_Bal_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df = df.merge(Avg_Bal_SameToAcc_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_SameToAcc_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_SameToAcc_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_SameToAcc_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_SameToAcc_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_SameToAcc_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_Bal_SameToAcc_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])

df = df.merge(Avg_AmtBalRatio_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_AmtBalRatio_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_AmtBalRatio_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_AmtBalRatio_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_AmtBalRatio_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_AmtBalRatio_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Avg_AmtBalRatio_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])



In [None]:
def MonetaryMax(dataset=None, datetime_col=None, Key=None, groupby=None, amount_col=None ,groupby_type='No',groupby_col=None, window=None,NA=None, out_col=None):
    dataset=dataset.sort_values(by='{datetime_col}'.format(datetime_col=datetime_col), ascending=True)
    if groupby_type == "No":
        df_amt_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='left').max().fillna(NA)})
        df_amt_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_amt_trnx_TJ=df_amt_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_amt_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col)])
    elif groupby_type == "Yes":
        df_amt_trnx=pd.DataFrame({'{out_col}'.format(out_col=out_col):dataset.set_index('{datetime_col}'.format(datetime_col=datetime_col)).sort_index().groupby(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col)])['{amount_col}'.format(amount_col=amount_col)].rolling('{window}'.format(window=window), closed='left').max().fillna(NA)})
        df_amt_trnx.reset_index(['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], inplace=True)
        df_amt_trnx_TJ=df_amt_trnx.drop_duplicates(subset=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], keep='last')
        dataset_TJ=dataset[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)]]
        join_data=dataset_TJ.merge(df_amt_trnx_TJ, how='left', left_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)], right_on=['{groupby}'.format(groupby=groupby),'{groupby_col}'.format(groupby_col=groupby_col),'{datetime_col}'.format(datetime_col=datetime_col)])
    dataset_output=join_data[['{Key}'.format(Key=Key),'{groupby}'.format(groupby=groupby),'{datetime_col}'.format(datetime_col=datetime_col),'{out_col}'.format(out_col=out_col)]]
    return dataset_output

#### Maximum Transaction Amount

In [None]:
%%time

# Apply the function
Max_Amt_L15M=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='Max_Amt_L15M')
Max_Amt_L1H=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='Max_Amt_L1H')
Max_Amt_L1D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='Max_Amt_L1D')
Max_Amt_L7D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='Max_Amt_L7D')
Max_Amt_L14D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='Max_Amt_L14D')
Max_Amt_L30D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='Max_Amt_L30D')
Max_Amt_L90D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='Max_Amt_L90D')


In [None]:
Max_Amt_L7D.head()

In [None]:
%%time

df = df.merge(Max_Amt_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Amt_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Amt_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Amt_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Amt_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Amt_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Amt_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])


#### Maximum Available Balance

In [None]:
%%time

# Apply the function
Max_Bal_L15M=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='Max_Bal_L15M')
Max_Bal_L1H=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='Max_Bal_L1H')
Max_Bal_L1D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='Max_Bal_L1D')
Max_Bal_L7D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='Max_Bal_L7D')
Max_Bal_L14D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='Max_Bal_L14D')
Max_Bal_L30D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='Max_Bal_L30D')
Max_Bal_L90D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='available_balance' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='Max_Bal_L90D')


In [None]:
%%time

df = df.merge(Max_Bal_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Bal_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Bal_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Bal_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Bal_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Bal_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_Bal_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])


#### Maximum Amount/Balance Ratio

In [None]:
%%time

df = df.drop(columns = ['Max_AmtBalRatio_L15M','Max_AmtBalRatio_L1H','Max_AmtBalRatio_L1D','Max_AmtBalRatio_L7D',
                       'Max_AmtBalRatio_L14D','Max_AmtBalRatio_L30D','Max_AmtBalRatio_L90D'])

# Apply the function
Max_AmtBalRatio_L15M=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='900S',NA=0, out_col='Max_AmtBalRatio_L15M')
Max_AmtBalRatio_L1H=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='1H',NA=0, out_col='Max_AmtBalRatio_L1H')
Max_AmtBalRatio_L1D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='1D',NA=0, out_col='Max_AmtBalRatio_L1D')
Max_AmtBalRatio_L7D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='7D',NA=0, out_col='Max_AmtBalRatio_L7D')
Max_AmtBalRatio_L14D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='14D',NA=0, out_col='Max_AmtBalRatio_L14D')
Max_AmtBalRatio_L30D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='30D',NA=0, out_col='Max_AmtBalRatio_L30D')
Max_AmtBalRatio_L90D=MonetaryMax(dataset=df, datetime_col='transaction_datetime', Key='transaction_id', groupby='from_account_no', amount_col='amount_balance_ratio' ,groupby_type='No',groupby_col=None, window='90D',NA=0, out_col='Max_AmtBalRatio_L90D')


In [None]:
%%time

df = df.merge(Max_AmtBalRatio_L15M, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_AmtBalRatio_L1H, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_AmtBalRatio_L1D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_AmtBalRatio_L7D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_AmtBalRatio_L14D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_AmtBalRatio_L30D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])
df = df.merge(Max_AmtBalRatio_L90D, how='left', on=['transaction_id','from_account_no','transaction_datetime'])


In [None]:
df.head()

In [None]:
# Save the dataframe to the disk
df.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\MF_Data.csv', index=False)

In [None]:
# Output a Training file

df.loc[df['flag'] != 'TEST'].to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Train001.csv', index=False)

In [None]:
# Output a Validation (TEST) file

df_val = df
df_val['Fraud_val'] = df_val['Fraud']
df_val.loc[df_val['transaction_id'] == 'NT1937830033075700', 'Fraud_val'] = 1
df_val['Fraud'] = df_val['Fraud_val']

df_val.loc[df_val['flag'] == 'TEST'].to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Val002.csv', index=False)

## Cross-Channel

### Mobile Non-Financial

In [None]:
mobile_non_financial.groupby(['transaction_type'], dropna=False).size()

In [None]:
# Create a grouping for transaction_type

mobile_non_financial['transaction_type_group'] = 'Others'

mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['100000201','100000202','100000203','186']) == True, 'transaction_type_group'] = 'Verify Email'
mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['100200601','100200602','107']) == True, 'transaction_type_group'] = 'Set PIN-free Transactions Limit'
mobile_non_financial.loc[mobile_non_financial['transaction_type'] == '035', 'transaction_type_group'] = 'Change International Limit'
mobile_non_financial.loc[mobile_non_financial['transaction_type'] == '052', 'transaction_type_group'] = 'Change Daily Transaction Limit'
mobile_non_financial.loc[mobile_non_financial['transaction_type'] == '069', 'transaction_type_group'] = 'Change Mobile No'
mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['100200801','100200802']) == True, 'transaction_type_group'] = 'Daily Transaction Limit'
mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['100701200','100701201','100701202']) == True, 'transaction_type_group'] = 'Manage Usage Limit'
mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['101700501','101700502']) == True, 'transaction_type_group'] = 'Change Limit Usage'
mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['401501401','401501402','401501403','401501404','401501405','401501406','401501409']) == True, 'transaction_type_group'] = 'Change Mobile-Contact Address'
mobile_non_financial.loc[mobile_non_financial['transaction_type'] == '500100100', 'transaction_type_group'] = 'Request Activation Code'
mobile_non_financial.loc[mobile_non_financial['transaction_type'].isin(['109','110','111','113','157']) == True, 'transaction_type_group'] = 'Activation'

mobile_non_financial.groupby(['transaction_type_group'] ,dropna=False).size()


In [None]:
print(mobile_non_financial['transaction_id'].nunique())
print(mobile_non_financial.shape[0])

#### Remove Duplicate Transaction ID's

In [None]:
print('Total Records in Non-Financial file:', mobile_non_financial.shape[0])
print('Duplicate transaction_ids:', mobile_non_financial.duplicated(subset = ['transaction_id']).sum())
dfn = mobile_non_financial.loc[mobile_non_financial['transaction_id'].duplicated() == False]
print('Records after removing Duplicate transaction_id: ',dfn.shape[0])

#### Merge the Financial and Non-Financial using merge_asof

In [None]:
%%time

# merge the Non-Financial to the Financial
# this will merge the Financial record with the closest future transaction_datetime onto the Non-Financial transaction record
# will include only transactions within 30 days

dff = df
dff = dff.sort_values(by = ['transaction_datetime'])
dff['transaction_datetime_f'] = dff['transaction_datetime']

dfn = dfn.sort_values(by = ['transaction_datetime'])
dfn['transaction_datetime_nf'] = dfn['transaction_datetime']

dfn = pd.merge_asof(dfn, dff,
                           left_on = ['transaction_datetime_nf'], right_on = ['transaction_datetime_f'],
                           left_by = ['customer_id'], right_by = ['customer_id'],
                            tolerance = pd.Timedelta(days = 30),
                            direction = 'forward',
                           suffixes = ['_non_fin','_fin'])



In [None]:
dfn.shape

In [None]:
dfn.head()

In [None]:
print('Matches: ', dfn.loc[dfn['account_number'].isna() == False].shape[0])
print('Non-Matches: ', dfn.loc[dfn['account_number'].isna() == True].shape[0])

In [None]:
dfn['transaction_id_fin'].nunique()

#### Non-Financial Feature Engineering

In [None]:
# number of c01 transactions
a01 = pd.DataFrame(dfn.groupby(by = ['transaction_id_fin']).size()).reset_index().rename(columns = {0:'Txn_Count_NF_L30D'})


In [None]:
a01.head()

In [None]:
%%time

# number of transactions of top 4 transaction category codes
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Change Mobile-Contact Address'].groupby(by = ['transaction_id_fin']).size()).fillna(-1).reset_index().rename(columns = {0:'Txn_Count_NF_01_L30D'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Daily Transaction Limit'].groupby(by = ['transaction_id_fin']).size()).fillna(-1).reset_index().rename(columns = {0:'Txn_Count_NF_02_L30D'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Activation'].groupby(by = ['transaction_id_fin']).size()).fillna(-1).reset_index().rename(columns = {0:'Txn_Count_NF_03_L30D'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Verify Email'].groupby(by = ['transaction_id_fin']).size()).fillna(-1).reset_index().rename(columns = {0:'Txn_Count_NF_04_L30D'}), on = ['transaction_id_fin'], how = 'left')


# time of last NF transaction
a01 = a01.merge(pd.DataFrame(dfn.groupby(by = ['transaction_id_fin'])['transaction_datetime_nf'].max()).reset_index().rename(columns = {'transaction_datetime_nf':'TimeLastTxn_NF'}), on = ['transaction_id_fin'], how = 'left')

# time of last transaction of top 4 transaction category codes
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Change Mobile-Contact Address'].groupby(by = ['transaction_id_fin'])['transaction_datetime_nf'].max()).reset_index().rename(columns = {'transaction_datetime_nf':'TimeLastTxn_NF_01'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Daily Transaction Limit'].groupby(by = ['transaction_id_fin'])['transaction_datetime_nf'].max()).reset_index().rename(columns = {'transaction_datetime_nf':'TimeLastTxn_NF_02'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Activation'].groupby(by = ['transaction_id_fin'])['transaction_datetime_nf'].max()).reset_index().rename(columns = {'transaction_datetime_nf':'TimeLastTxn_NF_03'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.loc[dfn['transaction_type_group'] == 'Verify Email'].groupby(by = ['transaction_id_fin'])['transaction_datetime_nf'].max()).reset_index().rename(columns = {'transaction_datetime_nf':'TimeLastTxn_NF_04'}), on = ['transaction_id_fin'], how = 'left')


# number of unique transaction types/ip addresses
a01 = a01.merge(pd.DataFrame(dfn.groupby(by = ['transaction_id_fin'])['transaction_type_non_fin'].nunique()).fillna(-1).reset_index().rename(columns = {'transaction_type_non_fin':'NF_Unique_Transaction_Type_L30D'}), on = ['transaction_id_fin'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfn.groupby(by = ['transaction_id_fin'])['ip_address_non_fin'].nunique()).fillna(-1).reset_index().rename(columns = {'ip_address_non_fin':'NF_Unique_IP_Address_L30D'}), on = ['transaction_id_fin'], how = 'left')

a01['Txn_Count_NF_01_L30D'] = a01['Txn_Count_NF_01_L30D'].fillna(0)
a01['Txn_Count_NF_02_L30D'] = a01['Txn_Count_NF_02_L30D'].fillna(0)
a01['Txn_Count_NF_03_L30D'] = a01['Txn_Count_NF_03_L30D'].fillna(0)
a01['Txn_Count_NF_04_L30D'] = a01['Txn_Count_NF_04_L30D'].fillna(0)



In [None]:
a01.head()

#### Merge back to Financial Transactions

In [None]:
print(dff.shape[0])
print(dff['transaction_id'].nunique())
print(a01.shape[0])
print(a01['transaction_id_fin'].nunique())

In [None]:
fraud_mobile = dff.merge(a01,
                         left_on = ['transaction_id'],
                         right_on = ['transaction_id_fin'],
                         how = 'left',
                         indicator = 'merge_f_mf_a01')

In [None]:
fraud_mobile.head()

In [None]:
fraud_mobile.groupby(by = ['merge_f_mf_a01','flag'], dropna=False).size()

In [None]:
# Fill NaN with -1

fraud_mobile['Txn_Count_NF_L30D'] = fraud_mobile['Txn_Count_NF_L30D'].fillna(-1)
fraud_mobile['Txn_Count_NF_01_L30D'] = fraud_mobile['Txn_Count_NF_01_L30D'].fillna(-1)
fraud_mobile['Txn_Count_NF_02_L30D'] = fraud_mobile['Txn_Count_NF_02_L30D'].fillna(-1)
fraud_mobile['Txn_Count_NF_03_L30D'] = fraud_mobile['Txn_Count_NF_03_L30D'].fillna(-1)
fraud_mobile['Txn_Count_NF_04_L30D'] = fraud_mobile['Txn_Count_NF_04_L30D'].fillna(-1)

fraud_mobile['NF_Unique_Transaction_Type_L30D'] = fraud_mobile['NF_Unique_Transaction_Type_L30D'].fillna(-1)
fraud_mobile['NF_Unique_IP_Address_L30D'] = fraud_mobile['NF_Unique_IP_Address_L30D'].fillna(-1)


In [None]:
fraud_mobile.head()

### Deposit Transactions

In [None]:
deposit_transaction.head()

In [None]:
deposit_transaction.groupby(['record_type'], dropna=False).size()

In [None]:
print(deposit_transaction['teller_id'].nunique())
print(deposit_transaction['customer_id'].nunique())
print(deposit_transaction['account_number'].nunique())
print(deposit_transaction.shape[0])

In [None]:
deposit_transaction_account_numbers = deposit_transaction.loc[deposit_transaction.duplicated() == False]
print(deposit_transaction_account_numbers.shape[0])

In [None]:
%%time
print('Duplicated Deposit Records: ',deposit_transaction.duplicated().sum())
print('Total Deposit Records: ',deposit_transaction.shape[0])
dfd = deposit_transaction.loc[deposit_transaction.duplicated() == False]
print('Records after removing duplicate records: ',dfd.shape[0])

In [None]:
test = dfd.merge(fraud4,
                on = ['customer_id','account_number'],
                how = 'left',
                indicator = 'merge_dep_f')


In [None]:
test.groupby(['flag'], dropna = False).size()

In [None]:
test.groupby(['merge_dep_f'], dropna = False).size()

In [None]:
test.info()

In [None]:
test.loc[(test['end_date'].isna() == False) & (test['end_date'] >= test['transaction_datetime'])].groupby(['flag'], dropna=False).size()

In [None]:
# Apply Fraud Transaction label based on end_date

test['fraud_txn'] = 0

test.loc[(test['end_date'].isna() == False) & (test['transaction_datetime'] >= test['end_date']), 'fraud_txn'] = 1

test.loc[(test['end_date'].isna() == False) & (test['transaction_datetime'] < test['end_date']), 'fraud_txn'] = 2

test.groupby(['fraud_txn'], dropna=False).size()


In [None]:
test.shape[0]

In [None]:
test.groupby(['flag','fraud_txn'], dropna=False).size()

In [None]:
test = deposit_transaction.merge(fraud_mobile[['customer_id','from_account_no','transaction_datetime','amount','transaction_id']],
                 left_on = ['customer_id','account_number','transaction_datetime','amount'],
                 right_on = ['customer_id','from_account_no','transaction_datetime','amount'],
                 how = 'left',
                 indicator = 'merge_dep_mob')

print(test.shape[0])
print(test.groupby(['merge_dep_mob'], dropna=False).size())

In [None]:
test = test.merge(reference_deposit, on = ['customer_id','account_number'],
                 how = 'left',
                 indicator = 'merge_test_ref')

test.groupby(['merge_test_ref'], dropna=False).size()

In [None]:
test.head()

In [None]:
test['account_age'] = (test['transaction_datetime'] - pd.to_datetime(test['account_open_date']))//np.timedelta64(1,'D')
test['account_age'].describe()

In [None]:
print('Total Deposit Transactions: ',test.shape[0])
print('Total Mobile Transactions: ',test.loc[test['transaction_id'].isna() == False].shape[0])

deposit_transaction_ex = test.loc[test['transaction_id'].isna() == True]

print('Remaining Deposit Transactions: ',deposit_transaction_ex.shape[0])

In [None]:
deposit_transaction_ex.info(show_counts=True)

In [None]:
deposit_transaction_ex['account_age'].notna().sum()

In [None]:
%%time

# merge the Deposits to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the Deposit transaction record
# will include only transactions within 30 days

dff = fraud_mobile
dff = dff.sort_values(by = ['transaction_datetime_f'])

dfd = deposit_transaction_ex.drop(columns = ['from_account_no','transaction_id','merge_dep_mob']).sort_values(by = ['transaction_datetime'])

dfd = pd.merge_asof(dfd, dff,
                           left_on = ['transaction_datetime'], right_on = ['transaction_datetime_f'],
                           left_by = ['customer_id'], right_by = ['customer_id'],
                            tolerance = pd.Timedelta(days = 30),
                            direction = 'forward',
                           suffixes = ['_dep','_mob'])



In [None]:
print(dfd.shape[0])
print(dfd['transaction_id'].nunique())

In [None]:
dfd.head()

In [None]:
dfd.info(verbose=True)

In [None]:
dfd['account_age'].isna().sum()

In [None]:
# Make an Amount/Balance Ratio field
dfd['amount_balance_ratio_dep'] = np.where(dfd['balance'] == 0, 0, dfd['amount_dep']/dfd['balance'])

In [None]:
%%time

# number of deposit transactions
a01 = pd.DataFrame(dfd.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Dep_L30D'})
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
dfd['transaction_datetime_dep'].isna().sum()

In [None]:
%%time

# number of transactions of top record types
a01 = a01.merge(pd.DataFrame(dfd.loc[dfd['record_type'] == '40'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Dep_40_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.loc[dfd['record_type'] == '20'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Dep_20_L30D'}), on = ['transaction_id'], how = 'left')

# date of last deposit transaction
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['transaction_datetime_dep'].max()).reset_index().rename(columns = {'transaction_datetime_dep':'DateLastTxn_Dep'}), on = ['transaction_id'], how = 'left')

# number of unique teller id/transaction code/record type/amount/branch
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['teller_id'].nunique()).reset_index().rename(columns = {'teller_id':'Dep_Unique_TellerID_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['transaction_code'].nunique()).reset_index().rename(columns = {'transaction_code':'Dep_Unique_TransCode_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['amount_dep'].nunique()).reset_index().rename(columns = {'amount_dep':'Dep_Unique_Amount_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['record_type'].nunique()).reset_index().rename(columns = {'record_type':'Dep_Unique_RecordType_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['transaction_branch'].nunique()).reset_index().rename(columns = {'transaction_branch':'Dep_Unique_Branch_L30D'}), on = ['transaction_id'], how = 'left')


# Average/Max Deposit Amount, Balance
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['amount_dep'].mean()).reset_index().rename(columns = {'amount_dep':'Dep_AvgAmoutL30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['amount_dep'].max()).reset_index().rename(columns = {'amount_dep':'Dep_MaxAmount_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['balance'].mean()).reset_index().rename(columns = {'balance':'Dep_AvgBalance_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['balance'].max()).reset_index().rename(columns = {'balance':'Dep_MaxBalance_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['balance'].min()).reset_index().rename(columns = {'balance':'Dep_MinBalance_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['amount_balance_ratio_dep'].mean()).reset_index().rename(columns = {'amount_balance_ratio_dep':'Dep_AvgAmtBalRatio_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['amount_balance_ratio_dep'].max()).reset_index().rename(columns = {'amount_balance_ratio_dep':'Dep_MaxAmtBalRatio_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['amount_balance_ratio_dep'].min()).reset_index().rename(columns = {'amount_balance_ratio_dep':'Dep_MinAmtBalRatio_L30D'}), on = ['transaction_id'], how = 'left')

# Age of Account
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['account_age'].mean()).reset_index().rename(columns = {'account_age':'Dep_AvgAcctAge_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['account_age'].max()).reset_index().rename(columns = {'account_age':'Dep_MaxAcctAge_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(dfd.groupby(by = ['transaction_id'])['account_age'].min()).reset_index().rename(columns = {'account_age':'Dep_MinAcctAge_L30D'}), on = ['transaction_id'], how = 'left')



In [None]:
a01.head()

In [None]:
a01['transaction_id'].nunique()

In [None]:
a01.shape

In [None]:
# Check the memory usage
import sys

ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

sorted([(x, sys.getsizeof(globals().get(x))*0.00000095367432) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [None]:
del(dfn)

In [None]:
import gc
gc.collect()

In [None]:
fraud_mobile.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\fraud_mobile.csv', index=False)

#### Merge to Mobile

In [None]:
fraud_mobile_dep = fraud_mobile.merge(a01.drop(columns = ['Txn_Count_Dep_40_L30D_y','Txn_Count_Dep_20_L30D_y']).rename(columns = {'Txn_Count_Dep_40_L30D_x':'Txn_Count_Dep_40_L30D',
                                                                                                                                 'Txn_Count_Dep_20_L30D_x':'Txn_Count_Dep_20_L30D'}),
                         left_on = ['transaction_id'],
                         right_on = ['transaction_id'],
                         how = 'left',
                         indicator = 'merge_mob_dep')

In [None]:
fraud_mobile_dep.groupby(by = ['merge_mob_dep','flag','Fraud'], dropna=False).size()

In [None]:
print(fraud_mobile_dep.shape[0])
print(fraud_mobile_dep['customer_id'].nunique())
print(fraud_mobile_dep['transaction_id'].nunique())

In [None]:
print(fraud_mobile_financial.shape[0])
print(fraud_mobile_financial['customer_id'].nunique())
print(fraud_mobile_financial['transaction_id'].nunique())

In [None]:
fraud_mobile_dep.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\fraud_mobile_dep.csv', index=False)

In [None]:
fraud_mobile_dep.head()

#### Add Reference Customer

In [None]:
reference_customer.columns

In [None]:
fraud_mobile_dep_cust = fraud_mobile_dep.merge(reference_customer,
                                       on = ['customer_id'],
                                       how = 'left',
                                      indicator = 'merge_mob_dep_cust',
                             suffixes = [None, '_refcust'])

In [None]:
print(fraud_mobile_dep_cust.shape[0])
print(fraud_mobile_dep_cust['customer_id'].nunique())
print(fraud_mobile_dep_cust['transaction_id'].nunique())
print(fraud_mobile_dep_cust['account_number'].nunique())

In [None]:
fraud_mobile_dep_cust.groupby(['merge_mob_dep_cust'], dropna=False).size()

#### Add Reference Deposit

In [None]:
reference_deposit.columns

In [None]:
print(reference_deposit.shape[0])
print(reference_deposit['account_number'].nunique())

In [None]:
data = fraud_mobile_dep_cust.merge(reference_deposit,
                  on = ['account_number'],
                  how = 'left',
                  indicator = 'merge_mob_dep_cust_dep',
                 suffixes = [None, '_refdep'])

In [None]:
print(data.shape[0])
print(data[['customer_id','account_number']].nunique())
print(data['transaction_id'].nunique())

In [None]:
data.groupby(['merge_mob_dep_cust_dep']).size()

In [None]:
del(fraud_mobile_dep_cust)
gc.collect()

#### Add Employee

In [None]:
data = data.merge(reference_employee,
                  on = ['customer_id'],
                  how = 'left',
                  indicator = 'merge_employee')

In [None]:
reference_employee.shape

In [None]:
data.groupby(['merge_employee'], dropna=False).size()

#### Extra Calculations

In [None]:
# create the "time since last" calculations
data['TimeSinceLast_NF'] = (data['transaction_datetime'] - pd.to_datetime(data['TimeLastTxn_NF']))//np.timedelta64(1,'m')
data['TimeSinceLast_NF01'] = (data['transaction_datetime'] - pd.to_datetime(data['TimeLastTxn_NF_01']))//np.timedelta64(1,'m')
data['TimeSinceLast_NF02'] = (data['transaction_datetime'] - pd.to_datetime(data['TimeLastTxn_NF_02']))//np.timedelta64(1,'m')
data['TimeSinceLast_NF03'] = (data['transaction_datetime'] - pd.to_datetime(data['TimeLastTxn_NF_03']))//np.timedelta64(1,'m')
data['TimeSinceLast_NF04'] = (data['transaction_datetime'] - pd.to_datetime(data['TimeLastTxn_NF_04']))//np.timedelta64(1,'m')

data['DaysSinceLast_Dep'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_Dep']))//np.timedelta64(1,'D')

data['Customer_Age'] = (pd.to_datetime(data['transaction_datetime']) - pd.to_datetime(data['date_of_birth']))//np.timedelta64(1,'Y')

data['Deposit_Account_Age'] = (pd.to_datetime(data['transaction_datetime']) - pd.to_datetime(data['account_open_date']))//np.timedelta64(1,'D')

In [None]:
data.info(verbose=True)

In [None]:
data.head()

In [None]:
data.groupby(['Fraud'])['Deposit_Account_Age'].mean()

#### Save the Data to Disk

In [None]:
data.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\data.csv', index=False)

#### Read the Data from Disk

In [None]:
date_fields = ['end_date',
'transaction_datetime',
'time_row_before',
'time_row_before_transaction_type',
'time_row_before_from_account_to_account',
'time_row_before_to_account',
'time_row_before_to_bank_code',
'time_row_before_any_id_type',    
'transaction_datetime_f',
'TimeLastTxn_NF',
'TimeLastTxn_NF_01',
'TimeLastTxn_NF_02',
'TimeLastTxn_NF_03',
'TimeLastTxn_NF_04',  
'DateLastTxn_Dep',
'date_of_birth',
'account_open_date']



In [None]:
data = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\data.csv',
                  parse_dates = date_fields)

In [None]:
data['time_row_before_any_id_type'].describe(datetime_is_numeric=True)

In [None]:
data.info(verbose=True,show_counts=True)

#### Channel Transactions

In [None]:
channel_transaction.shape

In [None]:
channel_transaction.info(show_counts=True)

In [None]:
channel_transaction['from_account_number'].nunique()

In [None]:
data[['customer_id','from_account_no','transaction_id','transaction_datetime']].merge(channel_transaction[['customer_id','from_account_number','transaction_datetime']],
                                                left_on = ['customer_id','from_account_no','transaction_datetime'],
                                                right_on = ['customer_id','from_account_number','transaction_datetime'],
                                                how = 'right',
                                                indicator = 'merge_mob_channel').groupby(['merge_mob_channel']).size()


In [None]:
channel_transaction.head()

In [None]:
channel_non_mobile = channel_transaction.merge(data[['transaction_id','from_account_no','customer_id','transaction_datetime','amount']],
                         left_on = ['customer_id','from_account_number','transaction_datetime','amount'],
                         right_on = ['customer_id','from_account_no','transaction_datetime','amount'],
                         how = 'left')

channel_non_mobile = channel_non_mobile.loc[channel_non_mobile['transaction_id'].isna() == True]

In [None]:
channel_non_mobile.shape[0]

In [None]:
%%time

# merge the Channel to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the channel transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
channel_non_mobile = channel_non_mobile.sort_values(by = ['transaction_datetime'])

channel_non_mobile = pd.merge_asof(channel_non_mobile.drop(columns = ['from_account_no','transaction_id']),
                                   data[['transaction_id','transaction_datetime','customer_id','from_account_no']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['customer_id'], right_by = ['customer_id'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_chan', None])

channel_non_mobile.head()



In [None]:
channel_non_mobile['transaction_id'].nunique()

In [None]:
channel_non_mobile['transaction_id'].notna().sum()

In [None]:
channel_non_mobile.shape[0]

In [None]:
%%time

# number of channel transactions
a01 = pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_ChannelNM_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
%%time

# number of transactions of specific service types
a01 = a01.merge(pd.DataFrame(channel_non_mobile.loc[channel_non_mobile['service_type'] == 'MOB'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Ch_MOB_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(channel_non_mobile.loc[channel_non_mobile['service_type'] == 'INT'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Ch_INT_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(channel_non_mobile.loc[channel_non_mobile['service_type'] == 'BIT'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Ch_BIT_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(channel_non_mobile.loc[channel_non_mobile['channel'] == '74'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Ch_74_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(channel_non_mobile.loc[channel_non_mobile['channel'] == '76'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Ch_76_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)


# date of last channel transaction
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_Ch'}), on = ['transaction_id'], how = 'left')

# number of unique to_bank_code/transaction code/channel/amount/terminal_bank_id/terminal_id
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['to_bank_code'].nunique()).reset_index().rename(columns = {'to_bank_code':'Ch_Unique_ToBank_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['transaction_code'].nunique()).reset_index().rename(columns = {'transaction_code':'Ch_Unique_TransCode_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['amount'].nunique()).reset_index().rename(columns = {'amount':'Ch_Unique_Amount_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['channel'].nunique()).reset_index().rename(columns = {'channel':'Ch_Unique_Channel_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['terminal_bank_id'].nunique()).reset_index().rename(columns = {'terminal_bank_id':'Ch_Unique_TermBankId_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['terminal_id'].nunique()).reset_index().rename(columns = {'terminal_id':'Ch_Unique_TermId_L30D'}), on = ['transaction_id'], how = 'left')


# Average/Max Amount
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['amount'].mean()).reset_index().rename(columns = {'amount':'Ch_AvgAmoutL30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(channel_non_mobile.groupby(by = ['transaction_id'])['amount'].max()).reset_index().rename(columns = {'amount':'Ch_MaxAmount_L30D'}), on = ['transaction_id'], how = 'left')


In [None]:
a01.head()

In [None]:
# Merge back to the Mobile Financial transactions

data = data.merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_channel')

In [None]:
data.info(verbose=True)

#### Deposit Customer Maintenance

In [None]:
deposit_customer_maintenance.info(show_counts=True)

In [None]:
deposit_customer_maintenance.head()

In [None]:
deposit_customer_maintenance.groupby(['action_code'], dropna=False).size()

In [None]:
%%time

# merge the Channel to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the channel transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
deposit_customer_maintenance = deposit_customer_maintenance.sort_values(by = ['transaction_datetime'])

deposit_customer_maintenance = pd.merge_asof(deposit_customer_maintenance,
                                   data[['transaction_id','transaction_datetime','customer_id']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['customer_id'], right_by = ['customer_id'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_dcm', None])

deposit_customer_maintenance.head()

In [None]:
%%time

# number of DCM transactions
a01 = pd.DataFrame(deposit_customer_maintenance.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_DCM_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
%%time

# number of transactions of specific service types
a01 = a01.merge(pd.DataFrame(deposit_customer_maintenance.loc[deposit_customer_maintenance['action_code'] == 'A'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_DCM_A_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(deposit_customer_maintenance.loc[deposit_customer_maintenance['action_code'] == 'U'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_DCM_U_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)

# date of last DCM transaction
a01 = a01.merge(pd.DataFrame(deposit_customer_maintenance.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_DCM'}), on = ['transaction_id'], how = 'left')

# number of unique action_code/user_id/update_field_code
a01 = a01.merge(pd.DataFrame(deposit_customer_maintenance.groupby(by = ['transaction_id'])['action_code'].nunique()).reset_index().rename(columns = {'action_code':'DCM_Unique_ActionCode_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(deposit_customer_maintenance.groupby(by = ['transaction_id'])['user_id'].nunique()).reset_index().rename(columns = {'user_id':'DCM_Unique_UserId_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(deposit_customer_maintenance.groupby(by = ['transaction_id'])['update_field_code'].nunique()).reset_index().rename(columns = {'update_field_code':'DCM_Unique_UFieldCode_L30D'}), on = ['transaction_id'], how = 'left')


In [None]:
a01.head()

In [None]:
a01.shape

In [None]:
# Merge back to the Mobile Financial transactions

data = data.merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_DCM')

In [None]:
# Make the Time Since fields

data['TimeSinceLast_DCM'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_DCM']))//np.timedelta64(1,'D')
data['TimeSinceLast_Ch'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_Ch']))//np.timedelta64(1,'D')


#### Card Transactions

In [None]:
card_transaction.head()

In [None]:
card_transaction['transaction_datetime'].describe(datetime_is_numeric=True)

In [None]:
data['customer_id'].nunique()

In [None]:
x = data[['transaction_id','transaction_datetime','customer_id']].merge(card_daily,
                                                                   on = ['customer_id'],
                                                                   how = 'left',
                                                                   indicator = "merge_card")
x.loc[x['merge_card'] == 'both']['customer_id'].nunique()

#### Only 4 matching customer_ids - OMIT card data


In [None]:
%%time

# merge the Card Transactions to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the card transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
card_transaction = card_transaction.sort_values(by = ['transaction_datetime'])

card_transaction = pd.merge_asof(card_transaction,
                                 data[['transaction_id','transaction_datetime','customer_id']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['customer_id'], right_by = ['customer_id'],
              tolerance = pd.Timedelta(days = 90),
              direction = 'forward',
              suffixes = ['_card', None])

card_transaction.head()

In [None]:
%%time

# number of DCM transactions
a01 = pd.DataFrame(card_transaction.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
%%time

# number of transactions of specific things
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['source_currency'] == 'THB'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_THB_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['source_currency'] != 'THB'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_NotTHB_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['pos_condition_code'] == '59'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_eComm_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['domestic'] == 0].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_Dom0_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['country_code'] == '764'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_Country764_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['country_code'] != '764'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_CountryNot764_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['pos_entry_mode'] == '01'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_PEM01_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['pos_entry_mode'] == '05'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_PEM05_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['pos_entry_mode'] == '07'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_PEM07_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['pos_entry_mode'] == '10'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_PEM10_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['decision'] == 'Approve'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_Approve_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['decision'] == 'Reject'].groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_Card_Reject_L30D'}), on = ['transaction_id'], how = 'left').fillna(0)


# date of last channel transaction
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_Card'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['decision'] == 'Approve'].groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_CardApp'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.loc[card_transaction['decision'] == 'Reject'].groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_CardRej'}), on = ['transaction_id'], how = 'left')


# number of unique to_bank_code/transaction code/channel/amount/terminal_bank_id/terminal_id
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['merchant_id'].nunique()).reset_index().rename(columns = {'merchant_id':'Card_Unique_MerchIdL30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['merchant_category_code'].nunique()).reset_index().rename(columns = {'merchant_category_code':'Card_Unique_MCC_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['source_currency'].nunique()).reset_index().rename(columns = {'source_currency':'Card_Unique_Currency_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['country_code'].nunique()).reset_index().rename(columns = {'country_code':'Card_Unique_CountryCode_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['card_acceptor_terminal'].nunique()).reset_index().rename(columns = {'card_acceptor_terminal':'Card_Unique_Terminal_L30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['pos_mode'].nunique()).reset_index().rename(columns = {'pos_mode':'Card_Unique_POSMode_L30D'}), on = ['transaction_id'], how = 'left')


# Average/Max Amount
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['card_holder_billing_amount'].mean()).reset_index().rename(columns = {'card_holder_billing_amount':'Card_AvgAmoutL30D'}), on = ['transaction_id'], how = 'left')
a01 = a01.merge(pd.DataFrame(card_transaction.groupby(by = ['transaction_id'])['card_holder_billing_amount'].max()).reset_index().rename(columns = {'card_holder_billing_amount':'Card_MaxAmount_L30D'}), on = ['transaction_id'], how = 'left')


#### Auto Loan Applications

In [None]:
auto_application.head()

In [None]:
print(auto_application['citizen_id'].nunique())
print(auto_application.shape[0])

print(data['citizen_id'].nunique())
print(data.shape[0])

print(data['customer_id'].nunique())

In [None]:
data.info(verbose=True)

In [None]:
x = data[['citizen_id','transaction_id','customer_id']].merge(auto_application,
                                                         on = ['citizen_id'],
                                                         how = 'left',
                                                         indicator = 'merge_auto')

x.loc[x['merge_auto'] == 'both']['customer_id'].nunique()

In [None]:
x.loc[x['merge_auto'] == 'both']['transaction_id'].nunique()

In [None]:
x.shape[0]

In [None]:
%%time

# merge the auto applications to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the auto application record
# will include only transactions within 180 days

data = data.sort_values(by = ['transaction_datetime'])
auto_application = auto_application.sort_values(by = ['request_date'])

auto_application = pd.merge_asof(auto_application,
                                 data[['transaction_id','transaction_datetime','customer_id','citizen_id']],
              left_on = ['request_date'], right_on = ['transaction_datetime'],
              left_by = ['citizen_id'], right_by = ['citizen_id'],
              tolerance = pd.Timedelta(days = 180),
              direction = 'forward',
              suffixes = ['_auto', None])

auto_application.head()

In [None]:
auto_application['transaction_id'].notna().sum()

In [None]:
auto_application['customer_id'].nunique()

#### < 200 transactions where Auto Loan Application submitted within 180 days - OMIT auto loan data

#### deposit_idcardtracking

In [None]:
deposit_idcardtracking.head()

In [None]:
deposit_idcardtracking.shape

In [None]:
del(x)
import gc
gc.collect()

In [None]:
%%time

# merge the Card Transactions to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the card transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
deposit_idcardtracking = deposit_idcardtracking.sort_values(by = ['transaction_datetime'])

deposit_idcardtracking = pd.merge_asof(deposit_idcardtracking,
                                       data[['transaction_id','transaction_datetime','customer_id','from_account_no','citizen_id']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['IDCARDNO'], right_by = ['citizen_id'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_idc', None])

deposit_idcardtracking.head()

In [None]:
deposit_idcardtracking['transaction_id'].notna().sum()

In [None]:
deposit_idcardtracking.shape[0]

In [None]:
%%time

# number of DCM transactions
a01 = pd.DataFrame(deposit_idcardtracking.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_IDC_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
%%time

# date of last idcard transaction
a01 = a01.merge(pd.DataFrame(deposit_idcardtracking.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_IDC'}), on = ['transaction_id'], how = 'left')

a01.head()

In [None]:
# Merge back to the Mobile Financial transactions

data = data.merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_IDC')

In [None]:
# Make the Time Since fields

data['TimeSinceLast_IDC'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_IDC']))//np.timedelta64(1,'D')


#### Deposit SD Activity

In [None]:
deposit_sd_activity.head()

In [None]:
%%time

# merge the Card Transactions to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the card transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
deposit_sd_activity = deposit_sd_activity.sort_values(by = ['transaction_datetime'])

deposit_sd_activity = pd.merge_asof(deposit_sd_activity,
                                       data[['transaction_id','transaction_datetime','customer_id','from_account_no','citizen_id']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['tmb_account_id'], right_by = ['from_account_no'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_sd', None])

deposit_sd_activity.head()

In [None]:
deposit_sd_activity['transaction_id'].notna().sum()

In [None]:
deposit_sd_activity.shape

In [None]:
%%time

# number of DCM transactions
a01 = pd.DataFrame(deposit_sd_activity.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_SD_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
%%time

# date of last idcard transaction
a01 = a01.merge(pd.DataFrame(deposit_sd_activity.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_SD'}), on = ['transaction_id'], how = 'left')

a01.head()

In [None]:
%%time

# Merge back to the Mobile Financial transactions

data = data.merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_SD')

In [None]:
# Make the Time Since fields

data['TimeSinceLast_SD'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_SD']))//np.timedelta64(1,'D')


#### Deposit Cheque Transaction

In [None]:
deposit_cheque_transaction.head()

In [None]:
deposit_cheque_transaction.shape

In [None]:
%%time

# merge the Card Transactions to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the card transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
deposit_cheque_transaction = deposit_cheque_transaction.sort_values(by = ['clearing_date'])

deposit_cheque_transaction = pd.merge_asof(deposit_cheque_transaction,
                    data[['transaction_id','transaction_datetime','customer_id','from_account_no']],
              left_on = ['clearing_date'], right_on = ['transaction_datetime'],
              left_by = ['payin_account'], right_by = ['from_account_no'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_chq1', None])

deposit_cheque_transaction.head()

In [None]:
deposit_cheque_transaction['transaction_id'].notna().sum()

In [None]:
%%time

# number of CHQ IN transactions
a01 = pd.DataFrame(deposit_cheque_transaction.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_CHQIN_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

In [None]:
%%time

# date of last idcard transaction
a01 = a01.merge(pd.DataFrame(deposit_cheque_transaction.groupby(by = ['transaction_id'])['clearing_date'].max()).reset_index().rename(columns = {'clearing_date':'DateLastTxn_CHQIN'}), on = ['transaction_id'], how = 'left')

a01.head()

In [None]:
%%time


# Merge back to the Mobile Financial transactions

data = data.merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_CHQ')

In [None]:
# Make the Time Since fields

data['TimeSinceLast_CHQIN'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_CHQIN']))//np.timedelta64(1,'D')


#### Deposit Change Passbook

In [None]:
deposit_change_passbook.head()

In [None]:
deposit_change_passbook.shape

In [None]:
%%time

# merge the Card Transactions to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the card transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
deposit_change_passbook = deposit_change_passbook.sort_values(by = ['transaction_datetime'])

deposit_change_passbook = pd.merge_asof(deposit_change_passbook,
                    data[['transaction_id','transaction_datetime','customer_id','from_account_no']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['account_number'], right_by = ['from_account_no'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_pass', None])

deposit_change_passbook.head()

In [None]:
%%time

# number of CHQ IN transactions
a01 = pd.DataFrame(deposit_change_passbook.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_PASS_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

# date of last idcard transaction
a01 = a01.merge(pd.DataFrame(deposit_change_passbook.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_PASS'}), on = ['transaction_id'], how = 'left')

a01.head()

In [None]:
%%time


# Merge back to the Mobile Financial transactions

data = data.merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_PASS')

In [None]:
# Make the Time Since fields

data['TimeSinceLast_PASS'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_PASS']))//np.timedelta64(1,'D')


#### Deposit Branch Activity

In [None]:
deposit_branch_activity.head()

In [None]:
deposit_branch_activity = deposit_branch_activity.loc[deposit_branch_activity['transaction_datetime'].isna() == False]

In [None]:
%%time

# merge the Card Transactions to the Mobile
# this will merge the Mobile record with the closest future transaction_datetime onto the card transaction record
# will include only transactions within 30 days

data = data.sort_values(by = ['transaction_datetime'])
deposit_branch_activity = deposit_branch_activity.loc[deposit_branch_activity['to_customer_id'].isna() == False].sort_values(by = ['transaction_datetime'])

deposit_branch_activity = pd.merge_asof(deposit_branch_activity,
                    data[['transaction_id','transaction_datetime','customer_id','from_account_no']],
              left_on = ['transaction_datetime'], right_on = ['transaction_datetime'],
              left_by = ['to_customer_id'], right_by = ['customer_id'],
              tolerance = pd.Timedelta(days = 30),
              direction = 'forward',
              suffixes = ['_branch', None])

deposit_branch_activity.head()

In [None]:
deposit_branch_activity['transaction_id'].notna().sum()

In [None]:
%%time

# number of Branch To transactions
a01 = pd.DataFrame(deposit_branch_activity.groupby(by = ['transaction_id']).size()).reset_index().rename(columns = {0:'Txn_Count_BranchTo_L30D'}).fillna(0)
print(a01.shape[0])
print(a01['transaction_id'].nunique())

# date of last Branch To transaction
a01 = a01.merge(pd.DataFrame(deposit_branch_activity.groupby(by = ['transaction_id'])['transaction_datetime'].max()).reset_index().rename(columns = {'transaction_datetime':'DateLastTxn_BranchTo'}), on = ['transaction_id'], how = 'left')

a01.head()

In [None]:
%%time

# Merge back to the Mobile Financial transactions

data = data.drop(columns = ['merge_BranchFrom','Txn_Count_BranchFrom_L30D', 'DateLastTxn_BranchFrom','merge_BranchFroma',
                            'TimeSinceLast_BranchFrom','DateLastTxn_PASS_y','DateLastTxn_PASS_x','Txn_Count_PASS_L30D_x']).rename(columns = {'Txn_Count_PASS_L30D_y':'Txn_Count_PASS_L30D'}).merge(a01,
                  left_on = ['transaction_id'],
                  right_on = ['transaction_id'],
                  how = 'left',
                  indicator = 'merge_BranchTo')

In [None]:
# Make the Time Since fields

data['TimeSinceLast_BranchTo'] = (data['transaction_datetime'] - pd.to_datetime(data['DateLastTxn_BranchTo']))//np.timedelta64(1,'D')


In [None]:
data.groupby(['merge_BranchTo'], dropna=False).size()

In [None]:
data.head()

#### Fraud Warning

In [None]:
fraud_warning.head()

In [None]:
fraud_warning.shape

In [None]:
fraud_warning['citzn_id'].nunique()

#### NO DATE Field - omit

### Write data to Disk

In [None]:
data.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\MF_Phase2.csv',
           index=False)

### Read data from Disk

In [None]:
date_fields = ['end_date',
'transaction_datetime',
'time_row_before',
'time_row_before_transaction_type',
'time_row_before_from_account_to_account',
'time_row_before_to_account',
'time_row_before_to_bank_code',
'time_row_before_any_id_type',    
'transaction_datetime_f',
'TimeLastTxn_NF',
'TimeLastTxn_NF_01',
'TimeLastTxn_NF_02',
'TimeLastTxn_NF_03',
'TimeLastTxn_NF_04',  
'DateLastTxn_Dep',
'date_of_birth',
'account_open_date']

In [None]:
data = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\MF_Phase2.csv',
                  parse_dates = date_fields)

In [None]:
data.info(verbose=True)

### Feature Selection

In [None]:
pd.DataFrame(data.columns).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\mob_data_columns.csv', index=False)

In [None]:
data.drop_duplicates(subset = ['from_account_no']).groupby(['flag']).size()

In [None]:
data.drop_duplicates(subset = ['from_account_no']).shape

In [None]:
print(data.shape)
df_train = data.loc[data['flag'] != 'TEST']
print(df_train.shape)

In [None]:
df_train_mule = data.loc[data['flag'] != 'TEST'].loc[data['incident_type'] != '[Social Engineering]']
df_train_se = data.loc[data['flag'] != 'TEST'].loc[data['incident_type'] != '[Mule]']
print(df_train_mule.groupby(['flag','incident_type','Fraud'], dropna=False).size())
print(df_train_se.groupby(['flag','incident_type','Fraud'], dropna=False).size())

In [None]:
print(df_train_mule.groupby(['flag','incident_type','fraud_txn'], dropna=False).size())
print(df_train_se.groupby(['flag','incident_type','fraud_txn'], dropna=False).size())

In [None]:
print(data.shape)
df_test = data.loc[data['flag'] == 'TEST']
print(df_test.shape)

df_test['Fraud'] = np.where(df_test['transaction_id'] == 'NT1937830033075700', 1, 0)
df_test.groupby(['Fraud'], dropna=False).size()

In [None]:
df_train_mule.head()

#### Pandas Profiling

In [None]:
profile = ProfileReport(data, title="Exploratory Data Analysis - TTB", minimal = True)

In [None]:
# Write the report to a file
profile.to_file(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\EDA_Mobile.html')

In [None]:
cols = ['to_bank_code',
'transaction_type',
'amount',
'top_up_billers',
'any_id_type',
'available_balance',
'TSLastMFTxn_mins',
'TSLastMFTxn_TransactionType_mins',
'TSLastMFTxn_fromacct_to_acct_mins',
'TSLastMFTxn_toacct_mins',
'TSLastMFTxn_tobankcode_mins',
'TSLastMFTxn_anyidtype_mins',
'MFTxnCount_L15M',
'MFTxnCount_L1H',
'MFTxnCount_L1D',
'MFTxnCount_L7D',
'MFTxnCount_L14D',
'MFTxnCount_L30D',
'MFTxnCount_L90D',
'amount_balance_ratio',
'Unique_To_Account_No_L15M',
'Unique_To_Account_No_L1H',
'Unique_To_Account_No_L1D',
'Unique_To_Account_No_L7D',
'Unique_To_Account_No_L14D',
'Unique_To_Account_No_L30D',
'Unique_To_Account_No_L90D',
'Unique_From_Account_No_L15M',
'Unique_From_Account_No_L1H',
'Unique_From_Account_No_L1D',
'Unique_From_Account_No_L7D',
'Unique_From_Account_No_L14D',
'Unique_From_Account_No_L30D',
'Unique_From_Account_No_L90D',
'Unique_AmountTo_L15M',
'Unique_AmountTo_L1H',
'Unique_AmountTo_L1D',
'Unique_AmountTo_L7D',
'Unique_AmountTo_L14D',
'Unique_AmountTo_L30D',
'Unique_AmountTo_L90D',
'Unique_AmountFrom_L15M',
'Unique_AmountFrom_L1H',
'Unique_AmountFrom_L1D',
'Unique_AmountFrom_L7D',
'Unique_AmountFrom_L14D',
'Unique_AmountFrom_L30D',
'Unique_AmountFrom_L90D',
'MFTxnCountSameType_L15M',
'MFTxnCountSameType_L1H',
'MFTxnCountSameType_L1D',
'MFTxnCountSameType_L7D',
'MFTxnCountSameType_L14D',
'MFTxnCountSameType_L30D',
'MFTxnCountSameType_L90D',
'MFTxnCountSameToAcc_L15M',
'MFTxnCountSameToAcc_L1H',
'MFTxnCountSameToAcc_L1D',
'MFTxnCountSameToAcc_L7D',
'MFTxnCountSameToAcc_L14D',
'MFTxnCountSameToAcc_L30D',
'MFTxnCountSameToAcc_L90D',
'Avg_Amt_L15M',
'Avg_Amt_L1H',
'Avg_Amt_L1D',
'Avg_Amt_L7D',
'Avg_Amt_L14D',
'Avg_Amt_L30D',
'Avg_Amt_L90D',
'Avg_Amt_SameToAcc_L15M',
'Avg_Amt_SameToAcc_L1H',
'Avg_Amt_SameToAcc_L1D',
'Avg_Amt_SameToAcc_L7D',
'Avg_Amt_SameToAcc_L14D',
'Avg_Amt_SameToAcc_L30D',
'Avg_Amt_SameToAcc_L90D',
'Avg_Amt_SameType_L15M',
'Avg_Amt_SameType_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L7D',
'Avg_Amt_SameType_L14D',
'Avg_Amt_SameType_L30D',
'Avg_Amt_SameType_L90D',
'Avg_Bal_L15M',
'Avg_Bal_L1H',
'Avg_Bal_L1D',
'Avg_Bal_L7D',
'Avg_Bal_L14D',
'Avg_Bal_L30D',
'Avg_Bal_L90D',
'Avg_Bal_SameToAcc_L15M',
'Avg_Bal_SameToAcc_L1H',
'Avg_Bal_SameToAcc_L1D',
'Avg_Bal_SameToAcc_L7D',
'Avg_Bal_SameToAcc_L14D',
'Avg_Bal_SameToAcc_L30D',
'Avg_Bal_SameToAcc_L90D',
'Avg_AmtBalRatio_L15M',
'Avg_AmtBalRatio_L1H',
'Avg_AmtBalRatio_L1D',
'Avg_AmtBalRatio_L7D',
'Avg_AmtBalRatio_L14D',
'Avg_AmtBalRatio_L30D',
'Avg_AmtBalRatio_L90D',
'Max_Amt_L15M',
'Max_Amt_L1H',
'Max_Amt_L1D',
'Max_Amt_L7D',
'Max_Amt_L14D',
'Max_Amt_L30D',
'Max_Amt_L90D',
'Max_Bal_L15M',
'Max_Bal_L1H',
'Max_Bal_L1D',
'Max_Bal_L7D',
'Max_Bal_L14D',
'Max_Bal_L30D',
'Max_Bal_L90D',
'Max_AmtBalRatio_L15M',
'Max_AmtBalRatio_L1H',
'Max_AmtBalRatio_L1D',
'Max_AmtBalRatio_L7D',
'Max_AmtBalRatio_L14D',
'Max_AmtBalRatio_L30D',
'Max_AmtBalRatio_L90D',
'Txn_Count_NF_L30D',
'Txn_Count_NF_01_L30D',
'Txn_Count_NF_02_L30D',
'Txn_Count_NF_03_L30D',
'Txn_Count_NF_04_L30D',
'NF_Unique_Transaction_Type_L30D',
'NF_Unique_IP_Address_L30D',
'Txn_Count_Dep_L30D',
'Txn_Count_Dep_40_L30D',
'Txn_Count_Dep_20_L30D',
'Dep_Unique_TellerID_L30D',
'Dep_Unique_TransCode_L30D',
'Dep_Unique_Amount_L30D',
'Dep_Unique_RecordType_L30D',
'Dep_Unique_Branch_L30D',
'Dep_AvgAmoutL30D',
'Dep_MaxAmount_L30D',
'Dep_AvgBalance_L30D',
'Dep_MaxBalance_L30D',
'Dep_MinBalance_L30D',
'Dep_AvgAmtBalRatio_L30D',
'Dep_MaxAmtBalRatio_L30D',
'Dep_MinAmtBalRatio_L30D',
'Dep_AvgAcctAge_L30D',
'Dep_MaxAcctAge_L30D',
'Dep_MinAcctAge_L30D',
'customer_subtype',
'customer_segment',
'occupation',
'account_open_amount',
'account_type',
'TimeSinceLast_NF',
'TimeSinceLast_NF01',
'TimeSinceLast_NF02',
'TimeSinceLast_NF03',
'TimeSinceLast_NF04',
'DaysSinceLast_Dep',
'Customer_Age',
'Deposit_Account_Age',
'Txn_Count_ChannelNM_L30D',
'Txn_Count_Ch_MOB_L30D',
'Txn_Count_Ch_INT_L30D',
'Txn_Count_Ch_BIT_L30D',
'Txn_Count_Ch_74_L30D',
'Txn_Count_Ch_76_L30D',
'Ch_Unique_ToBank_L30D',
'Ch_Unique_TransCode_L30D',
'Ch_Unique_Amount_L30D',
'Ch_Unique_Channel_L30D',
'Ch_Unique_TermBankId_L30D',
'Ch_Unique_TermId_L30D',
'Ch_AvgAmoutL30D',
'Ch_MaxAmount_L30D',
'Txn_Count_DCM_L30D',
'Txn_Count_DCM_A_L30D',
'Txn_Count_DCM_U_L30D',
'DCM_Unique_ActionCode_L30D',
'DCM_Unique_UserId_L30D',
'DCM_Unique_UFieldCode_L30D',
'TimeSinceLast_DCM',
'TimeSinceLast_Ch',
'Txn_Count_IDC_L30D',
'TimeSinceLast_IDC',
'Txn_Count_SD_L30D',
'TimeSinceLast_SD',
'Txn_Count_CHQIN_L30D',
'TimeSinceLast_CHQIN',
'Txn_Count_PASS_L30D',
'TimeSinceLast_PASS',
'Txn_Count_BranchTo_L30D',
'TimeSinceLast_BranchTo']

cols_num = ['amount',
'available_balance',
'TSLastMFTxn_mins',
'TSLastMFTxn_TransactionType_mins',
'TSLastMFTxn_fromacct_to_acct_mins',
'TSLastMFTxn_toacct_mins',
'TSLastMFTxn_tobankcode_mins',
'TSLastMFTxn_anyidtype_mins',
'MFTxnCount_L15M',
'MFTxnCount_L1H',
'MFTxnCount_L1D',
'MFTxnCount_L7D',
'MFTxnCount_L14D',
'MFTxnCount_L30D',
'MFTxnCount_L90D',
'amount_balance_ratio',
'Unique_To_Account_No_L15M',
'Unique_To_Account_No_L1H',
'Unique_To_Account_No_L1D',
'Unique_To_Account_No_L7D',
'Unique_To_Account_No_L14D',
'Unique_To_Account_No_L30D',
'Unique_To_Account_No_L90D',
'Unique_From_Account_No_L15M',
'Unique_From_Account_No_L1H',
'Unique_From_Account_No_L1D',
'Unique_From_Account_No_L7D',
'Unique_From_Account_No_L14D',
'Unique_From_Account_No_L30D',
'Unique_From_Account_No_L90D',
'Unique_AmountTo_L15M',
'Unique_AmountTo_L1H',
'Unique_AmountTo_L1D',
'Unique_AmountTo_L7D',
'Unique_AmountTo_L14D',
'Unique_AmountTo_L30D',
'Unique_AmountTo_L90D',
'Unique_AmountFrom_L15M',
'Unique_AmountFrom_L1H',
'Unique_AmountFrom_L1D',
'Unique_AmountFrom_L7D',
'Unique_AmountFrom_L14D',
'Unique_AmountFrom_L30D',
'Unique_AmountFrom_L90D',
'MFTxnCountSameType_L15M',
'MFTxnCountSameType_L1H',
'MFTxnCountSameType_L1D',
'MFTxnCountSameType_L7D',
'MFTxnCountSameType_L14D',
'MFTxnCountSameType_L30D',
'MFTxnCountSameType_L90D',
'MFTxnCountSameToAcc_L15M',
'MFTxnCountSameToAcc_L1H',
'MFTxnCountSameToAcc_L1D',
'MFTxnCountSameToAcc_L7D',
'MFTxnCountSameToAcc_L14D',
'MFTxnCountSameToAcc_L30D',
'MFTxnCountSameToAcc_L90D',
'Avg_Amt_L15M',
'Avg_Amt_L1H',
'Avg_Amt_L1D',
'Avg_Amt_L7D',
'Avg_Amt_L14D',
'Avg_Amt_L30D',
'Avg_Amt_L90D',
'Avg_Amt_SameToAcc_L15M',
'Avg_Amt_SameToAcc_L1H',
'Avg_Amt_SameToAcc_L1D',
'Avg_Amt_SameToAcc_L7D',
'Avg_Amt_SameToAcc_L14D',
'Avg_Amt_SameToAcc_L30D',
'Avg_Amt_SameToAcc_L90D',
'Avg_Amt_SameType_L15M',
'Avg_Amt_SameType_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L7D',
'Avg_Amt_SameType_L14D',
'Avg_Amt_SameType_L30D',
'Avg_Amt_SameType_L90D',
'Avg_Bal_L15M',
'Avg_Bal_L1H',
'Avg_Bal_L1D',
'Avg_Bal_L7D',
'Avg_Bal_L14D',
'Avg_Bal_L30D',
'Avg_Bal_L90D',
'Avg_Bal_SameToAcc_L15M',
'Avg_Bal_SameToAcc_L1H',
'Avg_Bal_SameToAcc_L1D',
'Avg_Bal_SameToAcc_L7D',
'Avg_Bal_SameToAcc_L14D',
'Avg_Bal_SameToAcc_L30D',
'Avg_Bal_SameToAcc_L90D',
'Avg_AmtBalRatio_L15M',
'Avg_AmtBalRatio_L1H',
'Avg_AmtBalRatio_L1D',
'Avg_AmtBalRatio_L7D',
'Avg_AmtBalRatio_L14D',
'Avg_AmtBalRatio_L30D',
'Avg_AmtBalRatio_L90D',
'Max_Amt_L15M',
'Max_Amt_L1H',
'Max_Amt_L1D',
'Max_Amt_L7D',
'Max_Amt_L14D',
'Max_Amt_L30D',
'Max_Amt_L90D',
'Max_Bal_L15M',
'Max_Bal_L1H',
'Max_Bal_L1D',
'Max_Bal_L7D',
'Max_Bal_L14D',
'Max_Bal_L30D',
'Max_Bal_L90D',
'Max_AmtBalRatio_L15M',
'Max_AmtBalRatio_L1H',
'Max_AmtBalRatio_L1D',
'Max_AmtBalRatio_L7D',
'Max_AmtBalRatio_L14D',
'Max_AmtBalRatio_L30D',
'Max_AmtBalRatio_L90D',
'Txn_Count_NF_L30D',
'Txn_Count_NF_01_L30D',
'Txn_Count_NF_02_L30D',
'Txn_Count_NF_03_L30D',
'Txn_Count_NF_04_L30D',
'NF_Unique_Transaction_Type_L30D',
'NF_Unique_IP_Address_L30D',
'Txn_Count_Dep_L30D',
'Txn_Count_Dep_40_L30D',
'Txn_Count_Dep_20_L30D',
'Dep_Unique_TellerID_L30D',
'Dep_Unique_TransCode_L30D',
'Dep_Unique_Amount_L30D',
'Dep_Unique_RecordType_L30D',
'Dep_Unique_Branch_L30D',
'Dep_AvgAmoutL30D',
'Dep_MaxAmount_L30D',
'Dep_AvgBalance_L30D',
'Dep_MaxBalance_L30D',
'Dep_MinBalance_L30D',
'Dep_AvgAmtBalRatio_L30D',
'Dep_MaxAmtBalRatio_L30D',
'Dep_MinAmtBalRatio_L30D',
'Dep_AvgAcctAge_L30D',
'Dep_MaxAcctAge_L30D',
'Dep_MinAcctAge_L30D',
'account_open_amount',
'TimeSinceLast_NF',
'TimeSinceLast_NF01',
'TimeSinceLast_NF02',
'TimeSinceLast_NF03',
'TimeSinceLast_NF04',
'DaysSinceLast_Dep',
'Customer_Age',
'Deposit_Account_Age',
           
'Txn_Count_ChannelNM_L30D',
'Txn_Count_Ch_MOB_L30D',
'Txn_Count_Ch_INT_L30D',
'Txn_Count_Ch_BIT_L30D',
'Txn_Count_Ch_74_L30D',
'Txn_Count_Ch_76_L30D',
'Ch_Unique_ToBank_L30D',
'Ch_Unique_TransCode_L30D',
'Ch_Unique_Amount_L30D',
'Ch_Unique_Channel_L30D',
'Ch_Unique_TermBankId_L30D',
'Ch_Unique_TermId_L30D',
'Ch_AvgAmoutL30D',
'Ch_MaxAmount_L30D',
'Txn_Count_DCM_L30D',
'Txn_Count_DCM_A_L30D',
'Txn_Count_DCM_U_L30D',
'DCM_Unique_ActionCode_L30D',
'DCM_Unique_UserId_L30D',
'DCM_Unique_UFieldCode_L30D',
'TimeSinceLast_DCM',
'TimeSinceLast_Ch',
'Txn_Count_IDC_L30D',
'TimeSinceLast_IDC',
'Txn_Count_SD_L30D',
'TimeSinceLast_SD',
'Txn_Count_CHQIN_L30D',
'TimeSinceLast_CHQIN',
'Txn_Count_PASS_L30D',
'TimeSinceLast_PASS',
'Txn_Count_BranchTo_L30D',
'TimeSinceLast_BranchTo']

cols_cat = ['to_bank_code',
'transaction_type',
'top_up_billers',
'any_id_type',
'customer_subtype',
'customer_segment',
'occupation',
'account_type']



In [None]:
# Define the optimal binning objects
optb_cat = OptimalBinning(dtype="categorical", solver="cp")
optb_num = OptimalBinning(dtype="numerical", solver="cp")

In [None]:
# Function to create the dataframe for numeric columns
def multi_table_num(col_num, data):
    dfn = pd.DataFrame()
    for i in col_num:
        x = data[i].values
        y = data['Fraud']
        optb_num.fit(x, y)
        dfn1 = optb_num.binning_table.build()
        dfn1['Name'] = i
        dfn = dfn1.append(dfn)
    return dfn   

In [None]:
# Display Results
dfn = multi_table_num(cols_num, df_train_se)

In [None]:
# Save to file
dfn.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\ob_num_data_train_se.txt', sep='\t')

In [None]:
# Function to create the dataframe for categorical columns
def multi_table_cat(col_cat, data):
    dfc = pd.DataFrame()
    for i in col_cat:
        x = data[i].values
        y = data['Fraud']
        optb_cat.fit(x, y)
        dfc1 = optb_cat.binning_table.build()
        dfc1['Name'] = i
        dfc = dfc1.append(dfc)
    return dfc

In [None]:
# Display Results
dfc = multi_table_cat(cols_cat, df_train_se)

In [None]:
# Save to file
dfc.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\ob_cat_data_train_se.txt', sep='\t')

In [None]:
bivariate_tables = dfn.append(dfc)

bivariate_tables.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\CA_Reports_data_train_se.txt', sep='\t')

In [None]:
# 1) Define list of features and categorical ones
y = df_train_se['Fraud']
X = df_train_se[cols]
# 2) Instantiate BinningProcess
binning_process = BinningProcess(
    categorical_variables = cols_cat,
    variable_names = cols)
# 3) Fit and transform dataset
df_binned = binning_process.fit(X, y)

In [None]:
featurelist = df_binned.summary()

featurelist.to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Feature_List_data__train_se.txt', sep='\t')

### Make the Input Files for MLS

#### Mule Target

In [None]:
df_train_mule[['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H']].to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Train_Mule001.csv')

In [None]:
df_train_mule[['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Train_Mule001_RF.csv')

In [None]:
df_train_mule[['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H',
'Unique_Amount_L14D',
'MFTxnCountSameToAcc_L7D',
'Dep_Unique_TellerID_L30D',
'Unique_To_Account_No_L1H',
'Txn_Count_Dep_L30D',
'Avg_Amt_SameToAcc_L7D',
'Unique_Amount_L30D',
'transaction_type',
'Dep_Unique_RecordType_L30D',
'Avg_Amt_SameToAcc_L14D',
'Unique_From_Account_No_L1D',
'Avg_Amt_SameType_L7D',
'Dep_Unique_Amount_L30D',
'MFTxnCountSameType_L7D',
'MFTxnCountSameToAcc_L14D',
'Dep_Unique_Branch_L30D',
'Unique_AmountFrom_L7D',
'Txn_Count_Dep_40_L30D',
'Max_Amt_L7D',
'Avg_Amt_L7D',
'Dep_Unique_TransCode_L30D',
'Unique_Amount_L90D',
'MFTxnCount_L7D',
'Avg_Amt_SameType_L14D',
'Avg_Amt_SameToAcc_L30D',
'Unique_From_Account_No_L7D',
'MFTxnCountSameToAcc_L30D',
'Dep_MaxAmount_L30D',
'Dep_MaxBalance_L30D',
'Avg_Amt_SameToAcc_L90D',
'Avg_Amt_L14D',
'Unique_From_Account_No_L90D',
'MFTxnCountSameType_L14D',
'Unique_To_Account_No_L1D',
'Dep_MinBalance_L30D',
'Unique_From_Account_No_L14D',
'Avg_Amt_SameType_L30D',
'available_balance',
'Dep_AvgAmoutL30D',
'Dep_AvgBalance_L30D',
'Max_Amt_L14D',
'Unique_AmountFrom_L14D',
'Unique_From_Account_No_L30D',
'Avg_Amt_L30D',
'MFTxnCountSameToAcc_L90D',
'DaysSinceLast_Dep',
'occupation',
'MFTxnCount_L14D',
'Unique_To_Account_No_L90D',
'Customer_Age',
'MFTxnCountSameType_L30D',
'Unique_To_Account_No_L30D',
'Avg_Amt_L90D',
'Unique_To_Account_No_L14D',
'Unique_To_Account_No_L7D',
'Txn_Count_Dep_20_L30D',
'Max_Amt_L30D',
'to_bank_code',
'MFTxnCountSameType_L90D',
'Unique_AmountFrom_L30D',
'customer_segment',
'MFTxnCount_L30D',
'Max_Amt_L90D',
'Avg_Amt_SameType_L90D',
'any_id_type',
'MFTxnCount_L90D',
'Unique_AmountFrom_L90D',
'amount']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Train_Mule001_Large_RF.csv')

In [None]:
data.loc[data['flag'] == 'TEST'][['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\TEST_Mule001_RF.csv')

In [None]:
data.loc[data['flag'] == 'TEST'][['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H',
'Unique_Amount_L14D',
'MFTxnCountSameToAcc_L7D',
'Dep_Unique_TellerID_L30D',
'Unique_To_Account_No_L1H',
'Txn_Count_Dep_L30D',
'Avg_Amt_SameToAcc_L7D',
'Unique_Amount_L30D',
'transaction_type',
'Dep_Unique_RecordType_L30D',
'Avg_Amt_SameToAcc_L14D',
'Unique_From_Account_No_L1D',
'Avg_Amt_SameType_L7D',
'Dep_Unique_Amount_L30D',
'MFTxnCountSameType_L7D',
'MFTxnCountSameToAcc_L14D',
'Dep_Unique_Branch_L30D',
'Unique_AmountFrom_L7D',
'Txn_Count_Dep_40_L30D',
'Max_Amt_L7D',
'Avg_Amt_L7D',
'Dep_Unique_TransCode_L30D',
'Unique_Amount_L90D',
'MFTxnCount_L7D',
'Avg_Amt_SameType_L14D',
'Avg_Amt_SameToAcc_L30D',
'Unique_From_Account_No_L7D',
'MFTxnCountSameToAcc_L30D',
'Dep_MaxAmount_L30D',
'Dep_MaxBalance_L30D',
'Avg_Amt_SameToAcc_L90D',
'Avg_Amt_L14D',
'Unique_From_Account_No_L90D',
'MFTxnCountSameType_L14D',
'Unique_To_Account_No_L1D',
'Dep_MinBalance_L30D',
'Unique_From_Account_No_L14D',
'Avg_Amt_SameType_L30D',
'available_balance',
'Dep_AvgAmoutL30D',
'Dep_AvgBalance_L30D',
'Max_Amt_L14D',
'Unique_AmountFrom_L14D',
'Unique_From_Account_No_L30D',
'Avg_Amt_L30D',
'MFTxnCountSameToAcc_L90D',
'DaysSinceLast_Dep',
'occupation',
'MFTxnCount_L14D',
'Unique_To_Account_No_L90D',
'Customer_Age',
'MFTxnCountSameType_L30D',
'Unique_To_Account_No_L30D',
'Avg_Amt_L90D',
'Unique_To_Account_No_L14D',
'Unique_To_Account_No_L7D',
'Txn_Count_Dep_20_L30D',
'Max_Amt_L30D',
'to_bank_code',
'MFTxnCountSameType_L90D',
'Unique_AmountFrom_L30D',
'customer_segment',
'MFTxnCount_L30D',
'Max_Amt_L90D',
'Avg_Amt_SameType_L90D',
'any_id_type',
'MFTxnCount_L90D',
'Unique_AmountFrom_L90D',
'amount']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\TEST_Mule001_Large_RF.csv')

In [None]:
data.loc[data['flag'] == 'TEST'].loc[data['transaction_type'].str.len() > 3][['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\TEST_Mule001_RF_TTRestricted.csv')

### Phase 2 Files for ML Module

In [None]:
df_train_mule.groupby(['flag','fraud_txn','Fraud'], dropna=False).size()

In [None]:
df_train_mule[['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'TSLastMFTxn_fromacct_to_acct_mins',
'Unique_AmountTo_L1D',
'Dep_AvgAcctAge_L30D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'TSLastMFTxn_tobankcode_mins',
'TSLastMFTxn_toacct_mins',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'Avg_Amt_L1H',
'Avg_Bal_L14D',
'Max_AmtBalRatio_L1H',
'transaction_type',
'available_balance',
'occupation',
'Customer_Age',
'amount_balance_ratio',
'customer_segment',
'TimeSinceLast_Ch',
'amount',
'Txn_Count_SD_L30D',
'Txn_Count_IDC_L30D',
'Txn_Count_DCM_L30D',
'Txn_Count_PASS_L30D',
'Txn_Count_CHQIN_L30D',
'Txn_Count_BranchTo_L30D']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\Train_Mule_Small01_FillNA.csv')

In [None]:
df_test[['transaction_id',
         'customer_id',
         'account_number',
         transaction_datetime',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'TSLastMFTxn_fromacct_to_acct_mins',
'Unique_AmountTo_L1D',
'Dep_AvgAcctAge_L30D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'TSLastMFTxn_tobankcode_mins',
'TSLastMFTxn_toacct_mins',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'Avg_Amt_L1H',
'Avg_Bal_L14D',
'Max_AmtBalRatio_L1H',
'transaction_type',
'available_balance',
'occupation',
'Customer_Age',
'amount_balance_ratio',
'customer_segment',
'TimeSinceLast_Ch',
'amount',
'Txn_Count_SD_L30D',
'Txn_Count_IDC_L30D',
'Txn_Count_DCM_L30D',
'Txn_Count_PASS_L30D',
'Txn_Count_CHQIN_L30D',
'Txn_Count_BranchTo_L30D']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\TEST_Mule_Small01_FillNA.csv')


In [None]:
df_train_se[['transaction_id',
'Fraud',
'transaction_type',
'to_bank_code',
'TSLastMFTxn_fromacct_to_acct_mins',
'TSLastMFTxn_toacct_mins',
'Max_Amt_L1D',
'Customer_Age',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_AmtBalRatio_L1D',
'Avg_AmtBalRatio_L1D',
'Avg_Amt_SameToAcc_L1H',
'Unique_AmountFrom_L90D']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\Train_SE_Small01_FillNA.csv',
                                            index=False)


In [None]:
df_test[['transaction_id',
'Fraud',
'transaction_type',
'to_bank_code',
'TSLastMFTxn_fromacct_to_acct_mins',
'TSLastMFTxn_toacct_mins',
'Max_Amt_L1D',
'Customer_Age',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_AmtBalRatio_L1D',
'Avg_AmtBalRatio_L1D',
'Avg_Amt_SameToAcc_L1H',
'Unique_AmountFrom_L90D']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\TEST_SE_Small01_FillNA.csv',
                                            index=False)


In [None]:
df_test[['transaction_id',
         'customer_id',
         'account_number',
         'transaction_datetime',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'TSLastMFTxn_fromacct_to_acct_mins',
'Unique_AmountTo_L1D',
'Dep_AvgAcctAge_L30D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'TSLastMFTxn_tobankcode_mins',
'TSLastMFTxn_toacct_mins',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'Avg_Amt_L1H',
'Avg_Bal_L14D',
'Max_AmtBalRatio_L1H',
'transaction_type',
'available_balance',
'occupation',
'Customer_Age',
'amount_balance_ratio',
'customer_segment',
'TimeSinceLast_Ch',
'amount',
'Txn_Count_SD_L30D',
'Txn_Count_IDC_L30D',
'Txn_Count_DCM_L30D',
'Txn_Count_PASS_L30D',
'Txn_Count_CHQIN_L30D',
'Txn_Count_BranchTo_L30D']].to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\TEST_Mule_Small02.csv')


In [None]:
x = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Deposit\Deposit Mule Customer IDs ML Detected.txt')

In [None]:
x.shape

In [None]:
x = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial Mule Customer IDs ML Detected.txt')

In [None]:
a = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\Test_Mule_Small02.csv')

In [None]:
a.shape

In [None]:
a.columns

In [None]:
b = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Phase2_TEST_MULE_LGBM02.csv')

In [None]:
b.shape

In [None]:
b.columns

In [None]:
c = pd.concat([a[['transaction_id','customer_id','account_number',
                 'transaction_datetime']],b[['fraud','ML_Score','transaction_type']]], axis=1)

In [None]:
c['transaction_id'].nunique()

In [None]:
d = c.loc[c['transaction_type'] > 1000].merge(mobile_rule_result[['transaction_id','rules']],
            on = ['transaction_id'],
            how = 'left',
            indicator = "merge_rules")

In [None]:
d.groupby(['rules'], dropna=False).size()

In [None]:

print('Number of TEST cases: ',d.shape[0])

print('Number of ML Alerts: ',d.loc[d['ML_Score'] >= 0.353].shape[0])

print('ML Alert Rate: ', (d.loc[d['ML_Score'] >= 0.353].shape[0])/d.shape[0])

print('Number of Rule-Based Alerts: ',d.loc[d['rules'].isna() == False].shape[0])

print('Rules-based Alert Rate:', d.loc[d['rules'].isna() == False].shape[0]/d.shape[0])

print('Number of ML+Rules Alerts:',d.loc[(d['ML_Score'] >= 0.353) & (d['rules'].isna() == False)].shape[0])

print('ML+Rules Alert Rate:',d.loc[(d['ML_Score'] >= 0.353) & (d['rules'].isna() == False)].shape[0]/d.shape[0])



In [None]:
d.loc[d['ML_Score'] >= 0.353].drop(columns=['transaction_type','merge_rules','fraud']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\MobileFinancial_MLAlerts_Phase2.txt',
                                    sep = '\t',
                                    index=False)

In [None]:
d.loc[(d['rules'].isna() == False) & (d['ML_Score'] >= 0.353)].drop(columns=['transaction_type','merge_rules','fraud']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\MobileFinancial_ML+RulesAlerts_Phase2.txt',
                                    sep = '\t',
                                    index=False)

In [None]:
d.groupby

In [None]:
# Make a file with flagged Customer ID's and number of transactions

pd.DataFrame(d.loc[d['ML_Score'] >= 0.353].groupby(['customer_id']).size()).reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial Mule Customer IDs ML Detected.txt', index=False)

In [None]:
# Make a file with flagged account numbers and number of transactions

pd.DataFrame(d.loc[d['ML_Score'] >= 0.353].groupby(['account_number']).size()).reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial Mule Account Numbers ML Detected.txt', index=False)

In [None]:
pd.DataFrame(d.loc[d['ML_Score'] >= 0.353].groupby(['account_number']).size()).sum()

In [None]:
df_train_mule.shape

In [None]:
f = df_train_mule[['transaction_id','customer_id','account_number','transaction_datetime','Fraud','fraud_txn']].merge(mobile_rule_result[['transaction_id','rules']],
                                                                                                                 on = ['transaction_id'],
                                                                                                                 how = 'left')


f['rule_triggered'] = f['rules'].isna() == False

In [None]:
f.groupby(['rule_triggered','Fraud'], dropna=False).size()

In [None]:
f.groupby(['rule_triggered'], dropna=False).size()

In [None]:
f.loc[f['rule_triggered'] == True].shape[0]/f.shape[0]

In [78]:
mobile_financial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4096478 entries, 0 to 4096477
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   transaction_id         object        
 1   transaction_datetime   datetime64[ns]
 2   customer_id            object        
 3   from_account_no        object        
 4   to_account_no          object        
 5   to_bank_code           object        
 6   transaction_type       object        
 7   amount                 Float64       
 8   top_up_billers         object        
 9   top_up_reference_no    object        
 10  transaction_status     object        
 11  any_id_type            object        
 12  any_id_value           object        
 13  ip_address             object        
 14  device_id              object        
 15  available_balance      Float64       
 16  old_value              object        
 17  new_value              object        
 18  transaction_datetime1 

In [77]:
mobile_financial['transaction_datetime'] = mobile_financial['transaction_datetime1']

In [80]:
df_train_in = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\Train_Mule_Small01.csv',
                        dtype = {'transaction_type':'object'}).drop(columns = ['Unnamed: 0'])

In [81]:
df_train_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299009 entries, 0 to 1299008
Data columns (total 31 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   transaction_id                     1299009 non-null  object 
 1   Fraud                              1299009 non-null  int64  
 2   Deposit_Account_Age                1298822 non-null  float64
 3   MFTxnCountSameToAcc_L1D            1299009 non-null  float64
 4   Unique_AmountFrom_L1D              1299009 non-null  float64
 5   TSLastMFTxn_fromacct_to_acct_mins  1299009 non-null  float64
 6   Unique_AmountTo_L1D                1299009 non-null  float64
 7   Dep_AvgAcctAge_L30D                781639 non-null   float64
 8   MFTxnCount_L1D                     1299009 non-null  float64
 9   MFTxnCountSameType_L1D             1299009 non-null  float64
 10  TSLastMFTxn_tobankcode_mins        1299009 non-null  float64
 11  TSLastMFTxn_toacct_mins 

In [82]:
df_train_in['transaction_id'] = df_train_in['transaction_id'].astype('str')
mobile_financial['transaction_id'] = mobile_financial['transaction_id'].astype('str')

print(df_train_in['transaction_id'].isna().sum())
print(mobile_financial['transaction_id'].isna().sum())
print(df_train_in.shape[0])
print(mobile_financial.shape[0])

0
0
1299009
4096478


In [83]:
df_train_in_test = df_train_in.merge(mobile_financial[['transaction_id','transaction_datetime','customer_id','from_account_no']],
                                     left_on = ['transaction_id'],
                                     right_on = ['transaction_id'],
                                     how = 'left',
                                     indicator = 'merge_mf_train')

In [84]:
print(df_train_in.shape[0])
print(df_train_in['transaction_id'].isna().sum())
print(mobile_financial.shape[0])
print(mobile_financial['transaction_id'].isna().sum())
print(df_train_in_test['transaction_id'].isna().sum())
print(df_train_in_test.shape[0])
print(mobile_financial['transaction_id'].nunique())
print(df_train_in_test['transaction_id'].nunique())

1299009
0
4096478
0
0
1299009
4096478
1299009


In [85]:
df_train_in_test.groupby(['merge_mf_train'], dropna=False).size()

merge_mf_train
left_only           0
right_only          0
both          1299009
dtype: int64

In [86]:
df_train_in_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1299009 entries, 0 to 1299008
Data columns (total 35 columns):
 #   Column                             Non-Null Count    Dtype         
---  ------                             --------------    -----         
 0   transaction_id                     1299009 non-null  object        
 1   Fraud                              1299009 non-null  int64         
 2   Deposit_Account_Age                1298822 non-null  float64       
 3   MFTxnCountSameToAcc_L1D            1299009 non-null  float64       
 4   Unique_AmountFrom_L1D              1299009 non-null  float64       
 5   TSLastMFTxn_fromacct_to_acct_mins  1299009 non-null  float64       
 6   Unique_AmountTo_L1D                1299009 non-null  float64       
 7   Dep_AvgAcctAge_L30D                781639 non-null   float64       
 8   MFTxnCount_L1D                     1299009 non-null  float64       
 9   MFTxnCountSameType_L1D             1299009 non-null  float64       
 10  TSLast

In [None]:
# OLD Data from EL/CR (Phase 1)

mobile_rule_result = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Rules\mobile_rule_result_v2.csv',
                                parse_dates = ['transaction_date_time'])

In [9]:
# NEW data from EL/CR

mobile_rule_result = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Rules\poc2_result.txt',
                                parse_dates = ['datetime']).rename(columns = {'string_agg':'rules'})

In [87]:
mobile_rule_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686924 entries, 0 to 686923
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   reference_number  686924 non-null  object        
 1   customer_id       686924 non-null  object        
 2   account_number    686824 non-null  object        
 3   datetime          686924 non-null  datetime64[ns]
 4   rules             686924 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 26.2+ MB


In [88]:
df_train_out = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Rules\Mobile Train Result\validation.csv',
                         dtype = {'transaction_type':'object'})

In [89]:
df_train_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299009 entries, 0 to 1299008
Data columns (total 31 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   deposit_account_age                1298822 non-null  float64
 1   mftxncountsametoacc_l1d            1299009 non-null  float64
 2   unique_amountfrom_l1d              1299009 non-null  float64
 3   tslastmftxn_fromacct_to_acct_mins  1299009 non-null  float64
 4   unique_amountto_l1d                1299009 non-null  float64
 5   dep_avgacctage_l30d                781639 non-null   float64
 6   mftxncount_l1d                     1299009 non-null  float64
 7   mftxncountsametype_l1d             1299009 non-null  float64
 8   tslastmftxn_tobankcode_mins        1299009 non-null  float64
 9   tslastmftxn_toacct_mins            1299009 non-null  float64
 10  account_open_amount                1296417 non-null  float64
 11  avg_amt_sametoacc_l1h   

In [90]:
df_train_out.tail()

Unnamed: 0,deposit_account_age,mftxncountsametoacc_l1d,unique_amountfrom_l1d,tslastmftxn_fromacct_to_acct_mins,unique_amountto_l1d,dep_avgacctage_l30d,mftxncount_l1d,mftxncountsametype_l1d,tslastmftxn_tobankcode_mins,tslastmftxn_toacct_mins,account_open_amount,avg_amt_sametoacc_l1h,avg_amt_l1h,avg_bal_l14d,max_amtbalratio_l1h,transaction_type,available_balance,occupation,customer_age,amount_balance_ratio,customer_segment,timesincelast_ch,amount,txn_count_sd_l30d,txn_count_idc_l30d,txn_count_dcm_l30d,txn_count_pass_l30d,txn_count_chqin_l30d,txn_count_branchto_l30d,fraud,ML_Score
1299004,2.0,0.0,0.0,-1.0,0.0,209.0,0.0,0.0,-1.0,-1.0,0.1,0.0,0.0,0.0,0.0,100400220,0.23,407.0,26.0,0.434783,7.0,28.0,0.1,,,,,,,1,0.097268
1299005,119.0,0.0,0.0,-1.0,2.0,,0.0,0.0,7085.083333,92.366667,6.0,0.0,0.0,1913.44,0.0,100400220,51.48,229.0,23.0,77.700078,7.0,,4000.0,,,,,,,1,0.032419
1299006,119.0,0.0,1.0,-1.0,0.0,,1.0,1.0,40.583333,-1.0,6.0,0.0,4000.0,1603.113333,77.700078,100400220,51.04,229.0,23.0,88.166144,7.0,0.0,4500.0,,,,,,,1,0.056278
1299007,119.0,0.0,2.0,39520.983333,0.0,,2.0,2.0,117.016667,39520.983333,6.0,0.0,0.0,1381.388571,0.0,100400220,51.04,229.0,23.0,195.924765,7.0,,10000.0,,,,,,,1,0.032419
1299008,4178.0,0.0,0.0,138314.266667,0.0,,0.0,0.0,77771.95,1891.483333,5000.0,0.0,0.0,0.0,0.0,100400220,15409.89,105.0,49.0,0.08501,6.0,,1310.0,,,,,,,0,0.025338


In [91]:
df_train_in.tail()

Unnamed: 0,transaction_id,Fraud,Deposit_Account_Age,MFTxnCountSameToAcc_L1D,Unique_AmountFrom_L1D,TSLastMFTxn_fromacct_to_acct_mins,Unique_AmountTo_L1D,Dep_AvgAcctAge_L30D,MFTxnCount_L1D,MFTxnCountSameType_L1D,TSLastMFTxn_tobankcode_mins,TSLastMFTxn_toacct_mins,account_open_amount,Avg_Amt_SameToAcc_L1H,Avg_Amt_L1H,Avg_Bal_L14D,Max_AmtBalRatio_L1H,transaction_type,available_balance,occupation,Customer_Age,amount_balance_ratio,customer_segment,TimeSinceLast_Ch,amount,Txn_Count_SD_L30D,Txn_Count_IDC_L30D,Txn_Count_DCM_L30D,Txn_Count_PASS_L30D,Txn_Count_CHQIN_L30D,Txn_Count_BranchTo_L30D
1299004,202305041201091175,1,2.0,0.0,0.0,-1.0,0.0,209.0,0.0,0.0,-1.0,-1.0,0.1,0.0,0.0,0.0,0.0,100400220,0.23,407.0,26.0,0.434783,7.0,28.0,0.1,,,,,,
1299005,202305051601748443,1,119.0,0.0,0.0,-1.0,2.0,,0.0,0.0,7085.083333,92.366667,6.0,0.0,0.0,1913.44,0.0,100400220,51.48,229.0,23.0,77.700078,7.0,,4000.0,,,,,,
1299006,202305051701875577,1,119.0,0.0,1.0,-1.0,0.0,,1.0,1.0,40.583333,-1.0,6.0,0.0,4000.0,1603.113333,77.700078,100400220,51.04,229.0,23.0,88.166144,7.0,0.0,4500.0,,,,,,
1299007,202305051902283327,1,119.0,0.0,2.0,39520.983333,0.0,,2.0,2.0,117.016667,39520.983333,6.0,0.0,0.0,1381.388571,0.0,100400220,51.04,229.0,23.0,195.924765,7.0,,10000.0,,,,,,
1299008,202305091401457631,0,4178.0,0.0,0.0,138314.266667,0.0,,0.0,0.0,77771.95,1891.483333,5000.0,0.0,0.0,0.0,0.0,100400220,15409.89,105.0,49.0,0.08501,6.0,,1310.0,,,,,,


In [92]:
df_train_in_out = pd.concat([df_train_out[['ML_Score','fraud']],df_train_in_test[['transaction_id','transaction_type','customer_id','from_account_no','transaction_datetime']]],
                           axis = 1).merge(mobile_rule_result[['rules','customer_id','account_number','datetime']],
                                           left_on = ['customer_id','from_account_no','transaction_datetime'],
                                           right_on = ['customer_id','account_number','datetime'],
                                                                   how = 'left',
                                                                   indicator = "merge_rules")

In [93]:
df_train_in_out.groupby(['rules'], dropna=False).size()

rules
BRANCH010                                                                     162
BRANCH010,BRANCH011                                                             3
BRANCH011                                                                  111645
BRANCH011,BRANCH010                                                             6
MOBILE002                                                                   29863
MOBILE002,MOBILE002                                                         28234
MOBILE002,MOBILE002,MULE017                                                   511
MOBILE002,MOBILE002,MULE017,MULE017                                             6
MOBILE002,MOBILE002,MULE017,MULE017,MULE025                                     1
MOBILE002,MOBILE002,MULE017,MULE025                                             5
MOBILE002,MOBILE002,MULE017,MULE025,MULE017                                     5
MOBILE002,MOBILE002,MULE017,MULE025,MULE025                                     1
MOBILE002,

In [94]:
df_train_in_out.loc[df_train_in_out['rules'].isna() == False].groupby(['transaction_type'], dropna=False).size()

transaction_type
100400120      5539
100400220     59091
100400320    321380
100400420       120
100400520      1819
100400620         1
100400720        41
100600120       656
100600220       146
101200120       306
101200220      1607
101200320        22
101200420       226
23              123
24              343
25            12226
30                2
604              13
605             253
607              22
611              79
63                3
dtype: int64

In [None]:
df_train_in_out.head()

In [95]:
df_train_in_out.groupby(['merge_rules'], dropna=False).size()

merge_rules
left_only     988109
right_only         0
both          404018
dtype: int64

In [96]:
df_train_in_out['rules_alert'] = np.where(df_train_in_out['rules'].isna() == False,1,0)
df_train_in_out['ML_alert'] = np.where(df_train_in_out['ML_Score'] >= 0.353,1,0)
df_train_in_out['ML_rules_alert'] = np.where((df_train_in_out['ML_Score'] >= 0.353) & (df_train_in_out['rules'].isna() == False),1,0)

In [97]:
print(df_train_in_out.groupby(['rules_alert','fraud'], dropna=False).size())
print(df_train_in_out.groupby(['ML_alert','fraud'], dropna=False).size())
print(df_train_in_out.groupby(['ML_rules_alert','fraud'], dropna=False).size())

rules_alert  fraud
0            0        988109
1            0        286537
             1        117481
dtype: int64
ML_alert  fraud
0         0        1254447
          1          56596
1         0          20199
          1          60885
dtype: int64
ML_rules_alert  fraud
0               0        1261766
                1          56596
1               0          12880
                1          60885
dtype: int64


In [98]:
print('Rules Alerts = ',df_train_in_out.loc[df_train_in_out['rules_alert'] == 1].shape[0])
print('Rules Alert Rate = ',df_train_in_out.loc[df_train_in_out['rules_alert'] == 1].shape[0]/df_train_in_out.shape[0])
print('ML Alerts = ',df_train_in_out.loc[df_train_in_out['ML_alert'] == 1].shape[0])
print('ML Alert Rate = ',df_train_in_out.loc[df_train_in_out['ML_alert'] == 1].shape[0]/df_train_in_out.shape[0])
print('ML + Rules Alerts = ',df_train_in_out.loc[df_train_in_out['ML_rules_alert'] == 1].shape[0])
print('ML + Rules Alert Rate = ',df_train_in_out.loc[df_train_in_out['ML_rules_alert'] == 1].shape[0]/df_train_in_out.shape[0])


Rules Alerts =  404018
Rules Alert Rate =  0.2902163380208846
ML Alerts =  81084
ML Alert Rate =  0.05824468600925059
ML + Rules Alerts =  73765
ML + Rules Alert Rate =  0.05298726337467774


In [99]:
print('Rules Alert Precision = ',df_train_in_out.loc[(df_train_in_out['rules_alert'] == 1)&(df_train_in_out['fraud'] == 1)].shape[0]/df_train_in_out.loc[df_train_in_out['rules_alert'] == 1].shape[0])
print('Rules Alert Recall = ',df_train_in_out.loc[(df_train_in_out['rules_alert'] == 1)&(df_train_in_out['fraud'] == 1)].shape[0]/df_train_in_out.loc[df_train_in_out['fraud'] == 1].shape[0])
print('ML Alert Precision = ',df_train_in_out.loc[(df_train_in_out['ML_alert'] == 1)&(df_train_in_out['fraud'] == 1)].shape[0]/df_train_in_out.loc[df_train_in_out['ML_alert'] == 1].shape[0])
print('ML Alert Recall = ',df_train_in_out.loc[(df_train_in_out['ML_alert'] == 1)&(df_train_in_out['fraud'] == 1)].shape[0]/df_train_in_out.loc[df_train_in_out['fraud'] == 1].shape[0])
print('ML+Rules Alert Precision = ',df_train_in_out.loc[(df_train_in_out['ML_rules_alert'] == 1)&(df_train_in_out['fraud'] == 1)].shape[0]/df_train_in_out.loc[df_train_in_out['ML_rules_alert'] == 1].shape[0])
print('ML+Rules Alert Recall = ',df_train_in_out.loc[(df_train_in_out['ML_rules_alert'] == 1)&(df_train_in_out['fraud'] == 1)].shape[0]/df_train_in_out.loc[df_train_in_out['fraud'] == 1].shape[0])


Rules Alert Precision =  0.29078159883965565
Rules Alert Recall =  1.0
ML Alert Precision =  0.7508879680331508
ML Alert Recall =  0.5182540155429388
ML+Rules Alert Precision =  0.8253914458076324
ML+Rules Alert Recall =  0.5182540155429388


### TEST data

In [36]:
test = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\TEST Data MF Model Phase 2.csv')

In [37]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807633 entries, 0 to 1807632
Data columns (total 31 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   deposit_account_age                float64
 1   mftxncountsametoacc_l1d            float64
 2   unique_amountfrom_l1d              float64
 3   tslastmftxn_fromacct_to_acct_mins  float64
 4   unique_amountto_l1d                float64
 5   dep_avgacctage_l30d                float64
 6   mftxncount_l1d                     float64
 7   mftxncountsametype_l1d             float64
 8   tslastmftxn_tobankcode_mins        float64
 9   tslastmftxn_toacct_mins            float64
 10  account_open_amount                float64
 11  avg_amt_sametoacc_l1h              float64
 12  avg_amt_l1h                        float64
 13  avg_bal_l14d                       float64
 14  max_amtbalratio_l1h                float64
 15  transaction_type                   int64  
 16  available_balance 

In [38]:
test_in = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Phase 2\TEST_Mule_Small01.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [39]:
test_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807633 entries, 0 to 1807632
Data columns (total 32 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   Unnamed: 0                         int64  
 1   transaction_id                     object 
 2   Fraud                              int64  
 3   Deposit_Account_Age                float64
 4   MFTxnCountSameToAcc_L1D            float64
 5   Unique_AmountFrom_L1D              float64
 6   TSLastMFTxn_fromacct_to_acct_mins  float64
 7   Unique_AmountTo_L1D                float64
 8   Dep_AvgAcctAge_L30D                float64
 9   MFTxnCount_L1D                     float64
 10  MFTxnCountSameType_L1D             float64
 11  TSLastMFTxn_tobankcode_mins        float64
 12  TSLastMFTxn_toacct_mins            float64
 13  account_open_amount                float64
 14  Avg_Amt_SameToAcc_L1H              float64
 15  Avg_Amt_L1H                        float64
 16  Avg_Bal_L14D      

In [40]:
test.head()

Unnamed: 0,deposit_account_age,mftxncountsametoacc_l1d,unique_amountfrom_l1d,tslastmftxn_fromacct_to_acct_mins,unique_amountto_l1d,dep_avgacctage_l30d,mftxncount_l1d,mftxncountsametype_l1d,tslastmftxn_tobankcode_mins,tslastmftxn_toacct_mins,account_open_amount,avg_amt_sametoacc_l1h,avg_amt_l1h,avg_bal_l14d,max_amtbalratio_l1h,transaction_type,available_balance,occupation,customer_age,amount_balance_ratio,customer_segment,timesincelast_ch,amount,txn_count_sd_l30d,txn_count_idc_l30d,txn_count_dcm_l30d,txn_count_pass_l30d,txn_count_chqin_l30d,txn_count_branchto_l30d,fraud,ML_Score
0,2498.0,0.0,0.0,-1.0,0.0,,0.0,0.0,-1.0,-1.0,2585.0,0.0,0.0,0.0,0.0,25,3818.0,401.0,47.0,0.130959,6.0,,500.0,,,,,,,1,0.025338
1,2500.0,0.0,0.0,2317.25,0.0,,0.0,0.0,2317.25,2317.25,2585.0,0.0,0.0,3818.0,0.0,25,2818.0,401.0,47.0,0.354862,6.0,,1000.0,,,,,,,0,0.025338
2,2500.0,0.0,1.0,-1.0,0.0,2500.0,1.0,1.0,580.233333,-1.0,2585.0,0.0,0.0,3318.0,0.0,25,1818.0,401.0,47.0,0.550055,6.0,,1000.0,,,,,,,0,0.025338
3,2501.0,0.0,0.0,-1.0,0.0,2501.0,0.0,0.0,-1.0,-1.0,2585.0,0.0,0.0,2818.0,0.0,25,19318.0,401.0,47.0,0.051765,6.0,,1000.0,,,,,,,0,0.025338
4,2503.0,0.0,0.0,-1.0,0.0,2502.0,0.0,0.0,3670.783333,-1.0,2585.0,0.0,0.0,6943.0,0.0,25,16818.0,401.0,47.0,0.02973,6.0,,500.0,,,,,,,0,0.025338


In [41]:
test_in.head()

Unnamed: 0.1,Unnamed: 0,transaction_id,Fraud,Deposit_Account_Age,MFTxnCountSameToAcc_L1D,Unique_AmountFrom_L1D,TSLastMFTxn_fromacct_to_acct_mins,Unique_AmountTo_L1D,Dep_AvgAcctAge_L30D,MFTxnCount_L1D,MFTxnCountSameType_L1D,TSLastMFTxn_tobankcode_mins,TSLastMFTxn_toacct_mins,account_open_amount,Avg_Amt_SameToAcc_L1H,Avg_Amt_L1H,Avg_Bal_L14D,Max_AmtBalRatio_L1H,transaction_type,available_balance,occupation,Customer_Age,amount_balance_ratio,customer_segment,TimeSinceLast_Ch,amount,Txn_Count_SD_L30D,Txn_Count_IDC_L30D,Txn_Count_DCM_L30D,Txn_Count_PASS_L30D,Txn_Count_CHQIN_L30D,Txn_Count_BranchTo_L30D
0,0,NT1937830033075700,1,2498.0,0.0,0.0,-1.0,0.0,,0.0,0.0,-1.0,-1.0,2585.0,0.0,0.0,0.0,0.0,25,3818.0,401.0,47.0,0.130959,6.0,,500.0,,,,,,
1,1,NT1937880074349300,0,2500.0,0.0,0.0,2317.25,0.0,,0.0,0.0,2317.25,2317.25,2585.0,0.0,0.0,3818.0,0.0,25,2818.0,401.0,47.0,0.354862,6.0,,1000.0,,,,,,
2,2,NT1937980086848600,0,2500.0,0.0,1.0,-1.0,0.0,2500.0,1.0,1.0,580.233333,-1.0,2585.0,0.0,0.0,3318.0,0.0,25,1818.0,401.0,47.0,0.550055,6.0,,1000.0,,,,,,
3,3,NT1938500119987600,0,2501.0,0.0,0.0,-1.0,0.0,2501.0,0.0,0.0,-1.0,-1.0,2585.0,0.0,0.0,2818.0,0.0,25,19318.0,401.0,47.0,0.051765,6.0,,1000.0,,,,,,
4,4,NT1938850156163200,0,2503.0,0.0,0.0,-1.0,0.0,2502.0,0.0,0.0,3670.783333,-1.0,2585.0,0.0,0.0,6943.0,0.0,25,16818.0,401.0,47.0,0.02973,6.0,,500.0,,,,,,


In [42]:
test.tail()

Unnamed: 0,deposit_account_age,mftxncountsametoacc_l1d,unique_amountfrom_l1d,tslastmftxn_fromacct_to_acct_mins,unique_amountto_l1d,dep_avgacctage_l30d,mftxncount_l1d,mftxncountsametype_l1d,tslastmftxn_tobankcode_mins,tslastmftxn_toacct_mins,account_open_amount,avg_amt_sametoacc_l1h,avg_amt_l1h,avg_bal_l14d,max_amtbalratio_l1h,transaction_type,available_balance,occupation,customer_age,amount_balance_ratio,customer_segment,timesincelast_ch,amount,txn_count_sd_l30d,txn_count_idc_l30d,txn_count_dcm_l30d,txn_count_pass_l30d,txn_count_chqin_l30d,txn_count_branchto_l30d,fraud,ML_Score
1807628,3886.0,0.0,1.0,2895.85,0.0,3886.0,1.0,1.0,2895.85,2895.85,500000.0,0.0,0.0,5733.333333,0.0,100400320,5000.0,406.0,58.0,7.426,6.0,,37130.0,,,,,,,0,0.025338
1807629,3069.0,0.0,1.0,4319.05,0.0,,1.0,1.0,122.416667,4319.05,30000.0,0.0,0.0,960.065283,0.0,100400320,194.14,102.0,43.0,9.374678,6.0,,1820.0,,,,,,,0,0.025338
1807630,11623.0,0.0,2.0,1945.133333,0.0,,2.0,0.0,1945.133333,1945.133333,0.0,0.0,0.0,67147.12,0.0,100400720,55027.92,109.0,93.0,0.045431,6.0,,2500.0,,,,,,,0,0.025338
1807631,1548.0,4.0,11.0,44.916667,4.0,1548.0,12.0,4.0,44.916667,44.916667,2000.0,570.0,755.0,4425.205347,3.524175,100400220,1171.74,306.0,51.0,0.418182,5.0,,490.0,,,,,,,0,0.031608
1807632,2438.0,5.0,13.0,8.333333,5.0,,14.0,14.0,8.333333,8.333333,400.0,100.74,100.74,1437.032659,0.241972,100400320,315.4,407.0,50.0,0.319087,6.0,,100.64,,,,,,,0,0.031013


In [43]:
test_in.tail()

Unnamed: 0.1,Unnamed: 0,transaction_id,Fraud,Deposit_Account_Age,MFTxnCountSameToAcc_L1D,Unique_AmountFrom_L1D,TSLastMFTxn_fromacct_to_acct_mins,Unique_AmountTo_L1D,Dep_AvgAcctAge_L30D,MFTxnCount_L1D,MFTxnCountSameType_L1D,TSLastMFTxn_tobankcode_mins,TSLastMFTxn_toacct_mins,account_open_amount,Avg_Amt_SameToAcc_L1H,Avg_Amt_L1H,Avg_Bal_L14D,Max_AmtBalRatio_L1H,transaction_type,available_balance,occupation,Customer_Age,amount_balance_ratio,customer_segment,TimeSinceLast_Ch,amount,Txn_Count_SD_L30D,Txn_Count_IDC_L30D,Txn_Count_DCM_L30D,Txn_Count_PASS_L30D,Txn_Count_CHQIN_L30D,Txn_Count_BranchTo_L30D
1807628,3698362,202305262303583668,0,3886.0,0.0,1.0,2895.85,0.0,3886.0,1.0,1.0,2895.85,2895.85,500000.0,0.0,0.0,5733.333333,0.0,100400320,5000.0,406.0,58.0,7.426,6.0,,37130.0,,,,,,
1807629,3698363,202305262303584044,0,3069.0,0.0,1.0,4319.05,0.0,,1.0,1.0,122.416667,4319.05,30000.0,0.0,0.0,960.065283,0.0,100400320,194.14,102.0,43.0,9.374678,6.0,,1820.0,,,,,,
1807630,3698364,202305262303587943,0,11623.0,0.0,2.0,1945.133333,0.0,,2.0,0.0,1945.133333,1945.133333,0.0,0.0,0.0,67147.12,0.0,100400720,55027.92,109.0,93.0,0.045431,6.0,,2500.0,,,,,,
1807631,3698365,202305262303591022,0,1548.0,4.0,11.0,44.916667,4.0,1548.0,12.0,4.0,44.916667,44.916667,2000.0,570.0,755.0,4425.205347,3.524175,100400220,1171.74,306.0,51.0,0.418182,5.0,,490.0,,,,,,
1807632,3698366,202305262303591175,0,2438.0,5.0,13.0,8.333333,5.0,,14.0,14.0,8.333333,8.333333,400.0,100.74,100.74,1437.032659,0.241972,100400320,315.4,407.0,50.0,0.319087,6.0,,100.64,,,,,,


In [44]:
test_in_out = pd.concat([test[['fraud','ML_Score']],test_in[['transaction_id','transaction_type']]], axis = 1)
test_in_out['transaction_type'] = test_in_out['transaction_type'].astype('object')

In [45]:
test_in_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807633 entries, 0 to 1807632
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   fraud             int64  
 1   ML_Score          float64
 2   transaction_id    object 
 3   transaction_type  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 55.2+ MB


In [55]:
test_in_out['transaction_id'] = test_in_out['transaction_id'].astype('str')
mobile_financial['transaction_id'] = mobile_financial['transaction_id'].astype('str')

In [56]:
test_in_out_test = test_in_out.merge(mobile_financial[['transaction_id','transaction_datetime1','customer_id','from_account_no']],
                                     left_on = ['transaction_id'],
                                     right_on = ['transaction_id'],
                                     how = 'left',
                                     indicator = 'merge_test_mf')

In [57]:
test_in_out_test.groupby(['merge_test_mf'], dropna=False).size()

merge_test_mf
left_only           0
right_only          0
both          1807633
dtype: int64

In [58]:
test_in_out_test.loc[test_in_out_test['merge_test_mf'] == 'left_only'].head()

Unnamed: 0,fraud,ML_Score,transaction_id,transaction_type,transaction_datetime1,customer_id,from_account_no,merge_test_mf


In [59]:
test_in_out_test.head()

Unnamed: 0,fraud,ML_Score,transaction_id,transaction_type,transaction_datetime1,customer_id,from_account_no,merge_test_mf
0,1,0.025338,NT1937830033075700,25,2019-12-17 17:53:38,001100000000000000000008854967,PPTM64v4En,both
1,0,0.025338,NT1937880074349300,25,2019-12-19 08:30:53,001100000000000000000008854967,PPTM64v4En,both
2,0,0.025338,NT1937980086848600,25,2019-12-19 18:11:07,001100000000000000000008854967,PPTM64v4En,both
3,0,0.025338,NT1938500119987600,25,2019-12-20 19:25:25,001100000000000000000008854967,PPTM64v4En,both
4,0,0.025338,NT1938850156163200,25,2019-12-22 07:21:54,001100000000000000000008854967,PPTM64v4En,both


In [60]:
test_in_out_rules = test_in_out_test.drop(columns = ['fraud','merge_test_mf']).merge(mobile_rule_result[['rules','customer_id','account_number','datetime']],
                                           left_on = ['customer_id','from_account_no','transaction_datetime1'],
                                           right_on = ['customer_id','account_number','datetime'],
                                                                   how = 'left',
                                                                   indicator = "merge_rules")

In [61]:
test_in_out_rules.groupby(['merge_rules'], dropna=False).size()

merge_rules
left_only     1720407
right_only          0
both           103593
dtype: int64

In [62]:
test_in_out_rules_sub = test_in_out_rules.loc[test_in_out_rules['transaction_type'].astype('str').str.len() > 3]
test_in_out_rules_sub.groupby(['merge_rules'], dropna=False).size()

merge_rules
left_only     1617557
right_only          0
both           100794
dtype: int64

In [63]:
test_in_out_rules_sub.groupby(['rules'], dropna=False).size()

rules
BRANCH010                              242
BRANCH011                            31167
BRANCH011,BRANCH010                      2
MOBILE002                              453
MOBILE002,MOBILE002                     67
MOBILE002,MULE017                        1
MOBILE002,MULE025                        3
MOBILE002,MULE026                        4
MULE017                              19329
MULE017,MOBILE002                        2
MULE017,MOBILE002,MOBILE002              2
MULE017,MULE017                        415
MULE017,MULE017,MULE017                  4
MULE017,MULE017,MULE017,MULE017          8
MULE017,MULE017,MULE026                  3
MULE017,MULE017,MULE026,MULE017          1
MULE017,MULE017,MULE027                  3
MULE017,MULE017,MULE027,MULE027          1
MULE017,MULE022                          9
MULE017,MULE022,MULE022                  3
MULE017,MULE025                       1430
MULE017,MULE025,MULE017                  1
MULE017,MULE025,MULE026                 10
MULE0

In [64]:
test_in_out_rules_sub['rules_alert'] = np.where(test_in_out_rules_sub['rules'].isna() == False,1,0)
test_in_out_rules_sub['ML_alert'] = np.where(test_in_out_rules_sub['ML_Score'] >= 0.353,1,0)
test_in_out_rules_sub['ML_rules_alert'] = np.where((test_in_out_rules_sub['ML_Score'] >= 0.353) & (test_in_out_rules_sub['rules'].isna() == False),1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [65]:
print(test_in_out_rules_sub.groupby(['rules_alert'], dropna=False).size())
print(test_in_out_rules_sub.groupby(['ML_alert'], dropna=False).size())
print(test_in_out_rules_sub.groupby(['ML_rules_alert'], dropna=False).size())

rules_alert
0    1617557
1     100794
dtype: int64
ML_alert
0    1674223
1      44128
dtype: int64
ML_rules_alert
0    1687104
1      31247
dtype: int64


In [66]:
print('Rules Alerts = ',test_in_out_rules_sub.loc[test_in_out_rules_sub['rules_alert'] == 1].shape[0])
print('Rules Alert Rate = ',test_in_out_rules_sub.loc[test_in_out_rules_sub['rules_alert'] == 1].shape[0]/test_in_out_rules_sub.shape[0])
print('ML Alerts = ',test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_alert'] == 1].shape[0])
print('ML Alert Rate = ',test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_alert'] == 1].shape[0]/test_in_out_rules_sub.shape[0])
print('ML + Rules Alerts = ',test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_rules_alert'] == 1].shape[0])
print('ML + Rules Alert Rate = ',test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_rules_alert'] == 1].shape[0]/test_in_out_rules_sub.shape[0])


Rules Alerts =  100794
Rules Alert Rate =  0.05865739886670418
ML Alerts =  44128
ML Alert Rate =  0.02568043432337165
ML + Rules Alerts =  31247
ML + Rules Alert Rate =  0.018184294128498774


In [None]:
# Output the transactions

test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_rules_alert'] == 1].drop(columns = ['merge_rules']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial TEST ML+Rules Alerts.txt',
                                                                              sep = '\t',
                                                                              index=False)

In [None]:
# Output the account numbers

test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_rules_alert'] == 1].groupby(['from_account_no'], dropna=False).size().reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial TEST ML+Rules Alerts - Account Numbers.txt',
                                                                              sep = '\t',
                                                                              index=False)

In [None]:
# Output the customer ids

test_in_out_rules_sub.loc[test_in_out_rules_sub['ML_rules_alert'] == 1].groupby(['customer_id'], dropna=False).size().reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial TEST ML+Rules Alerts - Customer Ids.txt',
                                                                              sep = '\t',
                                                                              index=False)

In [70]:
# output ALL the transactions

test_in_out_rules_sub.drop(columns = ['account_number','datetime','merge_rules','rules_alert','ML_alert','ML_rules_alert','transaction_type']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial Transaction ML Scores_Phase2.txt',
                                                                              sep = '\t',
                                                                              index=False)

In [None]:
x = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial TEST ML+Rules Alerts.txt',
                                                                              sep = '\t')

In [None]:
x.head()

In [None]:
x.drop(columns = ['transaction_type','account_number','datetime','rules_alert','ML_alert','ML_rules_alert']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mobile Financial Phase 2\Mobile Financial TEST ML+Rules Alerts - Transactions.txt',
                                                                              sep = '\t',
                                                                                                                   index=False)

### ***** data DF already has a dummy Fraud in the TEST sample *****

In [None]:
data.loc[data['flag'] == 'TEST']['Fraud'].sum()

In [None]:
data.loc[(data['flag'] == 'TEST') & (data['Fraud'] == 1)]

### Get the Result Files from MLS

In [None]:
data.loc[data['flag'] == 'TEST'][['transaction_id',
'Fraud',
'Deposit_Account_Age',
'MFTxnCountSameToAcc_L1D',
'Unique_AmountFrom_L1D',
'MFTxnCountSameToAcc_L1H',
'TSLastMFTxn_sametoacc_mins',
'Unique_Amount_L1D',
'MFTxnCount_L1D',
'MFTxnCountSameType_L1D',
'Unique_Amount_L1H',
'account_open_amount',
'Avg_Amt_SameToAcc_L1H',
'TSLastMFTxn_sametype_mins',
'MFTxnCountSameType_L1H',
'Avg_Amt_SameToAcc_L1D',
'Unique_AmountFrom_L1H',
'Max_Amt_L1D',
'MFTxnCount_L1H',
'Avg_Amt_SameType_L1D',
'Avg_Amt_SameType_L1H',
'Unique_Amount_L7D',
'TSLastMFTxn_mins',
'Avg_Amt_L1D',
'Max_Amt_L1H',
'Avg_Amt_L1H',
'Unique_From_Account_No_L1H']].head()

In [None]:
test_result_mule = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\TEST Data with Mule ML Score.csv')

In [None]:
test_result_mule.shape

In [None]:
test_result_mule.reset_index().head()

In [None]:
test_base = data.loc[data['flag'] == 'TEST'][['transaction_id']].reset_index()

In [None]:
test_base.shape

In [None]:
test_base.head()

In [None]:
test_result_mule = pd.concat([test_base, test_result_mule], axis = 1) 

In [None]:
test_result_mule.shape

In [None]:
test_result_mule.head()

In [None]:
test_result_mule['ML_Score'].describe(percentiles=[0.1,0.2,0.3,0.38,0.4,0.5,0.52,0.6,0.7,0.8,0.85,0.9,0.91,0.95]).apply("{0:.6f}".format)

In [None]:
test_result_mule.loc[test_result_mule['ML_Score'] >= 0.35].shape[0] / test_result_mule.shape[0]

In [None]:
test_out_lgbm = test_result_mule[['transaction_id','ML_Score']].merge(data[['transaction_id','customer_id','account_number','transaction_datetime','transaction_type']],
                                                     on = ['transaction_id'],
                                                     how = 'left')


In [None]:
test_out_lgbm.shape

In [None]:
test_out_lgbm.head()

In [None]:
test_out_lgbm = test_out_lgbm.loc[test_out_lgbm['transaction_type'].str.len() > 3]

In [None]:
test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35].shape[0] / test_out_lgbm.shape[0]

In [None]:
test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35].shape[0]

In [None]:
test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35]['customer_id'].nunique()

#### Output TEST Records for TTB checking June-27 ([MULE])

In [None]:
print('Number of flagged accounts (ML Score >= 0.35):', test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35]['account_number'].nunique())
print('Total Number of Accounts in TEST data:', test_out_lgbm['account_number'].nunique())
print('Proportion of Accounts getting Flagged:', test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35]['account_number'].nunique()/test_out_lgbm['account_number'].nunique())


In [None]:
print('Number of flagged Customers (ML Score >= 0.35):', test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35]['customer_id'].nunique())
print('Total Number of Customers in TEST data:', test_out_lgbm['customer_id'].nunique())
print('Proportion of Customers getting Flagged:', test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35]['customer_id'].nunique()/test_out_lgbm['customer_id'].nunique())


In [None]:
print('Number of flagged Transactions (ML Score >= 0.35):', test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35].shape[0])
print('Total Number of Transactions in TEST data:', test_out_lgbm['customer_id'].shape[0])
print('Proportion of Transactions getting Flagged:', test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35].shape[0]/test_out_lgbm.shape[0])


In [None]:
# Make a file with flagged Customer ID's and number of transactions

pd.DataFrame(test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35].groupby(['customer_id']).size()).reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mule Customer IDs ML Detected.txt', index=False)

In [None]:
# Make a file with flagged Account Numbers and number of transactions

pd.DataFrame(test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35].groupby(['account_number']).size()).reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mule Accounts ML Detected.txt', index=False)

In [None]:
# Make a file with the flagged transactions

pd.DataFrame(test_out_lgbm.loc[test_out_lgbm['ML_Score'] >= 0.35][['transaction_id','account_number','customer_id', 'transaction_datetime']]).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\Mule Transactions ML Detected.txt',
                                                                                                                            index=False)

#### Social Engineering Target

In [None]:
df_train_se[['transaction_id',
'Fraud',
'transaction_type',
'to_bank_code',
'TSLastMFTxn_sametoacc_mins',
'Dep_MaxAmount_L30D',
'Dep_MaxBalance_L30D',
'Dep_AvgAmoutL30D',
'Dep_AvgBalance_L30D',
'Max_Amt_L1D',
'Dep_Unique_Amount_L30D',
'Txn_Count_Dep_40_L30D',
'Txn_Count_Dep_L30D',
'Customer_Age',
'TSLastMFTxn_mins',
'Dep_Unique_TellerID_L30D',
'Avg_Amt_L1D',
'Dep_Unique_Branch_L30D',
'Avg_Amt_SameToAcc_L1H']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\Train_SE.csv')

In [None]:
data.loc[data['flag'] == 'TEST'][['transaction_id',
'Fraud',
'transaction_type',
'to_bank_code',
'TSLastMFTxn_sametoacc_mins',
'Dep_MaxAmount_L30D',
'Dep_MaxBalance_L30D',
'Dep_AvgAmoutL30D',
'Dep_AvgBalance_L30D',
'Max_Amt_L1D',
'Dep_Unique_Amount_L30D',
'Txn_Count_Dep_40_L30D',
'Txn_Count_Dep_L30D',
'Customer_Age',
'TSLastMFTxn_mins',
'Dep_Unique_TellerID_L30D',
'Avg_Amt_L1D',
'Dep_Unique_Branch_L30D',
'Avg_Amt_SameToAcc_L1H']].fillna(-1).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files for MLS\TEST_SE.csv')

In [None]:
test_result_se = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\TEST Data with SE ML Score.csv')

In [None]:
test_result_se.reset_index().head()

In [None]:
test_result_se.shape

In [None]:
test_base = data.loc[data['flag'] == 'TEST'][['transaction_id']].reset_index()

In [None]:
test_result_se = pd.concat([test_base, test_result_se], axis = 1) 

In [None]:
test_result_se.shape

In [None]:
test_result_se.head()

In [None]:
test_out_se_lgbm = test_result_se[['transaction_id','ML_Score']].merge(data[['transaction_id','customer_id','account_number','transaction_datetime','transaction_type']],
                                                     on = ['transaction_id'],
                                                     how = 'left')

In [None]:
print(test_out_se_lgbm.shape[0])
test_out_se_lgbm = test_out_se_lgbm.loc[test_out_se_lgbm['transaction_type'].str.len() > 3]
print(test_out_se_lgbm.shape[0])

In [None]:
test_out_se_lgbm.drop(columns = ['transaction_type']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\TEST_SE_LGBM_ML_Score.txt',
                                                       index=False)

In [None]:
test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= 0.16].shape[0]

In [None]:
test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= 0.16].shape[0]/test_out_se_lgbm.shape[0]

In [None]:
test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= 0.16]['customer_id'].nunique()

#### Output TEST Records for TTB checking June-27 ([Social Engineering])

In [None]:
threshold = 0.25

In [None]:
print('Number of flagged SE accounts (ML Score >= threshold):', test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold]['account_number'].nunique())
print('Total Number of Accounts in TEST data:', test_out_se_lgbm['account_number'].nunique())
print('Proportion of Accounts getting Flagged:', test_out_se_lgbm.loc[test_out_lgbm['ML_Score'] >= threshold]['account_number'].nunique()/test_out_se_lgbm['account_number'].nunique())


In [None]:
print('Number of flagged Customers (ML Score >= threshold):', test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold]['customer_id'].nunique())
print('Total Number of Customers in TEST data:', test_out_se_lgbm['customer_id'].nunique())
print('Proportion of Customers getting Flagged:', test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold]['customer_id'].nunique()/test_out_se_lgbm['customer_id'].nunique())


In [None]:
print('Number of flagged Transactions (ML Score >= threshold):', test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold].shape[0])
print('Total Number of Transactions in TEST data:', test_out_se_lgbm['customer_id'].shape[0])
print('Proportion of Transactions getting Flagged:', test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold].shape[0]/test_out_se_lgbm.shape[0])


In [None]:
# Make a file with flagged Account Numbers and number of transactions

pd.DataFrame(test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold].groupby(['account_number']).size()).reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\SE Accounts ML Detected.txt', index=False)

In [None]:
# Make a file with flagged Customer ID's and number of transactions

pd.DataFrame(test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold].groupby(['customer_id']).size()).reset_index().rename(columns = {0:'Number of Transactions'}).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\SE Customer IDs ML Detected.txt', index=False)

In [None]:
# Make a file with the flagged transactions

pd.DataFrame(test_out_se_lgbm.loc[test_out_se_lgbm['ML_Score'] >= threshold][['transaction_id','account_number','customer_id', 'transaction_datetime']]).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\Sales\TTB\PoC 2\Data\Files from MLS\SE Transactions ML Detected.txt',
                                                                                                                            index=False)

### RCBC

In [71]:
df = pd.read_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\RCBC\Post-Implementation\12_1_20230420135526_Score_Validation (2)\Data.csv')

In [72]:
df.head()

Unnamed: 0,Identifier,Fraud_Clean,Score
0,497232424,1.0,0.009453
1,497328709,1.0,0.009409
2,497480613,1.0,0.021668
3,497543153,1.0,0.078542
4,497585300,1.0,0.002321


In [None]:
# Round the score to 4 decimal places
df['Score_Round'] = round(df['Score'],4)

# Sort by the rounded score
df.sort_values(by = 'Score_Round', ascending = False)

# Create Total and Fraud columns
df1 = pd.DataFrame(df.groupby(['Score_Round'], dropna=False).size().reset_index().sort_values(by='Score_Round',ascending=False)).rename(columns = {0:'Total'})
df2 = pd.DataFrame(df.groupby(['Score_Round'], dropna=False)['Fraud_Clean'].sum().reset_index().sort_values(by='Score_Round',ascending=False)).rename(columns={'Fraud_Clean':'Fraud'})

# Merge the two previous dataframes, add the remaining columns
df3=df1.merge(df2,on ='Score_Round',how='left')
df3['Clean']= df3['Total']-df3['Fraud']
df3['Alerts']=df3['Total'].cumsum()
df3['Total_Fraud']=df3['Fraud'].cumsum()
df3['Total_Clean']=df3['Clean'].cumsum()
df3['TP']=df3['Total_Fraud']
df3['FP']=df3['Total_Clean']
df3['TN']= df3['Clean'].sum()-df3['Total_Clean']
df3['FN']=df3['Fraud'].sum()-df3['Total_Fraud']
df3['Alert_Rate']= df3['Alerts']/df3['Total'].sum()
df3['Recall']=df3['TP']/(df3['TP']+df3['FN'])
df3['Precision']=df3['TP']/(df3['TP']+df3['FP'])

# Write the dataframe to a CSV file
df3.drop(columns = ['TP','TN','FP','FN']).to_csv(r'C:\Users\AEmslie\OneDrive - GB Group PLC\RCBC\Post-Implementation\C09_ScoreDist.csv', index = False)


In [73]:
from sklearn import metrics

In [74]:
y_true = df['Fraud_Clean'].to_numpy()
y_pred = df['Score'].to_numpy()

In [75]:
auc = metrics.roc_auc_score(y_true, y_pred)
print(auc)

0.9694599140041499


In [103]:
df['Score'].describe(percentiles = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]).apply("{0:.6f}".format)

count    5034195.000000
mean           0.000878
std            0.006756
min            0.000054
10%            0.000072
20%            0.000078
30%            0.000090
40%            0.000099
50%            0.000119
60%            0.000181
70%            0.000544
80%            0.000786
90%            0.001268
max            0.459609
Name: Score, dtype: object