# Bank Churn ML

Imports

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import copy

### Load Data

- Order of Operations
  - Load Customer Dataset
  - Perform necessary Customer Dataset cleaning
  - Convert Customer Dataset to transactional rows
  - Load Transactional Dataset
  - Append transactional Customer Dataset to top of Transactional Dataset

In [20]:
# Load customer dataset
c_df_original = pd.read_csv(r"C:\bank-churn-data\customers_tm1_e.csv")
c_df = c_df_original[:1000]
c_array = c_df.to_numpy()

# Convert customer data to transactional rows
c_to_t = pd.DataFrame()
c_to_t['date'] = c_df['creation_date']
c_to_t['account_id'] = 0
c_to_t['customer_id'] = c_df['customer_id']
c_to_t['amount'] = c_df['start_balance']
c_to_t['transaction_date'] = c_df['creation_date']
c_to_t['deposit'] = c_df['start_balance'].apply(lambda x: 0 if pd.isna(x) else x)
c_to_t['withdrawal'] = 0
print(c_df_original.shape)

# Load transactional dataset
t_df_original = pd.read_csv(r"C:\bank-churn-data\transactions_tm1_e.csv")
t_df_original = pd.concat([c_to_t, t_df_original])
t_df = t_df_original.iloc[:1000]
t_array = t_df.to_numpy()
display(t_df_original)



(116016, 5)


Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal
0,2007-01-31,0,91,10180.56,2007-01-31,10180.56,0.00
1,2007-01-31,0,92,4757.68,2007-01-31,4757.68,0.00
2,2007-01-31,0,93,6796.72,2007-01-31,6796.72,0.00
3,2007-01-31,0,94,9870.48,2007-01-31,9870.48,0.00
4,2007-01-31,0,95,12500.72,2007-01-31,12500.72,0.00
...,...,...,...,...,...,...,...
4977967,2020-05-31,24253959,116103,-3056.19,2020-05-09,0.00,-3056.19
4977968,2020-05-31,24253960,116104,2900.20,2020-05-31,2900.20,0.00
4977969,2020-05-31,24253960,116104,-4002.30,2020-05-29,0.00,-4002.30
4977970,2020-05-31,24253961,116105,2246.93,2020-05-31,2246.93,0.00


### Profiling the Datasets

In [19]:
# Shows missing values per column of each dataset
missing_t_df = t_df_original.isna()
missing_c_df = c_df_original.isna()
missing_t = missing_t_df.sum(axis=0)
missing_c = missing_c_df.sum(axis=0)
print(missing_t)
print(missing_c)
missing_c_starting_balance = c_df_original['start_balance'].isna()
display(c_df_original.loc[missing_c_starting_balance])
print(c_to_t['deposit'].isna().sum())
display(c_to_t)

date                0
account_id          0
customer_id         0
amount              0
transaction_date    0
deposit             0
withdrawal          0
dtype: int64
customer_id      0
dob              0
state            0
start_balance    3
creation_date    0
dtype: int64


Unnamed: 0,customer_id,dob,state,start_balance,creation_date
21243,21334,1974-09-06,Missouri,,2010-01-31
45891,45982,1991-09-15,New York,,2012-08-31
95939,96030,1997-04-09,Missouri,,2017-10-31


0


Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal
0,2007-01-31,0,91,10180.56,2007-01-31,10180.56,0
1,2007-01-31,0,92,4757.68,2007-01-31,4757.68,0
2,2007-01-31,0,93,6796.72,2007-01-31,6796.72,0
3,2007-01-31,0,94,9870.48,2007-01-31,9870.48,0
4,2007-01-31,0,95,12500.72,2007-01-31,12500.72,0
...,...,...,...,...,...,...,...
995,2007-01-31,0,1086,3663.48,2007-01-31,3663.48,0
996,2007-01-31,0,1087,6089.60,2007-01-31,6089.60,0
997,2007-01-31,0,1088,11928.68,2007-01-31,11928.68,0
998,2007-01-31,0,1089,7000.36,2007-01-31,7000.36,0


In [None]:
# Prints column names and the lake for ease of reference
print(t_df.columns,'\n',t_array[:2],'\n\n',c_df.columns,'\n',c_array[:2])

In [None]:
# Code I ran to show that each customer_id has only 1 account_id
ex_df = t_df.groupby('customer_id')['account_id'].nunique()
display(ex_df)
display(type(ex_df))
display(ex_df.max())

In [None]:
# Failed Testing
practice = t_df.iloc[[2,3,4]]
display(practice)

t_df['magnitude'] = t_df['amount'].map(abs) + t_df['deposit'].map(abs) + t_df['withdrawal'].map(abs)
display(t_df.query('amount != 0'))


### Transformations

Features to add list:
- From transactions, groupby(['account_id', month('transaction_date')])
  - sum(deposit) in month ~ total deposited
  - sum(withdrawal) in month ~ total withdrawn
  - sum(amount) in month ~ total delta
  - count(deposit != 0) in month ~ total deposits
  - count(withdrawal != 0) in month ~ total withdrawals
  - count(amount != 0) in month ~ total transactions
  - month('transaction_date') - {customer_id:dob} ~ age of customer at given time
  - month('transaction_date') - {customer_id:creation_date} ~ age of account at given time
  - cumsum(amount) ~ total in account at end of month
  - 'transaction_date' - df['transaction_date'].shift(1)~ Period of time between this transaction and next transaction (or with 6/1/2020)
  - ? Whether there was a no-amount transaction? Would those be interpreted as just balance checks?

- From customers (also have to factor in having multiple accounts):
  - initials['customer_id', 'start_balance']
  - ~ time since last transaction
  - ~ current age
  - ~ current age of account
  - ~ current account balance
  - ~ last transaction amount
  - ~ period of time between first and last transaction (+ activity rate)
  - ~ total number of withdrawals made (+ ratio over time)
  - ~ total number of deposits made (+ ratio over time)
  - ~ Number of 

- ideas
  - transactional data must be analyzed in a time-series manner while customer data would be through a more standard logistical model?
  - People with many accounts will be more likely to close AN account (1 of them) since they have other accounts usable
    - People with only 1 account will be less likely to close their account
  - look if a certain quantity of withdrawal (amount, proportion of account) in a given month correlates with churning
    - for example, if they withdrawal X% of their account (which may be associated with subsequent inactivity) => churn
  - Period of time between this transaction and next transaction (or with 6/1/2020)
    - naturally older churned accounts will have a much higher forward inactivity value.
    - What's the average of churned and un-churned accounts and how well does one forward inactivity value distinguish between them
  - Someone who deposits more than they withdrawal in general probably is not churning over a long enough timespan?
    - big withdrawals should be counted for more tho since an account may get filled steadily but being brought down to 0 should be more significant
    - So must be combined with info on final account value
  - 
  - 


In [47]:
# Adds a month and year date columns and binary columns to check if the transaction
# was a deposit, withdrawal, or check_balance (?, a transaction with amount=0) occurred on each line
    # The binary column will be used in group summation to find # of deposits and withdrawals per month

# deepcopy just to be very safe that these transformation tests don't interfere
# with other attempted transformations


t_df_original['transaction_date'] = t_df_original['transaction_date'].apply(pd.to_datetime)
t_df_original = t_df_original.sort_values(by=['customer_id', 'transaction_date'])
t_df_original['month'] = t_df_original['transaction_date'].dt.month
t_df_original['year'] = t_df_original['transaction_date'].dt.year
t_df_original['deposit_y'] = t_df_original['deposit'].apply(lambda x: 1 if abs(x) > 0 else 0)
t_df_original['withdrawal_y'] = t_df_original['withdrawal'].apply(lambda x: 1 if abs(x) > 0 else 0)
t_df_original['checked_balance'] = t_df_original['amount'].apply(lambda x: 1 if abs(x) == 0 else 0)
t_df_original['time_since_last_trans'] = t_df_original.groupby('customer_id')['transaction_date'].diff()
t_df_original['time_since_last_trans_in_days'] = t_df_original['time_since_last_trans'].dt.total_seconds()/(60*60*24)
display(t_df_original)

Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal,month,year,deposit_y,withdrawal_y,checked_balance,time_since_last_trans,time_since_last_trans_in_days
1,2007-01-31,24137947,91,-5295.18,2007-01-16,0.00,-5295.18,1,2007,0,1,0,NaT,
0,2007-01-31,0,91,10180.56,2007-01-31,10180.56,0.00,1,2007,1,0,0,15 days,15.0
0,2007-01-31,24137947,91,3034.26,2007-01-31,3034.26,0.00,1,2007,1,0,0,0 days,0.0
2,2007-02-28,24137947,91,0.00,2007-02-28,0.00,0.00,2,2007,0,0,1,28 days,28.0
4,2007-03-31,24137947,91,-0.00,2007-03-11,0.00,-0.00,3,2007,0,0,1,11 days,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4977965,2020-05-31,24253959,116103,3354.40,2020-05-31,3354.40,0.00,5,2020,1,0,0,22 days,22.0
4977969,2020-05-31,24253960,116104,-4002.30,2020-05-29,0.00,-4002.30,5,2020,0,1,0,NaT,
4977968,2020-05-31,24253960,116104,2900.20,2020-05-31,2900.20,0.00,5,2020,1,0,0,2 days,2.0
4977970,2020-05-31,24253961,116105,2246.93,2020-05-31,2246.93,0.00,5,2020,1,0,0,NaT,


In [22]:
# Group by customer_id, year, month
# Creates sum_deposit, sum_withdrawal, sum_amount per month
# and count_deposit and count_withdrawal (counts number of each per month)
# and binary check if the customer checked their balance
# Adds mean of deposits and withdrawals in each month and the running balance at end of month

t_df_grouped = t_df_original.groupby(
    ["customer_id","year","month"])['amount','deposit','withdrawal',
                                    'deposit_y','withdrawal_y','checked_balance'].agg(
    sum_deposit = ("deposit","sum"),
    sum_withdrawal = ("withdrawal","sum"),
    count_deposit = ("deposit_y", "sum"),
    count_withdrawal = ("withdrawal_y", 'sum'),
    checked_balance = ("checked_balance",'max')
)

t_df_grouped['sum_delta'] = t_df_grouped['sum_deposit'] + t_df_grouped['sum_withdrawal']
t_df_grouped['mean_deposit'] = (
    t_df_grouped['sum_deposit']/t_df_grouped['count_deposit'].apply(lambda x:max(1,x)))
t_df_grouped['mean_withdrawal'] = (
    t_df_grouped['sum_withdrawal']/t_df_grouped['count_withdrawal'].apply(lambda x:max(1,x)))

t_df_grouped['running_balance'] = t_df_grouped.groupby('customer_id')['sum_delta'].cumsum()
display(t_df_grouped)

## This is a Customer Dataset transformation - There's a simpler method below!
#final_c_balances =t_df_time_grouped.groupby('customer_id')['running_balance'].last()



  t_df_grouped = t_df_original.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum_deposit,sum_withdrawal,sum_amount,count_deposit,count_withdrawal,checked_balance,sum_delta,mean_deposit,mean_withdrawal,running_balance
customer_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
91,2007,1,13214.82,-5295.18,7919.64,2,1,0,7919.64,6607.41,-5295.18,7919.64
91,2007,2,0.00,0.00,0.00,0,0,1,0.00,0.00,0.00,7919.64
91,2007,3,0.00,0.00,0.00,0,0,1,0.00,0.00,0.00,7919.64
92,2007,1,4757.68,0.00,4757.68,1,0,1,4757.68,4757.68,0.00,4757.68
92,2007,2,1164.90,0.00,1164.90,1,0,0,1164.90,1164.90,0.00,5922.58
...,...,...,...,...,...,...,...,...,...,...,...,...
116102,2020,5,872.99,-1223.94,-350.95,1,1,0,-350.95,872.99,-1223.94,-350.95
116103,2020,5,3354.40,-4357.62,-1003.22,1,2,0,-1003.22,3354.40,-2178.81,-1003.22
116104,2020,5,2900.20,-4002.30,-1102.10,1,1,0,-1102.10,2900.20,-4002.30,-1102.10
116105,2020,5,2246.93,0.00,2246.93,1,0,0,2246.93,2246.93,0.00,2246.93


Create transactional rows based off customer dataset rows (Box below complete)

Summary Stats for Customer Dataset

In [48]:
c_summary = t_df_original.groupby(
    "customer_id")['deposit','withdrawal', 'deposit_y', 'withdrawal_y','checked_balance',
                   'transaction_date','time_since_last_trans_in_days'].agg(
    total_deposits = ("deposit","sum"),
    total_withdrawals = ("withdrawal","sum"),
    num_deposit = ("deposit_y", "sum"),
    num_withdrawal = ("withdrawal_y", 'sum'),
    num_checked_balance = ("checked_balance",'max'),
    num_transactions = ('deposit','count'),
    first_transaction = ('transaction_date','first'),
    last_transaction = ('transaction_date','last'),
    mean_time_between_trans_in_days = ('time_since_last_trans_in_days','mean')
    )

c_summary['final_balance'] = c_summary['total_deposits'] + c_summary['total_withdrawals']
# act_lifespan - lifespan of account, time from first transaction to last transaction
c_summary['act_lifespan_in_days'] = c_summary['last_transaction'] - c_summary['first_transaction']
c_summary['act_lifespan_in_days'] = c_summary['act_lifespan_in_days'].dt.total_seconds()/(60*60*24) + 1
# time_since_last_trans - time between 2020-06-01 (end of data window) and last transaction of customer
c_summary['time_since_last_trans_in_days'] = pd.to_datetime('2020-06-01') - c_summary['last_transaction']
c_summary['time_since_last_trans_in_days'] = c_summary['time_since_last_trans_in_days'].dt.total_seconds()/(60*60*24)
c_summary['transactional_rate'] = c_summary['num_transactions']/c_summary['act_lifespan_in_days']
display(c_summary)

  c_summary = t_df_original.groupby(


Unnamed: 0_level_0,total_deposits,total_withdrawals,num_deposit,num_withdrawal,num_checked_balance,num_transactions,first_transaction,last_transaction,mean_time_between_trans_in_days,final_balance,act_lifespan_in_days,time_since_last_trans_in_days,transactional_rate
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
91,13214.82,-5295.18,2,1,1,7,2007-01-16,2007-03-30,12.166667,7919.64,74.0,4812.0,0.094595
92,18049.77,-7629.15,11,16,1,32,2007-01-31,2008-03-14,13.161290,10420.62,409.0,4462.0,0.078240
93,26619.33,-18965.15,11,19,0,30,2007-01-31,2007-11-30,10.448276,7654.18,304.0,4567.0,0.098684
94,27674.87,-16791.91,7,18,1,34,2007-01-01,2007-10-31,9.181818,10882.96,304.0,4597.0,0.111842
95,73832.84,-48886.87,18,26,1,48,2007-01-31,2008-08-31,12.297872,24945.97,579.0,4292.0,0.082902
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116102,872.99,-1223.94,1,1,0,2,2020-05-12,2020-05-31,19.000000,-350.95,20.0,1.0,0.100000
116103,3354.40,-4357.62,1,2,0,3,2020-05-09,2020-05-31,11.000000,-1003.22,23.0,1.0,0.130435
116104,2900.20,-4002.30,1,1,0,2,2020-05-29,2020-05-31,2.000000,-1102.10,3.0,1.0,0.666667
116105,2246.93,0.00,1,0,0,1,2020-05-31,2020-05-31,,2246.93,1.0,1.0,1.000000
