# Bank Churn ML

Imports

In [3]:
import numpy as np
import pandas as pd
import datetime as dt
import copy

### Load Data

- Order of Operations
  - Load Customer Dataset
  - Perform necessary Customer Dataset cleaning
  - Convert Customer Dataset to transactional rows
  - Load Transactional Dataset
  - Append transactional Customer Dataset to top of Transactional Dataset

In [4]:
# Load customer dataset
c_df_original = pd.read_csv(r"C:\bank-churn-data\customers_tm1_e.csv")
c_df = c_df_original[:1000]
c_array = c_df.to_numpy()

# Convert customer data to transactional rows
c_to_t = pd.DataFrame()
c_to_t['date'] = c_df['creation_date']
c_to_t['account_id'] = 0
c_to_t['customer_id'] = c_df['customer_id']
c_to_t['amount'] = c_df['start_balance']
c_to_t['transaction_date'] = c_df['creation_date']
c_to_t['deposit'] = c_df['start_balance']
c_to_t['withdrawal'] = 0

# Load transactional dataset
t_df_original = pd.read_csv(r"C:\bank-churn-data\transactions_tm1_e.csv")
t_df = t_df_original.iloc[:1000]
t_array = t_df.to_numpy()



### Profiling the Datasets

In [3]:
# Shows missing values per column of each dataset
missing_t_df = t_df_original.isna()
missing_c_df = c_df_original.isna()
missing_t = missing_t_df.sum(axis=0)
missing_c = missing_c_df.sum(axis=0)
print(missing_t)
print(missing_c)

date                0
account_id          0
customer_id         0
amount              0
transaction_date    0
deposit             0
withdrawal          0
dtype: int64
customer_id      0
dob              0
state            0
start_balance    3
creation_date    0
dtype: int64


In [4]:
# Prints column names and the lake for ease of reference
print(t_df.columns,'\n',t_array[:2],'\n\n',c_df.columns,'\n',c_array[:2])

Index(['date', 'account_id', 'customer_id', 'amount', 'transaction_date',
       'deposit', 'withdrawal'],
      dtype='object') 
 [['2007-01-31' 24137947 91 3034.26 '2007-01-31' 3034.26 0.0]
 ['2007-01-31' 24137947 91 -5295.18 '2007-01-16' 0.0 -5295.18]] 

 Index(['customer_id', 'dob', 'state', 'start_balance', 'creation_date'], dtype='object') 
 [[91 '1993-07-01' 'California' 10180.56 '2007-01-31']
 [92 '1985-12-05' 'New York' 4757.68 '2007-01-31']]


In [5]:
# Code I ran to show that each customer_id has only 1 account_id
ex_df = t_df.groupby('customer_id')['account_id'].nunique()
display(ex_df)
display(type(ex_df))
display(ex_df.max())

customer_id
91     1
92     1
93     1
94     1
95     1
96     1
97     1
98     1
99     1
100    1
101    1
102    1
103    1
104    1
105    1
106    1
107    1
108    1
Name: account_id, dtype: int64

pandas.core.series.Series

1

In [89]:
# Failed Testing
practice = t_df.iloc[[2,3,4]]
display(practice)

t_df['magnitude'] = t_df['amount'].map(abs) + t_df['deposit'].map(abs) + t_df['withdrawal'].map(abs)
display(t_df.query('amount != 0'))


Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal,magnitude
2,2007-02-28,24137947,91,0.0,2007-02-28,0.0,0.0,0.0
3,2007-03-31,24137947,91,-0.0,2007-03-30,0.0,-0.0,0.0
4,2007-03-31,24137947,91,-0.0,2007-03-11,0.0,-0.0,0.0


Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal,magnitude
0,2007-01-31,24137947,91,3034.26,2007-01-31,3034.26,0.00,6068.52
1,2007-01-31,24137947,91,-5295.18,2007-01-16,0.00,-5295.18,10590.36
7,2007-02-28,24137948,92,1164.90,2007-02-28,1164.90,0.00,2329.80
8,2007-03-31,24137948,92,1257.38,2007-03-31,1257.38,0.00,2514.76
9,2007-04-30,24137948,92,1338.12,2007-04-30,1338.12,0.00,2676.24
...,...,...,...,...,...,...,...,...
4977967,2020-05-31,24253959,116103,-3056.19,2020-05-09,0.00,-3056.19,6112.38
4977968,2020-05-31,24253960,116104,2900.20,2020-05-31,2900.20,0.00,5800.40
4977969,2020-05-31,24253960,116104,-4002.30,2020-05-29,0.00,-4002.30,8004.60
4977970,2020-05-31,24253961,116105,2246.93,2020-05-31,2246.93,0.00,4493.86


### Transformations

Features to add list:
- From transactions, groupby(['account_id', month('transaction_date')])
  - sum(deposit) in month ~ total deposited
  - sum(withdrawal) in month ~ total withdrawn
  - sum(amount) in month ~ total delta
  - count(deposit != 0) in month ~ total deposits
  - count(withdrawal != 0) in month ~ total withdrawals
  - count(amount != 0) in month ~ total transactions
  - month('transaction_date') - {customer_id:dob} ~ age of customer at given time
  - month('transaction_date') - {customer_id:creation_date} ~ age of account at given time
  - cumsum(amount) ~ total in account at end of month
  - 'transaction_date' - df['transaction_date'].shift(1)~ Period of time between this transaction and next transaction (or with 6/1/2020)
  - ? Whether there was a no-amount transaction? Would those be interpreted as just balance checks?

- From customers (also have to factor in having multiple accounts):
  - initials['customer_id', 'start_balance']
  - ~ time since last transaction
  - ~ current age
  - ~ current age of account
  - ~ current account balance
  - ~ last transaction amount
  - ~ period of time between first and last transaction (+ activity rate)
  - ~ total number of withdrawals made (+ ratio over time)
  - ~ total number of deposits made (+ ratio over time)
  - ~ Number of 

- ideas
  - transactional data must be analyzed in a time-series manner while customer data would be through a more standard logistical model?
  - People with many accounts will be more likely to close AN account (1 of them) since they have other accounts usable
    - People with only 1 account will be less likely to close their account
  - look if a certain quantity of withdrawal (amount, proportion of account) in a given month correlates with churning
    - for example, if they withdrawal X% of their account (which may be associated with subsequent inactivity) => churn
  - Period of time between this transaction and next transaction (or with 6/1/2020)
    - naturally older churned accounts will have a much higher forward inactivity value.
    - What's the average of churned and un-churned accounts and how well does one forward inactivity value distinguish between them
  - Someone who deposits more than they withdrawal in general probably is not churning over a long enough timespan?
    - big withdrawals should be counted for more tho since an account may get filled steadily but being brought down to 0 should be more significant
    - So must be combined with info on final account value
  - 
  - 


In [59]:
# Adds a month and year date columns and binary columns to check if the transaction
# was a deposit, withdrawal, or check_balance (?, a transaction with amount=0) occurred on each line
    # The binary column will be used in group summation to find # of deposits and withdrawals per month

# deepcopy just to be very safe that these transformation tests don't interfere
# with other attempted transformations
t_df_time_t = copy.deepcopy(t_df)
t_df_time_t['month'] = pd.to_datetime(t_df_time_t.loc[:,'transaction_date']).dt.month
t_df_time_t['year'] = pd.to_datetime(t_df_time_t.loc[:,'transaction_date']).dt.year
t_df_time_t['deposit_y'] = t_df_time_t['deposit'].apply(lambda x: 1 if abs(x) > 0 else 0)
t_df_time_t['withdrawal_y'] = t_df_time_t['withdrawal'].apply(lambda x: 1 if abs(x) > 0 else 0)
t_df_time_t['checked_balance'] = t_df_time_t['amount'].apply(lambda x: 1 if abs(x) == 0 else 0)
display(t_df_time_t)

Unnamed: 0,date,account_id,customer_id,amount,transaction_date,deposit,withdrawal,month,year,deposit_y,withdrawal_y,checked_balance
0,2007-01-31,24137947,91,3034.26,2007-01-31,3034.26,0.00,1,2007,1,0,0
1,2007-01-31,24137947,91,-5295.18,2007-01-16,0.00,-5295.18,1,2007,0,1,0
2,2007-02-28,24137947,91,0.00,2007-02-28,0.00,0.00,2,2007,0,0,1
3,2007-03-31,24137947,91,-0.00,2007-03-30,0.00,-0.00,3,2007,0,0,1
4,2007-03-31,24137947,91,-0.00,2007-03-11,0.00,-0.00,3,2007,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2009-08-31,24137964,108,-270.60,2009-08-22,0.00,-270.60,8,2009,0,1,0
996,2009-08-31,24137964,108,-703.16,2009-08-22,0.00,-703.16,8,2009,0,1,0
997,2009-09-30,24137964,108,957.61,2009-09-30,957.61,0.00,9,2009,1,0,0
998,2009-09-30,24137964,108,-1525.12,2009-09-20,0.00,-1525.12,9,2009,0,1,0


In [128]:
# Group by customer_id, year, month
# Creates sum_deposit, sum_withdrawal, sum_amount per month
# and count_deposit and count_withdrawal (counts number of each per month)
# and binary check if the customer checked their balance
# Adds mean of deposits and withdrawals in each month and the running balance at end of month

t_df_time_grouped = t_df_time_t.groupby(
    ["customer_id","year","month"])['amount','deposit','withdrawal',
                                    'deposit_y','withdrawal_y','checked_balance'].agg(
    sum_deposit = ("deposit","sum"),
    sum_withdrawal = ("withdrawal","sum"),
    sum_amount = ("amount","sum"),
    count_deposit = ("deposit_y", "sum"),
    count_withdrawal = ("withdrawal_y", 'sum'),
    checked_balance = ("checked_balance",'max')
)

t_df_time_grouped['mean_deposit'] = (
    t_df_time_grouped['sum_deposit']/t_df_time_grouped['count_deposit'].apply(lambda x:max(1,x)))
t_df_time_grouped['mean_withdrawal'] = (
    t_df_time_grouped['sum_withdrawal']/t_df_time_grouped['count_withdrawal'].apply(lambda x:max(1,x)))

t_df_time_grouped['running_balance'] = t_df_time_grouped.groupby('customer_id')['sum_amount'].cumsum()
display(t_df_time_grouped)

## This is a Customer Dataset transformation - There's a simpler method below!
#final_c_balances =t_df_time_grouped.groupby('customer_id')['running_balance'].last()



  t_df_time_grouped = t_df_time_t.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum_deposit,sum_withdrawal,sum_amount,count_deposit,count_withdrawal,checked_balance,mean_deposit,mean_withdrawal,running_balance
customer_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
91,2007,1,3034.26,-5295.18,-2260.92,1,1,0,3034.26,-5295.1800,-2260.92
91,2007,2,0.00,0.00,0.00,0,0,1,0.00,0.0000,-2260.92
91,2007,3,0.00,0.00,0.00,0,0,1,0.00,0.0000,-2260.92
92,2007,1,0.00,0.00,0.00,0,0,1,0.00,0.0000,0.00
92,2007,2,1164.90,0.00,1164.90,1,0,0,1164.90,0.0000,1164.90
...,...,...,...,...,...,...,...,...,...,...,...
108,2009,6,1149.37,-1961.19,-811.82,1,5,0,1149.37,-392.2380,-2718.97
108,2009,7,902.19,0.00,902.19,1,0,0,902.19,0.0000,-1816.78
108,2009,8,968.66,-1898.05,-929.39,1,4,0,968.66,-474.5125,-2746.17
108,2009,9,957.61,-1525.12,-567.51,1,1,0,957.61,-1525.1200,-3313.68


customer_id
91     -2260.92
92      5662.94
93       857.46
94      1012.48
95     12445.25
96     -2994.25
97      7962.53
98       268.83
99     -9299.73
100    -4669.80
101     3830.65
102     2665.43
103     4916.51
104     3179.21
105     3444.67
106    -5110.83
107    20040.68
108    -2317.55
Name: running_balance, dtype: float64

Create transactional rows based off customer dataset rows (Box below complete)

Summary Stats for Customer Dataset

In [123]:
# REDUNDANT, WILL CREATE USING GROUPBY.SUM ON TRANSACTIONAL TABLE + ACCOUNT CREATION TRANSACTIONS
# Code to create user_final_balance feature with final account balance
# Extract start_balance series indexed by user_id
c_start_balance = c_df.loc[:,'start_balance']
c_customer_ids = list(c_df.loc[:,'customer_id'].values)
c_start_balance.index = c_customer_ids

# Group Transactions by user and sum amount - creates column of transaction deltas
t_transaction_sum = t_df.groupby('customer_id')['amount'].aggregate('sum')

# Adding transaction deltas to starting balances by customer_id SHOULD give users' final account balances
user_final_balances = c_start_balance.add(t_transaction_sum)
display(user_final_balances)

# Was using to check that start balance + transactional delta = the user_final_balances column
print(f"start+delta:{c_start_balance[91]+t_transaction_sum[91]} = total:{user_final_balances[91]}")

91       7919.64
92      10420.62
93       7654.18
94      10882.96
95      24945.97
          ...   
1086         NaN
1087         NaN
1088         NaN
1089         NaN
1090         NaN
Length: 1000, dtype: float64

In [13]:
# Creating first and last transaction features, diff between the 2 feature
# These are to be attached to the Customer dataset

t_df['transaction_date'] = t_df['transaction_date'].apply(pd.to_datetime)
t_min_max_date = t_df.groupby('customer_id')['transaction_date'].aggregate(['min','max'])
display(t_min_max_date)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t_df['transaction_date'] = t_df['transaction_date'].apply(pd.to_datetime)


customer_id
91    2007-03-30
92    2008-03-14
93    2007-11-30
94    2007-10-31
95    2008-08-31
96    2007-08-23
97    2008-03-03
98    2007-04-16
99    2009-04-30
100   2008-02-29
101   2010-02-28
102   2007-02-15
103   2010-04-30
104   2013-09-30
105   2008-03-31
106   2008-03-31
107   2014-01-31
108   2009-10-31
Name: max, dtype: datetime64[ns]