# Bank Churn ML

Imports

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import copy

### Load Data

- Order of Operations
  - Load Customer Dataset
  - Perform necessary Customer Dataset cleaning
  - Convert Customer Dataset to transactional rows
  - Load Transactional Dataset
  - Append transactional Customer Dataset to top of Transactional Dataset

In [None]:
# Load customer dataset and keep a 1000-row working sample.
# The sample is an explicit .copy() so that it is an independent DataFrame:
# assigning new columns to a plain slice of c_df_original makes pandas emit
# SettingWithCopyWarning.
c_df_original = pd.read_csv(r"C:\bank-churn-data\customers_tm1_e.csv")
c_df = c_df_original.iloc[:1000].copy()
c_array = c_df.to_numpy()

# Convert customer data to transactional rows: one row per customer whose
# date is the account creation date and whose amount/deposit is the starting
# balance (account_id and withdrawal are filled with 0).
# NOTE: this cell only builds c_to_t; it does not append it to the
# transactional dataset.
c_to_t = pd.DataFrame(
    {
        'date': c_df['creation_date'],
        'account_id': 0,
        'customer_id': c_df['customer_id'],
        'amount': c_df['start_balance'],
        'transaction_date': c_df['creation_date'],
        'deposit': c_df['start_balance'],
        'withdrawal': 0,
    }
)

# Load transactional dataset and keep an independent 1000-row working sample
# (copied for the same reason as c_df: later cells add columns to t_df).
t_df_original = pd.read_csv(r"C:\bank-churn-data\transactions_tm1_e.csv")
t_df = t_df_original.iloc[:1000].copy()
t_array = t_df.to_numpy()



### Profiling the Datasets

In [None]:
# Count the missing entries in each column of the two full (unsampled) datasets
missing_t_df, missing_c_df = t_df_original.isna(), c_df_original.isna()

# Summing the boolean masks down the rows gives one count per column
missing_t = missing_t_df.sum()
missing_c = missing_c_df.sum()

for per_column_counts in (missing_t, missing_c):
    print(per_column_counts)

In [None]:
# Print each dataset's column names followed by its first two rows,
# transactions first and customers second, for ease of reference
preview_parts = [
    t_df.columns, '\n', t_array[:2],
    '\n\n',
    c_df.columns, '\n', c_array[:2],
]
print(*preview_parts)

In [None]:
# Sanity check that each customer_id has only 1 account_id:
# count the distinct account_ids per customer, then show the counts,
# the type of the result, and the largest count found
accounts_by_customer = t_df.groupby('customer_id')['account_id']
ex_df = accounts_by_customer.nunique()

for shown in (ex_df, type(ex_df), ex_df.max()):
    display(shown)

In [None]:
# Failed Testing
# Pull three sample rows (positions 2, 3 and 4) to look at
practice = t_df.iloc[[2, 3, 4]]
display(practice)

# magnitude = |amount| + |deposit| + |withdrawal| for every transaction
abs_amount = t_df['amount'].abs()
abs_deposit = t_df['deposit'].abs()
abs_withdrawal = t_df['withdrawal'].abs()
t_df['magnitude'] = abs_amount + abs_deposit + abs_withdrawal

# Show only the transactions whose amount is non-zero
nonzero_amount = t_df['amount'] != 0
display(t_df[nonzero_amount])


### Transformations

Features to add list:
- From transactions, groupby(['account_id', month('transaction_date')])
  - sum(deposit) in month ~ total deposited
  - sum(withdrawal) in month ~ total withdrawn
  - sum(amount) in month ~ total delta
  - count(deposit != 0) in month ~ total deposits
  - count(withdrawal != 0) in month ~ total withdrawals
  - count(amount != 0) in month ~ total transactions
  - month('transaction_date') - {customer_id:dob} ~ age of customer at given time
  - month('transaction_date') - {customer_id:creation_date} ~ age of account at given time
  - cumsum(amount) ~ total in account at end of month
  - df['transaction_date'].shift(-1) - 'transaction_date' ~ period of time between this transaction and the next transaction (or 6/1/2020 when there is no next transaction)
  - ? Whether there was a no-amount transaction? Would those be interpreted as just balance checks?

- From customers (also have to factor in having multiple accounts):
  - initials['customer_id', 'start_balance']
  - ~ time since last transaction
  - ~ current age
  - ~ current age of account
  - ~ current account balance
  - ~ last transaction amount
  - ~ period of time between first and last transaction (+ activity rate)
  - ~ total number of withdrawals made (+ ratio over time)
  - ~ total number of deposits made (+ ratio over time)
  - ~ Number of 

- ideas
  - transactional data must be analyzed in a time-series manner while customer data would be through a more standard logistic model?
  - People with many accounts will be more likely to close AN account (1 of them) since they have other accounts usable
    - People with only 1 account will be less likely to close their account
  - look if a certain quantity of withdrawal (amount, proportion of account) in a given month correlates with churning
    - for example, if they withdraw X% of their account (which may be associated with subsequent inactivity) => churn
  - Period of time between this transaction and next transaction (or with 6/1/2020)
    - naturally older churned accounts will have a much higher forward inactivity value.
    - What's the average of churned and un-churned accounts and how well does one forward inactivity value distinguish between them
  - Someone who deposits more than they withdraw in general probably is not churning over a long enough timespan?
    - big withdrawals should be counted for more, though, since an account may get filled steadily but being brought down to 0 should be more significant
    - So must be combined with info on final account value
  - 
  - 


In [None]:
# Adds month and year columns plus 0/1 flag columns marking whether each
# transaction row was a deposit, a withdrawal, or a zero-amount transaction
# (possibly a balance check?).
# The flag columns are summed per group later to count deposits and
# withdrawals per month.

# Work on an independent copy so these transformation tests don't interfere
# with other attempted transformations (DataFrame.copy() is deep by default).
t_df_time_t = t_df.copy()

# Parse the date column once and reuse it for both calendar columns
transaction_dates = pd.to_datetime(t_df_time_t['transaction_date'])
t_df_time_t['month'] = transaction_dates.dt.month
t_df_time_t['year'] = transaction_dates.dt.year

# Vectorized flags: 1 where the value is non-zero (deposit / withdrawal) or
# exactly zero (amount), otherwise 0. Missing values compare False and so
# yield 0, the same result the row-wise lambda versions produced.
t_df_time_t['deposit_y'] = t_df_time_t['deposit'].abs().gt(0).astype('int64')
t_df_time_t['withdrawal_y'] = t_df_time_t['withdrawal'].abs().gt(0).astype('int64')
t_df_time_t['checked_balance'] = t_df_time_t['amount'].eq(0).astype('int64')
display(t_df_time_t)

In [None]:
# Group by customer_id, year, month and build one row per customer-month:
#   sum_deposit / sum_withdrawal / sum_amount - totals within the month
#   count_deposit / count_withdrawal          - number of each within the month
#   checked_balance                           - 1 if any zero-amount transaction occurred
#   mean_deposit / mean_withdrawal            - average size within the month
#   running_balance                           - cumulative sum_amount per customer

# Named aggregation already states which source column feeds each output, so
# no column pre-selection is needed. (Selecting groupby columns with a tuple,
# e.g. gb['a', 'b'], raises ValueError in pandas >= 2.0.)
monthly_keys = ["customer_id", "year", "month"]
t_df_time_grouped = t_df_time_t.groupby(monthly_keys).agg(
    sum_deposit=("deposit", "sum"),
    sum_withdrawal=("withdrawal", "sum"),
    sum_amount=("amount", "sum"),
    count_deposit=("deposit_y", "sum"),
    count_withdrawal=("withdrawal_y", "sum"),
    checked_balance=("checked_balance", "max"),
)

# Divide by at least 1 so months with no deposits/withdrawals get a mean of
# sum/1 instead of a division by zero.
t_df_time_grouped['mean_deposit'] = (
    t_df_time_grouped['sum_deposit'] / t_df_time_grouped['count_deposit'].clip(lower=1))
t_df_time_grouped['mean_withdrawal'] = (
    t_df_time_grouped['sum_withdrawal'] / t_df_time_grouped['count_withdrawal'].clip(lower=1))

# Rows are ordered by (customer_id, year, month) after the groupby, so the
# cumulative sum within each customer is the balance change up to that month.
t_df_time_grouped['running_balance'] = (
    t_df_time_grouped.groupby(level='customer_id')['sum_amount'].cumsum())
display(t_df_time_grouped)

## This is a Customer Dataset transformation - There's a simpler method below!
#final_c_balances =t_df_time_grouped.groupby('customer_id')['running_balance'].last()



Create transactional rows based off customer dataset rows (Box below complete)

Summary Stats for Customer Dataset

In [None]:
# REDUNDANT, WILL CREATE USING GROUPBY.SUM ON TRANSACTIONAL TABLE + ACCOUNT CREATION TRANSACTIONS
# Code to create user_final_balance feature with final account balance

# start_balance as a Series indexed by customer_id. Built as a new Series
# rather than re-indexing a column pulled out of c_df in place.
c_customer_ids = list(c_df.loc[:, 'customer_id'].values)
c_start_balance = pd.Series(
    c_df['start_balance'].to_numpy(), index=c_customer_ids, name='start_balance')

# Group transactions by customer and sum amount - one transaction delta per customer
t_transaction_sum = t_df.groupby('customer_id')['amount'].sum()

# Align the deltas to the customers in c_df. A customer with no rows in t_df
# gets a delta of 0, so their final balance is their start balance instead of
# NaN. Customers that appear only in t_df have no known start balance and are
# left out.
transaction_deltas = t_transaction_sum.reindex(c_start_balance.index, fill_value=0)
user_final_balances = c_start_balance.add(transaction_deltas)
display(user_final_balances)

# Spot check that start balance + transactional delta = the user_final_balances
# entry for one customer; skipped when that customer is missing from either
# sample, where a direct lookup would raise KeyError.
check_id = 91
if check_id in c_start_balance.index and check_id in t_transaction_sum.index:
    print(f"start+delta:{c_start_balance[check_id]+t_transaction_sum[check_id]} = total:{user_final_balances[check_id]}")
else:
    print(f"customer_id {check_id} is not in both samples; spot check skipped")

In [None]:
# Creates first ('min') and last ('max') transaction date features per customer.
# These are to be attached to the Customer dataset.
# NOTE: the difference between the two dates is not computed in this cell.

# Parse the whole column in one vectorized call (same approach as the
# month/year cell) instead of calling pd.to_datetime once per row.
t_df['transaction_date'] = pd.to_datetime(t_df['transaction_date'])
t_min_max_date = t_df.groupby('customer_id')['transaction_date'].agg(['min', 'max'])
display(t_min_max_date)
