# Bank Churn Data Cleaning and EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, DateLocator
from datetime import datetime, date
from dateutil.relativedelta import relativedelta


In [None]:
# Load the raw customer and transaction extracts from local CSV files
customers_fp = r"C:\Users\GriffenRoweGaddis\OneDrive - Kubrick Group\Desktop\Training\Projects\Bank Churn\data\customers_tm1_e.csv"
tran_fp = r"C:\Users\GriffenRoweGaddis\OneDrive - Kubrick Group\Desktop\Training\Projects\Bank Churn\data\transactions_tm1_e.csv"

# Read both files in order: customers first, then transactions
cust_df, trans_df = (
    pd.read_csv(csv_path) for csv_path in (customers_fp, tran_fp)
)

In [None]:
# Show the column names of each raw table (customers, then transactions)
for _frame in (cust_df, trans_df):
    print(_frame.columns.values)

In [None]:
# Preview the first rows of the customer table
cust_df.head()

### Customer data processing and validation

In [None]:
# Inspect the inferred dtypes, then parse the two date columns
cust_df.info()

for _date_col in ("dob", "creation_date"):
    cust_df[_date_col] = pd.to_datetime(cust_df[_date_col])



In [None]:
# check and clean state columns

# One lookup table for every fix: abbreviations / upper-case spellings map
# to the full state name, while the placeholder or out-of-scope values
# ('UNK', '-999', 'Australia') all collapse into 'N/A'.
state_fixes = {
    'MASS': 'Massachusetts',
    'CALIFORNIA': 'California',
    'NY': 'New York',
    'TX': 'Texas',
    'UNK': 'N/A',
    '-999': 'N/A',
    'Australia': 'N/A',
}
cust_df['state'] = cust_df['state'].replace(state_fixes)


In [None]:
# Per-state tally: count() reports the number of non-null values in every
# remaining column, grouped by the cleaned state label.
state_count = cust_df.groupby('state').count()
print(state_count)

In [None]:
# check and clean date columns:
# print the latest and then the earliest value of each date column
for _date_col in ("dob", "creation_date"):
    print(cust_df[_date_col].max())
    print(cust_df[_date_col].min())

# all dates seem to be coded correctly


In [None]:
# The minimum start_balance is -10000000000.0, which needs to be dropped
print(cust_df["start_balance"].min())

# Remove every customer whose start balance is at or below -100,000
outliers = cust_df.loc[cust_df['start_balance'] <= -100000.0]
cust_df = cust_df.drop(index=outliers.index)

In [None]:
# Summary statistics of start_balance after the outlier removal above
cust_df['start_balance'].describe()

In [None]:
# Count the transactions of every customer and attach the count to the
# customer table as `num_trans`.
trans_df['trans_num'] = trans_df.index  # row label doubles as a transaction id
trans_df['num_trans'] = trans_df.groupby('customer_id')['trans_num'].transform('count')
grouped_tdf = trans_df.groupby('customer_id')['num_trans'].first().reset_index()
cust_df = pd.merge(cust_df, grouped_tdf, how='left', on='customer_id')
# The left join leaves NaN for customers that never appear in trans_df.
# Those customers have zero transactions; keeping NaN would propagate into
# the z-score standardisation and clustering done later in the notebook.
cust_df['num_trans'] = cust_df['num_trans'].fillna(0)
del grouped_tdf


In [None]:
# Count transactions per (customer_id, date) group.
# NOTE(review): the original intent was "average number of transactions per
# month per customer", but this only produces the per-group counts; no
# average is taken here. `date` is grouped as loaded (the pd.to_datetime
# conversion appears further down the notebook) -- presumably it is a
# month-level date; confirm against the data.
trans_num_df = trans_df.groupby(['customer_id','date'])['trans_num'].count()


In [None]:
# Summary statistics: transaction count per customer, and mean deposit /
# withdrawal taken over all transaction rows
trans_per_customer = cust_df['num_trans']
avg_trans_num = trans_per_customer.mean()  # mean number of transactions per customer
sd_trans_num = trans_per_customer.std()    # spread of transactions per customer
avg_deposit = trans_df['deposit'].mean()
avg_withdrawal = trans_df['withdrawal'].mean()

In [None]:
# Create the response variable: risk_churn = 1 when a customer's
# standardized transaction count falls more than one standard deviation
# (of the z-scores) below their mean, otherwise 0.
from scipy.stats import zscore

# z-score standardize the num_trans column.
# A customer without transactions is NaN after the left merge, and zscore
# propagates a single NaN to every output value, which would silently flag
# nobody. Such customers are counted as having zero transactions.
cust_df['z_scores'] = zscore(cust_df['num_trans'].fillna(0))
z_score_sd = cust_df['z_scores'].std()
z_score_avg = cust_df['z_scores'].mean()
threshold = z_score_avg - z_score_sd


cust_df['risk_churn'] = np.where(cust_df['z_scores'] < threshold, 1, 0)

### Transaction Data Cleaning and Processing

In [None]:
# Number of customers flagged as at risk of churn (sum of the 0/1 flag)
display(cust_df['risk_churn'].sum())

In [None]:
# Show the customer table with the engineered columns added so far
display(cust_df)

In [None]:
# Preview the first rows of the transaction table
trans_df.head()


In [None]:
# Column dtypes and non-null counts of the transaction table
trans_df.info()

In [None]:
# Parse both date-like transaction columns into datetimes
for _ts_col in ("transaction_date", "date"):
    trans_df[_ts_col] = pd.to_datetime(trans_df[_ts_col])


In [None]:
# Split the parsed `date` column into the calendar parts used for grouping
date_parts = trans_df['date'].dt
trans_df['month'] = date_parts.month
trans_df['year'] = date_parts.year


#### Edwin's code modified

In [None]:
# Extract start_balance as a Series indexed by customer_id.
# set_index builds a new frame, so cust_df itself is left untouched.
# (Assigning to `.index` on a column pulled straight out of cust_df can leak
# the new index back into the parent frame's cached column on pandas
# versions without copy-on-write.)
c_customer_ids = list(cust_df.loc[:, 'customer_id'].values)
c_start_balance = cust_df.set_index('customer_id')['start_balance']

# Net transaction amount per customer
t_transaction_sum = trans_df.groupby('customer_id')['amount'].aggregate('sum')

# Final balance = start balance + net transactions, aligned on customer_id.
# A customer present on only one side of the addition comes out as NaN.
user_final_balances = c_start_balance.add(t_transaction_sum)

# Convert the final balances to a two-column frame with explicit names
# instead of relying on the implicit `0` / `index` column labels.
bal_df = (
    user_final_balances
    .rename('final_account_total')
    .rename_axis('customer_id')
    .reset_index()
)

# Attach the final balance to every transaction row
merged_df = pd.merge(trans_df, bal_df, on="customer_id", how='left')

In [None]:
# Re-check the customer table after the balance computation
cust_df.head()

In [None]:
# Spot-check the merged transaction rows of a single customer (id 91)
display(merged_df[merged_df['customer_id']==91])

In [None]:
# One row per (year, month, customer, final balance) holding the summed
# amount, deposit and withdrawal of that month
month_keys = ['year', 'month', 'customer_id', 'final_account_total']
flow_cols = ['amount', 'deposit', 'withdrawal']
grouped_df = (
    merged_df
    .groupby(month_keys)
    .agg(dict.fromkeys(flow_cols, 'sum'))
    .reset_index()
)

In [None]:
# Bring each customer's starting balance onto the monthly rows
grouped_df2 = grouped_df.merge(
    cust_df[['customer_id', 'start_balance']],
    how='left',
    on='customer_id',
)

In [None]:
# Build running monthly balances per customer.
# The first month of a customer opens at start_balance; every later month
# opens at the previous month's closing balance. (Previously every month
# after the first opened at 0, so month_end_bal was only that month's net
# flow rather than a balance.)

# The cumulative sums below need chronological rows within each customer;
# a stable sort is a no-op when the monthly aggregation is already ordered.
grouped_df2 = grouped_df2.sort_values(['year', 'month'], kind='stable', ignore_index=True)

# Net movement of the month (same deposit + withdrawal combination as before)
month_net = grouped_df2['deposit'] + grouped_df2['withdrawal']
# Net movement accumulated up to and including each month, per customer
net_to_date = month_net.groupby(grouped_df2['customer_id']).cumsum()
# Net movement accumulated over all earlier months (0 for the first month)
net_before = net_to_date.groupby(grouped_df2['customer_id']).shift(fill_value=0)

grouped_df2['month_start_bal'] = grouped_df2['start_balance'] + net_before
grouped_df2['month_end_bal'] = grouped_df2['start_balance'] + net_to_date

In [None]:
# Preview the monthly table with the balance columns
grouped_df2.head()

In [None]:
# Spot-check the monthly rows of a single customer (id 94)
display(grouped_df2[grouped_df2['customer_id']==94])


### Customer EDA

In [None]:
# Preview the customer table before adding demographic features
cust_df.head()

In [None]:
# customer demographics - add age column
# Age is the number of completed years between dob and the fixed
# reference date 2020-06-01, which is also stored as `last_date`.
analysis_end = datetime.strptime('2020-06-01', "%Y-%m-%d")
cust_df["last_date"] = analysis_end
cust_df["age"] = cust_df["dob"].apply(
    lambda born: relativedelta(analysis_end, born).years
)

#### Age notes:
- Need to be wary of customers that pass away and be sure they aren't included.
- Older customers are generally less likely to change banks, so they should be excluded from the analysis.
- Youngest is 20 at time of end analysis. Are there people who open bank accounts for their teenagers and only deposit infrequently? May want to exclude them as well.
- Can I plot the average (relative) age over time?

In [None]:
# Distribution of customer age as of the reference date
cust_df["age"].describe() # oldest person is 79 and youngest person is 20

In [None]:
# Distribution of start_balance (author's observation kept on the line below)
cust_df["start_balance"].describe() # heavily skewed by one outlier


### Transaction EDA

In [None]:
# plot average account total over time
# avg_acct_total = grouped_data.groupby('year')["account_total"].mean()

In [None]:
# avg_acct_total.head()

In [None]:
# plt.plot()
# plt.xlabel('Time (Years)')
# plt.ylabel('Average Value')
# plt.title('Average Over Time')
# plt.show()

## Economic Data

In [None]:
# Locations of the macro-economic series: GDP, federal funds rate, unemployment rate
gdp_fp = r"C:\Users\GriffenRoweGaddis\OneDrive - Kubrick Group\Desktop\Training\Projects\Bank Churn\data\gdp.csv"
ff_fp = r"C:\Users\GriffenRoweGaddis\OneDrive - Kubrick Group\Desktop\Training\Projects\Bank Churn\data\FEDFUNDS.csv"
unem_fp = r"C:\Users\GriffenRoweGaddis\OneDrive - Kubrick Group\Desktop\Training\Projects\Bank Churn\data\UNRATE.csv"

# Read the three files in the same order as the paths above
gdp_df, fedfunds_df, unem_df = (
    pd.read_csv(csv_path) for csv_path in (gdp_fp, ff_fp, unem_fp)
)

In [None]:
# Preview the GDP series
gdp_df.head()

In [None]:
# Preview the federal funds rate series
fedfunds_df.head()

In [None]:
# Preview the unemployment rate series
unem_df.head()

## Feature Engineering

#### To-do:
1) Join the starting balance to the transactions df.
2) Aggregate by month, cust ID
3) Create new features
4) Create visualizations
5) Create feature to be our response

##### Features to add
- Monthly start balance
  - Have to include the start balance as the first month balance for each customer ID
  - Then the next month will be the monthly end balance for each customer ID
- Monthly end balance
  - Have to add the `amount` column to the monthly start for each customer_ID and month combo
  - Then this will be the monthly start balance for each customer

- Average withdrawal amount
- Average deposit amount
- Average monthly start/end balance
- Binary variable that indicates whether or not they have $0.00 monthly ending balance
  - Avg time to have $0.00 monthly ending balance
  - Number of consecutive months with a $0.00 ending balance per customer ID
- Binary variable that indicates whether number of months having $0.00 is above average
  - Can create another feature that indicates 
- Number of transactions per account (per month?)
  - Cust_df
  - Avg number of transactions per account per month
    - S.d. too
-

##### Visualizations to make
- Histograms for each continuous variable to see distribution
- Total number of accounts per year over time
- Average deposit/withdrawal amount over time
- 

In [None]:
# Check start_balance: histogram restricted to balances of at most 20,000,000
filtered_data = cust_df.loc[cust_df['start_balance'] <= 20000000]

# Draw on the current axes and label it in one call
hist_ax = plt.gca()
hist_ax.hist(filtered_data['start_balance'], bins=100, edgecolor='black')
hist_ax.set(xlabel='Value', ylabel='Frequency', title='Histogram of Start Balance')

# Display the plot
plt.show()


In [None]:
# Distribution of the transaction `amount` column (100 bins)
hist_ax = plt.gca()
hist_ax.hist(trans_df['amount'], bins=100, edgecolor='black')
hist_ax.set(xlabel='Value', ylabel='Frequency', title="Histogram of 'amount'")

# Display the plot
plt.show()

In [None]:
# Distribution of the transaction `deposit` column (30 bins)
hist_ax = plt.gca()
hist_ax.hist(trans_df['deposit'], bins=30, edgecolor='black')
hist_ax.set(xlabel='Value', ylabel='Frequency', title="Histogram of 'deposit'")

# Display the plot
plt.show()

In [None]:
# Distribution of the transaction `withdrawal` column (30 bins)
hist_ax = plt.gca()
hist_ax.hist(trans_df['withdrawal'], bins=30, edgecolor='black')
hist_ax.set(xlabel='Value', ylabel='Frequency', title="Histogram of 'withdrawal'")

# Display the plot
plt.show()

In [None]:
# Number of distinct customers with at least one transaction row in each year
customer_count = trans_df.groupby('year')['customer_id'].nunique()

In [None]:
# Plot distinct customers per year, leaving out the last (latest) year of the index
data_to_plot = customer_count.iloc[:-1]
count_ax = data_to_plot.plot()

count_ax.set_xlabel('Year')
count_ax.set_ylabel('Total number of Customers')
count_ax.set_title('Total Number of Active Customers Over Time')

In [None]:
# Yearly mean deposit and withdrawal amounts, sharing one grouping
by_year = trans_df.groupby('year')
avg_dep = by_year['deposit'].mean()
avg_with = by_year['withdrawal'].mean()

In [None]:
# Side-by-side line plots: yearly mean deposit (left), yearly mean withdrawal (right)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

ax1.plot(avg_dep)
ax2.plot(avg_with)

ax1.set_title("Average deposit amount over time")
ax2.set_title("Average withdrawal amount over time")

### First attempt at a model:
- y = risk_churn
- features = age, state dummies, start balance

### Response variable idea:
- If average number of monthly transactions is below avg(monthly_transaction_count) - 2 * sd(monthly_transaction_count)
- AND if ending monthly balance is below a threshold for a specific amount of time.

In [None]:
# Review the customer table before preparing the modelling inputs
display(cust_df)


In [None]:
# Inspect the largest start balance, then drop every customer whose
# start balance is at or above 1,000,000
print(cust_df["start_balance"].max())
outliers = cust_df.loc[cust_df['start_balance'] >= 1000000.0]
cust_df = cust_df.drop(index=outliers.index)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# use k-means to cluster customers based upon number of transactions
selected_features = ['customer_id', 'num_trans', 'start_balance']
# Work on an independent copy so the column assignments below neither warn
# (SettingWithCopyWarning) nor write through to cust_df, and leave out
# customers without a transaction count because KMeans.fit rejects NaN.
clust_df = cust_df.loc[cust_df['num_trans'].notna(), selected_features].copy()

# Standardize the transaction count column (zero mean, unit variance)
scaler = StandardScaler()
clust_df['num_trans'] = scaler.fit_transform(clust_df[['num_trans']]).ravel()

# Determine the number of clusters
num_clusters = 6

# n_init is pinned explicitly so the result does not shift with
# scikit-learn's version-dependent default; random_state fixes the seed.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(clust_df[['num_trans']])

# Assign cluster labels to each customer
clust_df['cluster_label'] = kmeans.labels_

# Cluster sizes and mean standardized transaction count per cluster
cluster_counts = clust_df['cluster_label'].value_counts()
cluster_means = clust_df.groupby('cluster_label')['num_trans'].mean()

# visualize the clusters: only num_trans drives the clustering,
# start_balance is used purely as the x-axis
plt.figure(figsize=(8, 6))
plt.scatter(clust_df['start_balance'], clust_df['num_trans'], c=clust_df['cluster_label'], cmap='viridis')
plt.xlabel('Start Balance')
plt.ylabel('Transaction Count (Standardized)')
plt.title('K-means Clustering of Customers by Transaction Count')
plt.show()

### Features to include in model: