In [None]:
##################################################################
# Cluster the applicants
##################################################################
# K-Means clustering, an unsupervised machine learning algorithm,
# is used to identify clusters of lending patterns.

# Only the conventional conforming loan data are considered.
# The variables of interest for describing lending patterns are:

# numeric:
# ---------
# Applicant_Income_000
# FFIEC_Median_Family_Income
# Loan_Amount_000
# Number_of_Owner_Occupied_Units
# Tract_to_MSA_MD_Income_Pct
# Conforming_Limit_000
# Loan_Amount_Bucket
# Loan_to_income

# categorical:
# ------------
# Loan_Purpose_Description
# Lien_Status_Description

In [None]:
from sklearn.cluster import KMeans
# Keep only the variables of interest: the numeric and categorical lending
# variables, plus State and As_of_Year.
model_columns = [
    'Applicant_Income_000', 'FFIEC_Median_Family_Income', 'Loan_Amount_000',
    'Number_of_Owner_Occupied_Units', 'Tract_to_MSA_MD_Income_Pct',
    'Conforming_Limit_000', 'Loan_Amount_Bucket', 'Loan_to_income',
    'Loan_Purpose_Description', 'Lien_Status_Description',
    'State', 'As_of_Year',
]
df_model = df_hmda_conv_conf.loc[:, model_columns]
df_model.info()
# Share of missing values per column.
# isnull().mean() divides each column's null count by the frame's own row
# count, so the share stays correct if the upstream data changes size
# (the previous version divided by a hard-coded total of 879453).
sr = df_model.isnull().mean()
# print() is needed here: only the final expression of a notebook cell is
# rendered automatically, and this statement sits in the middle of the cell.
print(sr.sort_values(ascending=False))
# Discard every row that has a missing value in any of the selected columns.
df_model1 = df_model.dropna(axis=0, how='any')
df_model1.info()
# 868848 entries
# Convert the two categorical variables into dummy (indicator) columns.
# drop_first=True omits the first level of each variable, so a variable with
# k levels yields k-1 columns.
a1 = pd.get_dummies(df_model1['Loan_Purpose_Description'], prefix='Loan_Purpose', drop_first=True)
a2 = pd.get_dummies(df_model1['Lien_Status_Description'], prefix='Lien', drop_first=True)
# Rebind rather than drop(..., inplace=True): df_model1 ends up in the same
# state (categorical columns removed) without mutating a derived frame in
# place, which is a common trigger for SettingWithCopyWarning.
df_model1 = df_model1.drop(columns=['Loan_Purpose_Description', 'Lien_Status_Description'])
# Column order: remaining original columns, then loan-purpose dummies, then lien dummies.
df_model2 = pd.concat([df_model1, a1, a2], axis=1)
# Preview; rendered only when this is the last expression of its cell.
df_model2.head()
# collinearity
# df_model still contains the non-encoded categorical columns (the two
# *_Description columns and State, presumably string-valued). On
# pandas >= 2.0 DataFrame.corr() no longer skips such columns silently and
# raises instead, so restrict the matrix to numeric columns explicitly.
df_model.select_dtypes(include='number').corr()

# Check to see correlation coefficient absolute value > 0.7
# FFIEC_Median_Family_Income, Conforming_Limit_000 0.93
# Loan_Amount_000, Loan_Amount_Bucket  0.98

In [None]:
# Strongly correlated pairs (|r| > 0.7) from the correlation check:
#   FFIEC_Median_Family_Income ~ Conforming_Limit_000   0.93
#   Loan_Amount_000            ~ Loan_Amount_Bucket     0.98
# One variable of each pair is removed:
# FFIEC_Median_Family_Income and Loan_Amount_000.

collinear_columns = ['FFIEC_Median_Family_Income', 'Loan_Amount_000']
df_model3 = df_model2.drop(columns=collinear_columns)
df_model3.info()

In [None]:
# Standardize the Variables
# To eliminate the effect of scale of the variables.
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the model features. State and As_of_Year are left out of the
# scaled feature matrix.
feature_frame = df_model3.drop(columns=['State', 'As_of_Year'])

scaler = StandardScaler()
# fit_transform learns the per-column scaling and applies it in one pass.
scaled_features = scaler.fit_transform(feature_frame)

# Take column names and row index from the frame that was actually scaled.
# A hand-typed name list silently goes stale if the dummy column names change
# with the data, and a fresh 0..n-1 index would no longer line up with
# df_model3, whose index has gaps after dropna().
df_feat = pd.DataFrame(scaled_features,
                       columns=feature_frame.columns,
                       index=feature_frame.index)
df_feat.head()

In [None]:
# Model
from sklearn.cluster import KMeans

In [None]:
# 
# Two clusters
#
# random_state is fixed so the fit is reproducible. Without it the centroid
# initialisation, and therefore which cluster receives id 0 or 1, can differ
# between runs, which would make the 'Pattern 1'/'Pattern 2' names derived
# from those ids unstable.
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(df_feat)

In [None]:
# Cluster id of every row in the scaled feature matrix, from the fitted model.
pred = kmeans.predict(X=df_feat)

In [None]:
# Name each row's 2-cluster assignment: cluster id 1 becomes 'Pattern 1',
# any other id becomes 'Pattern 2'.
df_model3['Cluster2'] = ['Pattern 1' if label == 1 else 'Pattern 2' for label in pred]

In [None]:
# Fraction of rows assigned to each of the two patterns.
cluster2_labels = df_model3['Cluster2']
cluster2_labels.value_counts(normalize=True)

In [None]:
# Preview the first five rows, including the new Cluster2 column.
df_model3.head(5)

In [None]:
# visualization by state
# Row counts per (State, Cluster2 pattern) pair.
rows_by_state = df_model3.groupby(by=['State'])
clusterdf = rows_by_state['Cluster2'].value_counts()
# Pivot the pattern level into columns (one row per state) and preview
# the first ten states.
cluster2_by_state = clusterdf.unstack(level=-1)
cluster2_by_state.head(10)

In [None]:
# Stacked bar chart: one bar per state, split by 2-cluster pattern.
pattern_counts_by_state = clusterdf.unstack()
pattern_counts_by_state.plot.bar(
    stacked=True,
    figsize=(4, 4),
    subplots=False,
    title='Lending Patterns of 2 Clusters',
)

In [None]:
# 
# Three clusters
#
# random_state is fixed so the fit is reproducible. Without it the cluster
# ids (0, 1, 2) can be permuted between runs, which would change which rows
# end up named 'Pattern 1', 'Pattern 2' and 'Pattern 3'.
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(df_feat)
pred = kmeans.predict(df_feat)

# Cluster id -> display name (0 -> Pattern 1, 1 -> Pattern 2, 2 -> Pattern 3).
cluster3_names = {0: 'Pattern 1', 1: 'Pattern 2', 2: 'Pattern 3'}
df_model3['Cluster3'] = pred
df_model3['Cluster3'] = df_model3['Cluster3'].map(cluster3_names)

# Fraction of rows assigned to each of the three patterns.
df_model3['Cluster3'].value_counts(normalize=True)

In [None]:
# visualization by state
# Row counts per (State, Cluster3 pattern) pair.
clusterdf = df_model3.groupby(by=['State'])['Cluster3'].value_counts()
# Pivot the pattern level into columns once and reuse the table for both the
# preview and the chart.
cluster3_by_state = clusterdf.unstack(level=-1)
# Preview; rendered only when this is the last expression of its cell.
cluster3_by_state.head(10)
# Stacked bar chart: one bar per state, split by 3-cluster pattern.
# The title previously read "2 Clusters", copied from the two-cluster chart.
cluster3_by_state.plot(kind='bar', stacked=True, figsize=(4, 4), subplots=False,
                       title='Lending Patterns of 3 Clusters')