We first use clustering to see if we can find any patterns among customers in our data. This step could and probably should have been performed in the EDA phase of this project, but we include it here because we are performing all Machine Learning techniques together.

In [None]:
# Display the feature matrix (data.values) that the clustering cells below fit on.
data.values

In [None]:
# Show the dimensions of the feature matrix.
data.values.shape

In [None]:
from sklearn.cluster import KMeans

# Baseline K-means fit on the raw, unscaled features using scikit-learn's
# default number of clusters. A fixed random_state keeps the centroid
# initialisation reproducible, matching the other KMeans fits in this notebook.
model = KMeans(random_state=42)

model.fit(data.values)

In [None]:
# Per-column standard deviations, read from the 'std' row of the summary table,
# to compare the scale of the features.
data.describe().loc['std']

Problem: The standard deviations (and hence the variances) of our features are very different. Because K-means is based on Euclidean distances, we need to standardize the input features to avoid giving more importance to features with large variances (that is, the LIMIT_BAL, BILL_AMT, and PAY_AMT variables, which all represent dollar amounts).

### Choosing K: The Elbow Sum-of-Squares Method

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Candidate numbers of clusters to evaluate: 1 through 20 inclusive.
max_k = 20
ks = range(1, max_k + 1)


In [None]:
## Commented out because it takes a long time to run. The loop fits a
## StandardScaler + KMeans pipeline for every k in ks and saves each inertia to
## list_of_inertias.txt, which the next cell reads back in.
# inertias = []

# for k in ks:
#     scaler = StandardScaler()
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     pipeline = make_pipeline(scaler, kmeans)
#     pipeline.fit(data.values)
#     inertias.append(pipeline.named_steps['kmeans'].inertia_)

# with open ('list_of_inertias.txt', 'w') as f:
#     for inertia in inertias:
#         f.write("%f\n" % inertia)

In [None]:
# Load the cached inertia values (one float per line) from the text file.
with open('list_of_inertias.txt') as f:
    inertias = [float(line.strip()) for line in f]

In [None]:
# Elbow plot: inertia for each candidate K, drawn as circle markers joined by a
# solid line.
plt.plot(ks, inertias, marker='o', linestyle='-')
plt.title('Inertia by K Value with Scaling Beforehand')
plt.xlabel('K Values')
plt.ylabel('Inertia Values')
plt.show()

So it looks like 3 or 4 clusters is best.

In [None]:
# Change in inertia each time one more cluster is added, keyed by a readable
# label and formatted with thousands separators.
declines = {
    "Difference Between {} clusters and {} clusters".format(k + 1, k):
        "{:,}".format(current - previous)
    for k, (previous, current) in enumerate(zip(inertias, inertias[1:]), start=1)
}
declines


The declines above confirm that 3 clusters is optimal.

In [None]:
# Final model: standardize every feature, then run K-means with the 3 clusters
# chosen from the elbow analysis. random_state is fixed for reproducibility.
kmeans = KMeans(random_state=42, n_clusters=3)
scaler = StandardScaler()
pipeline = make_pipeline(scaler, kmeans)
pipeline.fit(data.values)

In [None]:
# Cluster assignment for each row, taken from the fitted KMeans step of the pipeline.
fitted_kmeans = pipeline.named_steps['kmeans']
labels = fitted_kmeans.labels_

In [None]:
# Tabulate how many rows were assigned to each cluster: one (label, count) pair
# per printed row.
unique, counts = np.unique(labels, return_counts=True)
print(np.column_stack((unique, counts)))

So most of the values fall into group 1.

In [None]:
# Bar chart of cluster sizes: one bar per cluster label, with the bar height
# giving the number of rows assigned to that cluster.
plt.bar(unique, counts, tick_label=unique)
plt.xlabel("Cluster #")
plt.ylabel("Number of Points")
plt.title("Number of Points per Cluster")
plt.show()

### Choosing K: The Silhouette Method

###### Did not run the Silhouette Method because it takes too long

In [None]:
# NOTE(review): this cell stays commented out because the silhouette computation
# takes too long to run. The commented code has been corrected so that it works
# if re-enabled: clustering and scoring both use the standardized features
# (calling fit_predict on the bare 'kmeans' step skipped the scaler), the save
# loop writes its own loop variable, and the plot uses range_n_clusters for x.

# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score

# import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# import numpy as np
# import time

# silhouette_scores = []

# range_n_clusters = range(2,21)

# start_total = time.time()
# for n_clusters in range_n_clusters:
#     loop_start = time.time()

#     scaler = StandardScaler()
#     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
#     pipeline = make_pipeline(scaler, kmeans)
#     # Fit the whole pipeline so K-means clusters the standardized features.
#     cluster_labels = pipeline.fit_predict(data.values)
#     scaled_values = pipeline.named_steps['standardscaler'].transform(data.values)

#     # The silhouette_score gives the average value for all the samples.
#     # This gives a perspective into the density and separation of the formed
#     # clusters. It is computed in the same (scaled) space used for clustering.
#     silhouette_avg = silhouette_score(scaled_values, cluster_labels)
#     silhouette_scores.append(silhouette_avg)
#     print("For n_clusters =", n_clusters,
#           "The average silhouette_score is :", silhouette_avg)
#     loop_end = time.time()
#     print("For n_clusters =", n_clusters, "The loop ran for {0} seconds ".format(loop_end - loop_start))


# end_total = time.time()

# total = end_total - start_total
# print("Total time: ", total)


# with open ('list_of_silhouettes.txt', 'w') as f:
#     for silhouette in silhouette_scores:
#         f.write("%f\n" % silhouette)

# with open('list_of_silhouettes.txt') as f:
#     silhouettes = f.readlines()
# silhouettes = [x.strip() for x in silhouettes]
# silhouettes = list(map(float, silhouettes))


# plt.plot(range_n_clusters, silhouettes, 'o-')
# plt.xlabel('n_clusters')
# plt.ylabel('Silhouette Scores')
# plt.title('Silhouette Scores by n_clusters')
# plt.show()

### Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy  import linkage, dendrogram

In [None]:
# mergings = linkage(data.values, method='complete')

In [None]:
# NOTE(review): labels is now passed by keyword; the second positional parameter
# of scipy's dendrogram is p, not labels. Confirm that the K-means labels are the
# intended leaf labels before re-enabling.
# dendrogram(mergings,
#            labels=labels)

In [None]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=2)

In [None]:
# pca.fit(data.values)

In [None]:
# transformed = pca.transform(data.values)

In [None]:
# transformed.shape

In [None]:
# NOTE(review): x_cols is not defined in the visible part of this notebook; the
# PCA cells above use data.values. Confirm which input is intended before
# re-enabling.
# model = KMeans(n_clusters=10, random_state=10)
# model.fit(x_cols)

In [None]:
# plt.scatter(transformed[:,0], transformed[:,1], s=50, c=model.labels_)
# plt.colorbar()
# plt.show()

Questions About Clustering:

1. How should I go about clustering since I have no idea about how many groupings there are in advance?
2. Should I group by only a subset of features?
3. How should I properly scale the features for the various clustering techniques?
