# Clustering Demo

In [None]:
#import the necessary libraries
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import InterclusterDistance
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.model_selection import FeatureImportances

from zipfile import ZipFile

import seaborn as sns

# Notebook-wide display settings: ggplot look for every matplotlib figure,
# and no cap on the number of DataFrame columns pandas will render.
plt.style.use('ggplot')
pd.options.display.max_columns = None

print("Everything was loaded correctly")

In [None]:
# Ask for the archive password.
# getpass() is used instead of input() so the password is not echoed into the
# cell output, where it would be stored with the notebook when it is saved.
from getpass import getpass

str_pwd = getpass('Enter the password to access the data:  ')

In [None]:
# Unpack the password-protected archive into the current working directory.
# ZipFile expects the password as bytes, so the typed string is UTF-8 encoded.
with ZipFile('clustering_file.zip') as zf:
    zf.extractall(pwd=str_pwd.encode('utf-8'))

In [None]:
# Load the CSV into a DataFrame (presumably the file unpacked from
# clustering_file.zip by the extraction cell -- confirm the archive contents).
df_data = pd.read_csv('clustering_file.csv')

# Keep the column names; they are passed as feature labels to the
# feature-importance visualizations later in the notebook.
lst_features = df_data.columns.to_list()
print("data file was loaded correctly")

In [None]:
# Preview the first five rows of the loaded data
df_data.head(n=5)

# How many clusters should we use?

In [None]:
# Largest number of clusters to evaluate in the elbow plot
max_clusters = 10

# Standardize every column with StandardScaler before clustering so that all
# features are on a comparable scale for the distance-based k-means fit.
X = np.array(df_data)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# A (start, stop) tuple for `k` is expanded like np.arange(start, stop), i.e.
# the stop value is exclusive. Add 1 so that `max_clusters` itself is evaluated.
# A fixed random_state makes the elbow curve reproducible across re-runs.
visualizer = KElbowVisualizer(
    KMeans(random_state=0),
    k=(1, max_clusters + 1),
    size=(900, 600),
)
visualizer.fit(X_scaled)  # Fit the data to the visualizer
visualizer.show()

# Run Clustering

In [None]:
# Prompt for the number of clusters used by the clustering cells below.
# int() raises ValueError if the answer is not a whole number.
int_clusters = int(
    input('How many clusters do you want?  ')
)

In [None]:
# Fit k-means with a fixed seed and assign every row to a cluster
model = KMeans(n_clusters=int_clusters, init="k-means++", max_iter=300, n_init=10, random_state=0)
y_clusters = model.fit_predict(X_scaled)

# Show how many rows landed in each cluster.
# The labels are passed explicitly as `x=`: newer seaborn releases bind the
# first positional argument to `data`, which no longer counts per label.
ax = sns.countplot(x=y_clusters)
for p in ax.patches:
    # Label each bar with its count as a whole number, centred on the bar and
    # lifted a few points above it, so placement does not depend on bar width
    # or on the scale of the y axis.
    ax.annotate(
        '{:.0f}'.format(p.get_height()),
        (p.get_x() + p.get_width() / 2, p.get_height()),
        ha='center',
        va='bottom',
        xytext=(0, 3),
        textcoords='offset points',
    )

# View clusters

In [None]:
# Instantiate the clustering model and visualizer.
# Use the same k-means settings (including random_state) as the model that
# produced y_clusters, so the silhouette plot describes that same clustering
# and is reproducible across re-runs.
model = KMeans(n_clusters=int_clusters, init="k-means++", max_iter=300, n_init=10, random_state=0)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick', size=(720, 480))

visualizer.fit(X_scaled)  # Fit the data to the visualizer
visualizer.show()         # Finalize and render the figure

In [None]:
# Instantiate the clustering model and visualizer.
# Use the same k-means settings (including random_state) as the model that
# produced y_clusters, so the distance map describes that same clustering
# and is reproducible across re-runs.
model = KMeans(n_clusters=int_clusters, init="k-means++", max_iter=300, n_init=10, random_state=0)

visualizer = InterclusterDistance(model, size=(600, 600))
visualizer.fit(X_scaled)  # Fit the data to the visualizer
visualizer.show()         # Finalize and render the figure

# Which features are important to the clusters?

In [None]:
# Train a small, seeded random forest to predict the cluster label from the
# scaled features, then plot the 20 highest-ranked feature importances.
model = RandomForestClassifier(random_state=14, n_estimators=10)
viz = FeatureImportances(
    model,
    size=(720, 1080),
    topn=20,
    labels=lst_features,
)
viz.fit(X_scaled, y_clusters)
viz.show()

In [None]:
# Fit a logistic regression that predicts the cluster label from the scaled
# features and plot its top 30 features, stacked and on an absolute
# (relative=False) scale.
model = LogisticRegression(solver="liblinear", multi_class="auto")
viz = FeatureImportances(
    model,
    labels=lst_features,
    topn=30,
    relative=False,
    stack=True,
    size=(720, 1080),
)
viz.fit(X_scaled, y_clusters)
viz.show()