# Working with Sample Data

### Import the packages and data 

In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = [12, 12]
pd.set_option("precision", 5)

import pyarrow.parquet as pq


In [2]:
table = pq.read_table('Data/cleaned_data.parquet')
data = table.to_pandas()

In [3]:
data = data[0:1000]

## Working with K-means

### Loading the data 

In [4]:
data.dtypes

Date                   int64
Facility              object
Payer                 object
SpecificService       object
CategoryofService     object
Sex                   object
UniqueID              object
Age                  float64
Age_Group             object
dtype: object

In [5]:
data = data.drop(columns="UniqueID")

In [27]:
numerical_data = data.select_dtypes(np.number)
categorical_data = data.select_dtypes([object, "category"])

In [28]:
from sklearn.preprocessing import MinMaxScaler

numerical_data_normalized = MinMaxScaler().fit_transform(numerical_data)
numerical_data_normalized = pd.DataFrame(
                    numerical_data_normalized,
                    columns=numerical_data.columns) 

In [29]:
categorical_data_codified = pd.get_dummies(
                                    categorical_data, 
                                    drop_first=True
)

In [30]:
data_processed = pd.concat([
                            numerical_data_normalized,
                            categorical_data_codified
                        ], axis=1
)
data_processed.head(5)

Unnamed: 0,Date,Age,Facility_CCB,Facility_CCC,Facility_CCS,Facility_CCSJM,Facility_CCTV,Facility_HCD,Facility_HCIS,Facility_HCP,...,CategoryofService_MEDICINA NUCLEAR,CategoryofService_Missing,CategoryofService_PATOLOGIA CLINICA,CategoryofService_SERVIÇOS E TÉCNICAS GERAIS,CategoryofService_URGÊNCIAS,Sex_Masculino,Age_Group_Child,Age_Group_Elderly,Age_Group_Senior,Age_Group_YoungAdult
0,0.0,0.30882,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.30882,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.30882,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.30882,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.30882,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
data_processed.shape

(1169, 131)

### K-means

In [33]:
from sklearn.cluster import KMeans

In [34]:
data_processed=data_processed.dropna() #drop instead

In [36]:
data_processed.isnull().values.any()

False

In [37]:
estimator_kmeans = KMeans(random_state=42, n_clusters=3)
estimator_kmeans.fit(data_processed)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

In [52]:
cluster_labels = estimator_kmeans.labels_
cluster_labels

array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
#seperate the data into test and train and use train all the way and use ".predict" for the test

In [53]:
centroids = estimator_kmeans.cluster_centers_

In [54]:
centroids.shape

(3, 131)

In [55]:
estimator_kmeans.inertia_

2536.095940923268

In [56]:
from sklearn.metrics import euclidean_distances

In [57]:
centroid_distances = euclidean_distances(centroids)

In [58]:
centroid_distances.shape

(3, 3)

In [59]:
estimator_kmeans.labels_.shape

(831,)

In [48]:
data["Cluster"] = cluster_labels

ValueError: Length of values does not match length of index

In [23]:
data.head()

Unnamed: 0,Date,Facility,Payer,SpecificService,CategoryofService,Sex,Age,Age_Group
0,2017,HCIS,ADSE,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,Feminino,37.0,YoungAdult
1,2017,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,EXAMES ESPECIAIS,Feminino,37.0,YoungAdult
2,2017,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,SERVIÇOS E TÉCNICAS GERAIS,Feminino,37.0,YoungAdult
3,2017,CCTV,ADSE,SERVIÇOS ESPECIAIS UROLOGIA,EXAMES ESPECIAIS,Feminino,37.0,YoungAdult
4,2017,CCSJM,PARTICULARES,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,Feminino,37.0,YoungAdult


In [None]:
def cluster_summary(cluster_id):
    cluster = data[Cluster==cluster_id]
    cluster_summary = cluster[categorical_data.columns].mode().to_dict(orient="records")[0]
    cluster_summary.update(cluster.mean().to_dict())
    cluster_summary["cluster_id"] = cluster_id
    return cluster_summary

In [None]:
cluster = data[data.Cluster==2]
cluster[categorical_data.columns].mode().to_dict(orient="records")

In [None]:
cluster_summary(1)

In [None]:
# Distance between centroid of cluster 0 and 2 
centroid_distances[0,2]

In [None]:
def cluster_comparison(*cluster_ids):
    summaries = []
    for cluster_id in cluster_ids:
        summaries.append(cluster_summary(cluster_id))
    return pd.DataFrame(summaries).set_index("cluster_id").T

In [None]:
cluster_comparison(0,1,2)

In [None]:
def kmeans_cluster(df, n_clusters=3):
    model = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = model.fit_predict(df)
    cluster_results = df.copy()
    cluster_results['Cluster'] = clusters
    return cluster_results

def graph_summary_clustering(results):
    cluster_size = results.groupby(['Cluster']).size().reset_index()
    cluster_size.columns = ['Cluster', 'Count']
    cluster_means = results.groupby(['Cluster'], as_index=False).mean()
    cluster_summary = pd.merge(cluster_size, cluster_means, on='Cluster')
    cluster_summary = cluster_summary.drop(["Count"], axis=1).set_index("Cluster")
    return cluster_summary[sorted(cluster_summary.columns)]

In [None]:
cluster_results = kmeans_cluster(data_processed, 3)
cluster_summary = graph_summary_clustering(cluster_results);
cluster_summary

In [None]:
matplotlib.rcParams['figure.figsize'] = [14, 14]
import seaborn as sns
sns.heatmap(cluster_summary.transpose(), annot=True);

### Ideal number of clusters?

In [None]:
from scipy.spatial.distance import cdist

In [None]:
total_variance = cdist(
    XA=data_processed, 
    XB=np.array([data_processed.mean()]))

In [None]:
total_variance_sum = total_variance.sum()

In [None]:
#functions to calculate the Within-Cluster sum of squares (WSS)
def cluster_variance(cluster_id, cluster_centroid, cluster_labels):
    cluster_elements = data_processed[cluster_labels==cluster_id]
    return cdist(XA=cluster_elements, XB=np.array([cluster_centroid])).sum()

#explained variance (defined as the reduction of variance compared to the total variance,...
# ... which would be the variance where k=1).
def variance_measure(estimator_kmeans, total_variance):
    clusters_labels = estimator_kmeans.labels_
    wss = 0
    for i, cluster_id in enumerate(np.unique(clusters_labels)):
        cluster_centroid = estimator_kmeans.cluster_centers_[i]
        wss += cluster_variance(cluster_id, cluster_centroid, clusters_labels)
    return (total_variance-wss) / total_variance

In [None]:
# Function that returns the inertia.
def inertia_measure(estimator_kmeans):
    return estimator_kmeans.inertia_

In [None]:
# Create a function that evaluates the intertia and percentage variance explained for a value of K
def evaluate_k_kmeans(k, scoring, **kwargs):
    scoring_measure = None
    if scoring=="inertia":
        scoring_measure = inertia_measure
    elif scoring=="variance":
        scoring_measure = variance_measure
        
    estimator_kmeans = KMeans(random_state=42, n_clusters=k)
    estimator_kmeans.fit(data_processed)
    return scoring_measure(estimator_kmeans, **kwargs)

In [None]:
results_k = {}
range_k = [1,2,3,4,5,6,7,8,9] #range_k gives the number of clusters to evaluate their inertia and variance
for k in range_k:
    results_k[k] = (
        evaluate_k_kmeans(k, "inertia"), 
        evaluate_k_kmeans(k, "variance", total_variance=total_variance_sum), 
    )

In [None]:
fig, ax1 = plt.subplots()

ax1.plot(
    [c[0] for c in results_k.items()],
    [c[1][0] for c in results_k.items()], label="inertia", color="red")
ax1.set_ylabel('inertia', color="red")


ax2 = ax1.twinx()
ax2.plot(
    [c[0] for c in results_k.items()],
    [c[1][1] for c in results_k.items()], label="percentage variance explained", color="blue")
ax2.set_ylabel('percentage variance', color='blue')

plt.xlabel("K")
plt.legend()
plt.title("Percentage variance / Inertia by. K");

In [None]:
sse = {}
for k in range(1, 10):
   kmeans = KMeans(n_clusters=k).fit(data_processed)
   #print(data["clusters"])
   sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

## Change the number of clusters 

In [None]:
clusterer = KMeans(n_clusters=6)
clusterer.fit(data_processed)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import silhouette_score

In [None]:
def silhouette_score_cv(clusterer, X, y=None):
    cluster_labels = clusterer.fit_predict(X)
    return silhouette_score(X, cluster_labels)

In [None]:
cross_val_score(clusterer, data_processed, y=None, 
                scoring=silhouette_score_cv, cv=3).mean() #a small value means it is well matched

In [None]:
pd.Series(clusterer.labels_).value_counts()

In [None]:
new_clusters = clusterer.labels_
data["Cluster"] = new_clusters
data.head()