# Unsupervised Learning
Name: Koh June Wen

Admin Number: 2112956

Class: DAAAFT2A04

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Load the employee dataset from a CSV file in the working directory.
df = pd.read_csv("Company_Employee.csv")
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

## Exploratory Data Analysis

In [None]:
# Per-column non-null counts restricted to rows whose 'Resign Status' is 'Yes'.
df[df['Resign Status'] == 'Yes'].count()

In [None]:
# One count plot per categorical column on a 2x3 grid; the columns fill the
# grid row by row, so the last (bottom-right) panel stays empty.
fig, ax = plt.subplots(2,3, figsize=(20, 10))
count_columns = ['Gender', 'BusinessTravel', 'Job Function', 'MaritalStatus', 'Resign Status']
for axis, column in zip(ax.flat, count_columns):
    sns.countplot(data=df, x=column, ax=axis)

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only. The dataset
# also holds string-valued categorical columns, which cannot be correlated and
# make a plain ``df.corr()`` raise on pandas >= 2.0, so they are filtered out
# explicitly (this works on every pandas version).
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 10))

cmap = sns.diverging_palette(230, 20, as_cmap=True)

# ``linewidths`` is the documented seaborn keyword for the width of the lines
# separating the cells.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5})

## shows that there is little or no correlation between the features of the dataset

In [None]:
# Salary against age, with the points coloured by job function.
sns.scatterplot(data=df, x='Salary ($)', y='Age', hue='Job Function')

In [None]:
# Histogram of the length of service over the whole dataset.
plt.figure(figsize=(10,10))
sns.histplot(data=df, x='Length of Service (Years)')

## Feature Engineering

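1. Use `OrdinalEncoder`. You transform a categorical feature into just one column of integer codes. The problem is that the codes imply equal spacing between consecutive categories, whereas the real difference between two categories may differ from the difference between another pair, so the encoded distances may not be accurate.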

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

### Ordinal Encoding
Some of the features are ordinal in nature but are stored as categorical strings.\
E.g. the BusinessTravel feature has the values 'Non-Travel', 'Travel_Rarely' and 'Travel_Frequently';
the difference between 'Non-Travel' and 'Travel_Frequently' is evidently greater than the difference between 'Non-Travel' and 'Travel_Rarely'.

In [None]:
# Explicit ranking of the travel categories, least to most frequent; the
# encoder assigns each category its position in this list (0, 1, 2).
travel_order = ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']
oe = OrdinalEncoder(categories=[travel_order])

# Fit on the single BusinessTravel column and keep the encoded values.
business_travel_column = df[['BusinessTravel']]
oe_businesstravel = oe.fit_transform(business_travel_column)

In [None]:
# Same ranking as the plain ordinal encoding, but with an empty-string
# placeholder category in third position so that 'Travel_Frequently' receives
# code 3 instead of 2, widening its gap to 'Travel_Rarely'.
weighted_travel_order = ['Non-Travel', 'Travel_Rarely', '', 'Travel_Frequently']
oe_weighted = OrdinalEncoder(categories=[weighted_travel_order])
oe_weighted_businesstravel = oe_weighted.fit_transform(df[['BusinessTravel']])
oe_weighted_businesstravel

### Gower Dissimilarity

Gower Distance is a distance measure that can be used to calculate the distance between two entities whose attributes are a mix of categorical and numerical values. The distance is always a number between 0 (identical) and 1 (maximally dissimilar).

`Quantitative (interval)`: range-normalized (Manhattan distance)\
`Ordinal`: variable is first ranked, then Manhattan distance is used with a special adjustment for ties\
`Nominal`: variable of <i>k</i> categories are first converted into <i>k</i> binary columns and then the Dice coefficient is used

Gower Dissimilarity is non-Euclidean and non-metric. However, Gower Dissimilarity is actually Euclidean distance when no specially processed ordinal variables are used

As we are using a distance that is not obeying the Euclidean geometry, methods based on Euclidean distance must not be used e.g. K-means, Ward, etc.

This method is mainly used to deal with the many categorical variables in the dataset. The sample space for categorical data is discrete, and doesn't have a natural origin. A Euclidean distance function on such a space isn't really meaningful. As someone put it, "The fact that a snake possesses neither wheels nor legs allows us to say nothing about the relative value of wheels and legs."

In [None]:
import gower

In [None]:
# Pairwise Gower dissimilarity matrix computed over the raw dataset.
gower_df = gower.gower_matrix(df)

In [None]:
# Gower matrix restricted to the rows whose 'Resign Status' is 'Yes'.
gower_resign_df = gower.gower_matrix(df[df['Resign Status'] == 'Yes'])

In [None]:
# Copy of the dataset with BusinessTravel replaced by its ordinal codes.
oe_df = df.copy()
oe_df['BusinessTravel'] = oe_businesstravel

# Gower matrix over the ordinal-encoded copy.
gower_oe_df = gower.gower_matrix(oe_df)

In [None]:
# Copy of the dataset with BusinessTravel replaced by its weighted ordinal codes.
oe_weighted_df = df.copy()
oe_weighted_df['BusinessTravel'] = oe_weighted_businesstravel

# Gower matrix over the weighted-ordinal copy; used by the clustering cells.
gower_oe_weighted_df = gower.gower_matrix(oe_weighted_df)

In [None]:
# Rebuild the weighted-ordinal copy of the dataset, split it by gender and by
# resignation status, then compute one Gower matrix per subset.
oe_weighted_df = df.copy()
oe_weighted_df['BusinessTravel'] = oe_weighted_businesstravel

is_male = oe_weighted_df['Gender'] == 'Male'
is_female = oe_weighted_df['Gender'] == 'Female'
has_resigned = oe_weighted_df['Resign Status'] == 'Yes'
has_stayed = oe_weighted_df['Resign Status'] == 'No'

oe_weighted_male_df = oe_weighted_df[is_male]
oe_weighted_female_df = oe_weighted_df[is_female]
oe_weighted_resigned_df = oe_weighted_df[has_resigned]
oe_weighted_notresigned_df = oe_weighted_df[has_stayed]

gower_oe_weighted_male_df = gower.gower_matrix(oe_weighted_male_df)
gower_oe_weighted_female_df = gower.gower_matrix(oe_weighted_female_df)
gower_oe_weighted_resigned_df = gower.gower_matrix(oe_weighted_resigned_df)
gower_oe_weighted_notresigned_df = gower.gower_matrix(oe_weighted_notresigned_df)

### Scaling only numeric features

In [None]:
# Standardise the four continuous features, using only the rows of employees
# who resigned ('Resign Status' == 'Yes').
continuous_columns = ['Age', 'Distance Between Company and Home (KM)', 'Salary ($)', 'Length of Service (Years)']
resigned_rows = df['Resign Status'] == 'Yes'

scaler = StandardScaler()
scaled_num = scaler.fit_transform(df.loc[resigned_rows, continuous_columns])
scaled_num_df = pd.DataFrame(scaled_num, columns=continuous_columns)
scaled_num_df

### OHE

You transform a categorical feature into one new column per category (e.g. four new columns for four categories), where exactly one column is 1 and the others are 0. The problem here is that the difference between any two categories is the same as the difference between any other two categories.

### Dummying the categoric features

In [None]:
## One-hot encode the categorical features
# ``drop='first'`` removes the first category of every feature, so a feature
# with k categories yields k-1 indicator columns.
categories = ['Gender', 'BusinessTravel', 'Job Function', 'MaritalStatus', 'Resign Status']

ohc = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')
c_transformed_df = ohc.fit_transform(df[categories])
dummied_categories_df = pd.DataFrame(c_transformed_df, columns=ohc.get_feature_names_out())

# Indicator columns first, followed by every column that was not encoded.
untouched_columns_df = df.drop(columns=categories)
new_df = pd.concat([dummied_categories_df, untouched_columns_df], axis=1)

## Standardise every column of the combined frame
scaler = StandardScaler()
scaled_df = scaler.fit_transform(new_df)
new_df_scaled = pd.DataFrame(scaled_df, columns=new_df.columns)
new_df_scaled

In [None]:
# Summary (columns, dtypes, non-null counts) of the one-hot encoded, scaled frame.
new_df_scaled.info()

In [None]:
# Variant of the encoding pipeline in which BusinessTravel keeps a single
# ordinal column instead of being one-hot encoded.
oe_df = df.copy()
oe_df['BusinessTravel'] = oe_businesstravel

# Remaining categorical features are one-hot encoded, dropping the first
# category of each feature.
categories = ['Gender', 'Job Function', 'MaritalStatus', 'Resign Status']
ohc = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')
c_transformed_df = ohc.fit_transform(oe_df[categories])
dummied_categories_df = pd.DataFrame(c_transformed_df, columns=ohc.get_feature_names_out())
# Indicator columns first, then the columns that were not one-hot encoded.
new_df_oe = pd.concat([dummied_categories_df, oe_df.drop(columns=categories)], axis=1)

# Standardise every column of the combined frame.
scaler = StandardScaler()
scaled_oe_df = scaler.fit_transform(new_df_oe)
new_df_scaled_oe = pd.DataFrame(scaled_oe_df, columns=new_df_oe.columns)
new_df_scaled_oe

In [None]:
# Summary of the ordinal-BusinessTravel variant of the scaled frame.
new_df_scaled_oe.info()

In [None]:
# Variant of the encoding pipeline in which BusinessTravel uses the weighted
# ordinal codes.
oe_weighted_df = df.copy()
oe_weighted_df['BusinessTravel'] = oe_weighted_businesstravel

# Remaining categorical features are one-hot encoded, dropping the first
# category of each feature.
categories = ['Gender', 'Job Function', 'MaritalStatus', 'Resign Status']
ohc = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')
c_transformed_df = ohc.fit_transform(oe_weighted_df[categories])
dummied_categories_df = pd.DataFrame(c_transformed_df, columns=ohc.get_feature_names_out())
# Indicator columns first, then the columns that were not one-hot encoded.
new_df_oe_weighted = pd.concat([dummied_categories_df, oe_weighted_df.drop(columns=categories)], axis=1)

# Standardise every column of the combined frame.
scaler = StandardScaler()
scaled_oe_weighted_df = scaler.fit_transform(new_df_oe_weighted)
new_df_scaled_oe_weighted = pd.DataFrame(scaled_oe_weighted_df, columns=new_df_oe_weighted.columns)
new_df_scaled_oe_weighted

In [None]:
# Summary of the weighted-ordinal variant of the scaled frame.
new_df_scaled_oe_weighted.info()

In [None]:
# Weighted-ordinal variant that keeps ALL one-hot categories (no ``drop``),
# which the cluster-profiling cells use to report every category.
oe_weighted_nfirst_df = df.copy()
oe_weighted_nfirst_df['BusinessTravel'] = oe_weighted_businesstravel

categories = ['Gender', 'Job Function', 'MaritalStatus', 'Resign Status']
# No ``drop`` argument here: one indicator column per category is produced.
ohc = OneHotEncoder(sparse=False, handle_unknown='ignore')
c_transformed_df = ohc.fit_transform(oe_weighted_nfirst_df[categories])
dummied_categories_df = pd.DataFrame(c_transformed_df, columns=ohc.get_feature_names_out())
# Unscaled frame: indicator columns first, then the untouched columns.
new_df_oe_weighted_nfirst = pd.concat([dummied_categories_df, oe_weighted_nfirst_df.drop(columns=categories)], axis=1)

# Standardised counterpart of the frame above.
scaler = StandardScaler()
scaled_oe_weighted_nfirst_df = scaler.fit_transform(new_df_oe_weighted_nfirst)
new_df_scaled_oe_weighted_nfirst = pd.DataFrame(scaled_oe_weighted_nfirst_df, columns=new_df_oe_weighted_nfirst.columns)
new_df_scaled_oe_weighted_nfirst.info()

### PCA
I will not use PCA for the unsupervised learning because the cumulative explained variance only reaches more than 80% when there are 10 PCs.\
With this many PCs, it will be very difficult to compare the clustering.\
Also, PCA will result in loss of information. If I were to only compare the first 3 PCs, which only have a cumulative explained variance of 0.3625, for the clustering, it will be very unreliable.

In [None]:
from sklearn.decomposition import PCA

def pca_results(data, pca):
    """Summarise a fitted PCA as one table with a row per principal component.

    Parameters
    ----------
    data : DataFrame-like
        Frame the PCA was fitted on; its keys label the loading columns.
    pca : fitted PCA-like object
        Must expose ``components_``, ``explained_variance_`` and
        ``explained_variance_ratio_``.

    Returns
    -------
    pandas.DataFrame
        Indexed "PC 1", "PC 2", ... with the eigenvalue, explained variance
        ratio, cumulative explained variance ratio and the loadings of every
        original feature, all rounded to 4 decimals.
    """
    n_components = len(pca.components_)
    pc_labels = [f"PC {number}" for number in range(1, n_components + 1)]

    def _single_column(values, title):
        # Turn a 1-D vector into a rounded one-column frame indexed by PC label.
        as_column = values.reshape(n_components, 1)
        return pd.DataFrame(np.round(as_column, 4), columns=[title], index=pc_labels)

    loadings = pd.DataFrame(np.round(pca.components_, 4), columns=data.keys(), index=pc_labels)

    summary_parts = [
        _single_column(pca.explained_variance_, 'Eigenvalues'),
        _single_column(pca.explained_variance_ratio_, 'Explained Variance'),
        _single_column(pca.explained_variance_ratio_.cumsum(), 'Cumulative Explained Variance'),
        loadings,
    ]
    return pd.concat(summary_parts, axis=1)


def loadingplot(data, pca, pc_plots, width=5, height=5, margin=0.5):
    """Draw a loading plot of the original features on two principal components.

    Parameters
    ----------
    data : DataFrame
        Frame the PCA was fitted on; its column names label the arrows.
    pca : fitted PCA-like object exposing ``components_``.
    pc_plots : sequence of two ints
        1-based numbers of the components shown on the x and y axes.
    width, height : figure size passed to ``plt.subplots``.
    margin : padding added around the extreme loadings on both axes.

    Returns
    -------
    The matplotlib axes holding the plot.
    """
    # Convert the 1-based component numbers to 0-based row indices.
    x_pc, y_pc = pc_plots[0] - 1, pc_plots[1] - 1

    fig, ax = plt.subplots(figsize=(width, height))

    # Axis limits always contain the origin, padded by `margin`.
    x_loadings = pca.components_[x_pc, :]
    y_loadings = pca.components_[y_pc, :]
    x_min = min(x_loadings.min(), 0) - margin
    x_max = max(x_loadings.max(), 0) + margin
    y_min = min(y_loadings.min(), 0) - margin
    y_max = max(y_loadings.max(), 0) + margin
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    # Vertical offset that keeps each label just above its arrow head.
    text_pos = 0.1

    # One arrow per original feature, from the origin to its loadings.
    for feature_idx, loading in enumerate(pca.components_.T):
        tip_x = loading[x_pc]
        tip_y = loading[y_pc]
        ax.arrow(0, 0, tip_x, tip_y, head_width=0.1, head_length=0.1, linewidth=2, color='red')
        ax.text(tip_x, tip_y + text_pos, data.columns[feature_idx], color='black', ha='center', va='center', fontsize=12)

    # Dashed reference lines through the origin.
    ax.plot([x_min, x_max], [0, 0], color='k', linestyle='--', linewidth=1)
    ax.plot([0, 0], [y_min, y_max], color='k', linestyle='--', linewidth=1)

    ax.set_xlabel(f"PC{x_pc + 1}", fontsize=14)
    ax.set_ylabel(f"PC{y_pc + 1}", fontsize=14)
    ax.set_title("Loading plot", fontsize=14)

    return ax

In [None]:
# Fit a PCA that keeps as many components as the data allows
# (the smaller of the number of columns and the number of rows).
pca = PCA(n_components=min(len(new_df_scaled.columns), len(new_df_scaled.index))).fit(new_df_scaled)
# Project the scaled data onto the principal components.
pca_samples = pca.transform(new_df_scaled)

# Table of eigenvalues, explained variance and loadings per component.
results = pca_results(new_df_scaled, pca)
results

## Select the first 10 PCs as their cumulative explained variance is more than 80%

In [None]:
# Scree plot: eigenvalue (explained variance) against 1-based component number.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(np.arange(1, len(pca.explained_variance_) + 1), pca.explained_variance_)
# One major tick per component.
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set(xlabel='Component Number', ylabel='Eigenvalue', title='Screeplot')
plt.show()

In [None]:
# Refit the PCA keeping only the first 10 components and project the data.
pca2 = PCA(n_components=10).fit(new_df_scaled)
scores2 = pca2.transform(new_df_scaled)

# Loading plot with PC10 on the x axis and PC6 on the y axis.
ax2 = loadingplot(new_df_scaled, pca2, [10,6], width=7, height=7, margin=0.2)
plt.show()

In [None]:
# Wrap the 10 component scores in a frame with columns PC1..PC10.
scores2_df = pd.DataFrame(scores2, columns=[f"PC{i}" for i in range(1, 11)])
scores2_df

## Model Building / Evaluation
I decided to use the Calinski-Harabasz Score instead of the Silhouette Score,\
as the Calinski-Harabasz Score measures something similar to the Silhouette Score.\
Also, the Silhouette Score may not take into account the size of the clusters.\
E.g. In the tuning of Agglomerative Clustering, the linkage of <i>single</i> gave a very high Silhouette Score, but in actuality, the cluster found was just a few points which would just be identifying outliers.

In [None]:
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering, DBSCAN, Birch, AgglomerativeClustering, MeanShift
from sklearn.mixture import GaussianMixture

from sklearn.metrics import silhouette_score
# attempts to describe how similar a datapoint is to other datapoints in its cluster,
# relative to datapoints not in its cluster
# It is bounded between -1 and 1. Closer to -1 suggests incorrect clustering,
# while closer to +1 shows that each cluster is very dense.
# Advantages:
# The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. Scores around zero indicate overlapping clusters.
# Disadvantages:
# Generally higher for convex clusters than other concepts of clusters, such as density based clusters like those obtained through DBSCAN

from sklearn.metrics import calinski_harabasz_score
# the ratio of the variance of a datapoint compared to points in other clusters, 
# against the variance compared to points within its cluster.
# A high CH index is desirable.
# This score is not bounded
# Advantages:
# The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster
# Disadvantages:
# 

from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

from tqdm import tqdm

### KMeans
As KMeans uses Euclidean distance to train, due to the high dimensionality of the dataset (16 dimensions), the Euclidean distance becomes inflated (curse of dimensionality)

Disadvantages:\
Extremely susceptible to outliers and noise.\
The algorithm uses centroids that are calculated as the mean of all the points in a cluster rather than real points of the distribution, so outliers present in any cluster will distort its centroid and will also cause the SSE to blow up.

In [None]:
# Two-cluster KMeans on the scaled salary and length-of-service columns.
# NOTE: no random_state is set, so the result can differ between runs.
kmeans = KMeans(n_clusters=2)
kmeans.fit(scaled_num_df[['Salary ($)', 'Length of Service (Years)']])
y_kmeans = kmeans.predict(scaled_num_df[['Salary ($)', 'Length of Service (Years)']])

# Cluster centres, in the same two-column feature order as the fit.
centers = kmeans.cluster_centers_

In [None]:
# Positions (within scaled_num_df) of the columns shown on each axis.
x = 2
y = 3
z = 9  # not used by the 2-D plot below

x_column = scaled_num_df.columns[x]
y_column = scaled_num_df.columns[y]

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)

# Observations coloured by their KMeans label, cluster centres in red.
ax.scatter(scaled_num_df[x_column], scaled_num_df[y_column], c=y_kmeans, s=50)
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200)

ax.set_xlabel(x_column)
ax.set_ylabel(y_column)

plt.show()

In [None]:
# Sweep k = 2..14, recording the inertia (for an elbow plot) and the
# silhouette score of each KMeans fit on the scaled numeric features.
inertia_arr = []
silhouette_arr =[]

x_ = np.arange(2, 15)

for k in range(2, 15):
    kmeans = KMeans(n_clusters=k)
    # ``fit`` returns the estimator itself, so the calls can be chained.
    pred = kmeans.fit(scaled_num_df).predict(scaled_num_df)
    inertia_arr += [kmeans.inertia_]
    silhouette_arr += [silhouette_score(scaled_num_df, pred)]

In [None]:
# Elbow plot: KMeans inertia against the number of clusters.
plt.plot(x_, inertia_arr)
plt.show()

In [None]:
# Silhouette score against the number of clusters.
plt.plot(x_, silhouette_arr)
plt.show()

### Affinity Propagation

In [None]:
# Affinity propagation on the one-hot encoded, scaled dataset.
ap_model = AffinityPropagation(random_state=5)
ap_model.fit(new_df_scaled)
pred = ap_model.predict(new_df_scaled)

# Features shown on the two plot axes.
x_ax = 'Salary ($)'
y_ax = 'Length of Service (Years)'

plt.scatter(new_df_scaled[x_ax], new_df_scaled[y_ax], c=pred, s=50, cmap='viridis')

centers = ap_model.cluster_centers_
# Look the centre coordinates up by column name rather than by hard-coded
# position, so the red markers always match the plotted axes even if the
# column order of ``new_df_scaled`` changes.
x_idx = new_df_scaled.columns.get_loc(x_ax)
y_idx = new_df_scaled.columns.get_loc(y_ax)
plt.scatter(centers[:, x_idx], centers[:, y_idx], c='red', s=200, alpha=0.8)

plt.xlabel(x_ax)
plt.ylabel(y_ax)
plt.show()

In [None]:
# Silhouette score of the labels currently held in `pred` on the scaled features.
silhouette_score(new_df_scaled, pred)

### Mean Shift

In [None]:
# Mean shift on the scaled features. With ``cluster_all=False`` points that
# fall outside every kernel are not forced into a cluster (scikit-learn
# labels such orphans -1).
ms_model = MeanShift(cluster_all=False)
ms_result = ms_model.fit_predict(new_df_scaled)

centers = ms_model.cluster_centers_

In [None]:
# Distinct labels produced by mean shift.
np.unique(ms_result)

In [None]:
# Column positions (within new_df_scaled) shown on the three axes.
x = 12
y = 15
z = 9

x_column = new_df_scaled.columns[x]
y_column = new_df_scaled.columns[y]
z_column = new_df_scaled.columns[z]

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection = '3d')

# Observations coloured by mean-shift label, cluster centres in red.
ax.scatter(new_df_scaled[x_column], new_df_scaled[y_column], new_df_scaled[z_column], c=ms_result, s=50)
ax.scatter(centers[:, x], centers[:, y], centers[:, z], c='red', s=200)
plt.show()

### Spectral Clustering

In [None]:
# Spectral clustering into 4 clusters on the scaled features, using the
# 'discretize' label-assignment strategy.
sc_model = SpectralClustering(n_clusters=4, assign_labels='discretize')
sc_result = sc_model.fit_predict(new_df_scaled)
sc_result

In [None]:
# Column positions (within new_df_scaled) shown on the two axes.
x = 12
y = 15

x_column = new_df_scaled.columns[x]
y_column = new_df_scaled.columns[y]

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)

# Observations coloured by their spectral-clustering label.
ax.scatter(new_df_scaled[x_column], new_df_scaled[y_column], c=sc_result, s=50)
ax.set_xlabel(x_column)
ax.set_ylabel(y_column)
plt.show()

In [None]:
# Silhouette score of the spectral-clustering labels on the scaled features.
silhouette_score(new_df_scaled, sc_result)

### Agglomerative Clustering
A hierarchical clustering model\
Each point initially starts as its own cluster, and the nearest or most similar clusters are gradually merged until only one cluster remains\
We keep on merging the clusters which are nearest or have a high similarity score to one cluster. So, if we define a cut-off or threshold score for the merging we will get multiple clusters instead of a single one.\
E.g. if we say the threshold similarity score is 0.5, the algorithm will stop merging once no two clusters have a similarity score greater than 0.5, and the number of clusters present at that step is the final number of clusters.

In [None]:
# Compare agglomerative linkages for 2..14 clusters on the precomputed Gower
# matrix of the weighted-ordinal dataset.
linkage_names = ('complete', 'average', 'single')
silhouette_arr = {'complete': [], 'single': [], 'average': []}
calinski_arr = {'complete': [], 'single': [], 'average': []}
x = np.arange(2, 15)

for n in tqdm(x):
    for linkage_name in linkage_names:
        ac_sweep_model = AgglomerativeClustering(n_clusters=n, compute_distances=True, affinity='precomputed', linkage=linkage_name)
        ac_sweep_pred = ac_sweep_model.fit_predict(gower_oe_weighted_df)

        # The input is a distance matrix, so the silhouette must be told so;
        # otherwise its rows are treated as Euclidean feature vectors.
        silhouette_arr[linkage_name].append(
            silhouette_score(gower_oe_weighted_df, ac_sweep_pred, metric='precomputed')
        )
        # NOTE(review): Calinski-Harabasz has no precomputed-distance mode; it
        # treats the rows of the distance matrix as feature vectors here, so
        # interpret this score with care.
        calinski_arr[linkage_name].append(calinski_harabasz_score(gower_oe_weighted_df, ac_sweep_pred))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
# silhouette score plot
ax1.title.set_text('Silhouette Score')
# calinski harabasz score plot
ax2.title.set_text('Calinski Harabas Z Score')
for linkage_name in ('complete', 'single', 'average'):
    ax1.plot(x, silhouette_arr[linkage_name], label=linkage_name)
    ax2.plot(x, calinski_arr[linkage_name], label=linkage_name)

# ``plt.legend()`` would only annotate the last axes; give each panel its own.
ax1.legend()
ax2.legend()
plt.show()

In [None]:
# Final agglomerative model: 2 clusters, complete linkage, on the precomputed
# Gower matrix of the weighted-ordinal dataset.
ac_model = AgglomerativeClustering(n_clusters=2, compute_distances=True, affinity='precomputed', linkage='complete')
ac_result = ac_model.fit_predict(gower_oe_weighted_df)
# The input is a distance matrix, so the silhouette is computed with
# metric='precomputed' instead of treating its rows as feature vectors.
print(f"Silhouette Score : {silhouette_score(gower_oe_weighted_df, ac_result, metric='precomputed')}")
# NOTE(review): Calinski-Harabasz has no precomputed mode; it treats the rows
# of the distance matrix as features here.
print(f"Calinski Score : {calinski_harabasz_score(gower_oe_weighted_df, ac_result)}")

#### Observations (n_clusters = 2 | linkage = complete)
Categoric Variables:
- `Resign Status` - 1

Numeric Variables:
- Snake Plot
    - `Age` - 1
    - `BusinessTravel` - 1
    - `Salary` - 1
    - `Length of Service` - 1
- Heat Map
    - `BusinessTravel` - 1
    - `Salary` - 1
    - `Length of Service` - 1

#### Observations (n_clusters = 3 | linkage = average)
Categoric Variables:
- `Resign Status` - 2
- `Sales` - 1
- `R&D` - 1

Numeric Variables:
- Snake Plot
    - `Age` - 1
    - `BusinessTravel` - 1
    - `Distance Between Company and Home` - 1
    - `Education` - 1
    - `Salary` - 1
    - `Performance Rating` - 1
    - `Length of Service` - 1

Problem:
- Label 1 only has 8 staff clustered, which might heavily skew the insights of the cluster.
- Label 1 are all outliers.

In [None]:
# One-hot indicator columns inspected when profiling the clusters.
categories = [
    'Gender_Female',
    'Job Function_Human Resources',
    'Job Function_Research & Development',
    'Job Function_Sales',
    'MaritalStatus_Divorced',
    'MaritalStatus_Married',
    'MaritalStatus_Single',
    'Resign Status_Yes'
]
# Numeric / ordinal columns inspected when profiling the clusters.
numeric = [
    'Age',
    'BusinessTravel',
    'Distance Between Company and Home (KM)',
    'Education (1 is lowest, 5 is highest)',
    'Job Satisfaction (1 is lowest, 4 is highest)',
    'Salary ($)',
    'Performance Rating (1 is lowest, 4 is highest)',
    'Work Life Balance (1 is worst, 4 is best)',
    'Length of Service (Years)'
]

# Attach the agglomerative cluster labels to the unscaled and the scaled
# versions of the encoded dataset (aligned on the default row index).
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)
labelled_scaled_df = pd.concat([new_df_scaled_oe_weighted_nfirst, labels], axis=1)

In [None]:
# Attach the cluster labels to the unscaled encoded dataset.
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)

labelled_df_cat = labelled_df[categories + ['Labels']]
# Share of each cluster's members that have the indicator set (sum / count).
cluster_count_proportion = labelled_df_cat.groupby(['Labels']).sum() / labelled_df_cat.groupby(['Labels']).count()
# Same share computed over the whole population.
population_count_proportion = labelled_df_cat.drop(columns=['Labels']).sum() / labelled_df_cat.drop(columns=['Labels']).count()
# Positive values: the category is over-represented in the cluster.
relative_imp = cluster_count_proportion - population_count_proportion

plt.figure(figsize=(25,8))
plt.title('Relative Importance of attributes')
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='viridis')
plt.show()

In [None]:
# Attach the cluster labels to the standardised encoded dataset.
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_scaled_df = pd.concat([new_df_scaled_oe_weighted_nfirst, labels], axis=1)

labelled_scaled_df_num = labelled_scaled_df[numeric + ['Labels']]
# Long format: one row per (observation, attribute) pair, as lineplot expects.
labelled_scaled_df_num_melt = pd.melt(labelled_scaled_df_num, id_vars=['Labels'], value_vars=numeric, var_name='Attribute', value_name='Value')

plt.figure(figsize=(20,5))
plt.title('Snake plot of standardized variables')
sns.lineplot(x='Attribute', y='Value', hue='Labels', data=labelled_scaled_df_num_melt, palette='viridis')
plt.xticks(rotation=90)
# Reference line at 0 spanning all attributes.
plt.hlines(0, 0, len(numeric)-1, colors='red')
plt.show()

In [None]:
# Attach the cluster labels to the unscaled encoded dataset.
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)
labelled_df_num = labelled_df[numeric + ['Labels']]
# Mean of every numeric attribute per cluster and over the whole population.
cluster_avg = labelled_df_num.groupby(['Labels']).mean()
population_avg = labelled_df_num.drop(columns=['Labels']).mean()
# Relative deviation of the cluster mean from the population mean
# (0 means identical, 0.10 means 10% above).
relative_imp = cluster_avg / population_avg - 1

plt.figure(figsize=(25,8))
plt.title('Relative Importance of attributes')
sns.heatmap(data=relative_imp, annot=True, fmt='.2f')
plt.show()

In [None]:
# Attach the cluster labels to the unscaled encoded dataset.
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)

# Position (within labelled_df) of the column whose distribution is compared.
x = 18
fig, ax = plt.subplots(1,2, figsize=(20, 10))

ax[0].title.set_text('Label 0')
ax[1].title.set_text('Label 1')

# Each panel shows only the members of its own cluster. The left panel is
# titled 'Label 0', so it must be filtered like the right one rather than
# plotting the whole dataset.
sns.histplot(data=labelled_df[labelled_df['Labels'] == 0], x=labelled_df.columns[x], ax=ax[0])
sns.histplot(data=labelled_df[labelled_df['Labels'] == 1], x=labelled_df.columns[x], ax=ax[1])
# Shared x range so the two distributions are directly comparable.
ax[0].set_xlim([0, 45])
ax[1].set_xlim([0, 45])
plt.show()

In [None]:
# Attach the cluster labels to the unscaled encoded dataset.
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)

# Position (within labelled_df) of the column whose counts are compared.
x = 9
fig, ax = plt.subplots(1,2, figsize=(20, 10))

# One panel per cluster label (0 on the left, 1 on the right).
for cluster_label, axis in enumerate(ax):
    axis.title.set_text(f'Label {cluster_label}')
    cluster_members = labelled_df[labelled_df['Labels'] == cluster_label]
    sns.countplot(data=cluster_members, x=labelled_df.columns[x], ax=axis)
plt.show()

In [None]:
# NOTE(review): in the visible notebook `scaled` is only assigned in the cell
# that follows this one, so running the notebook top to bottom raises a
# NameError here unless that cell has been executed first.
scaled[['Age']].reset_index()

In [None]:
# Attach the cluster labels to the unscaled encoded dataset.
labels = pd.DataFrame(ac_result, columns=["Labels"])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)

# Share of each cluster's members per BusinessTravel level: the count per
# (label, level) divided by the cluster size. Any fully populated column gives
# the same ratio; 'Age' is used as the carrier column.
scaled = labelled_df.groupby(['Labels', 'BusinessTravel']).count() / labelled_df.groupby(['Labels']).count()
scaled = scaled[['Age']].reset_index()

fig, ax = plt.subplots(1,2, figsize=(20, 10))

ax[0].title.set_text('Label 0')
ax[1].title.set_text('Label 1')

sns.barplot(data=scaled[scaled['Labels'] == 0], x='BusinessTravel', y='Age', ax=ax[0])
sns.barplot(data=scaled[scaled['Labels'] == 1], x='BusinessTravel', y='Age', ax=ax[1])
# The bars are within-cluster proportions (bounded by [0, 1]), not raw counts,
# so the axis is labelled accordingly.
ax[0].set_ylabel("Proportion")
ax[1].set_ylabel("Proportion")
ax[0].set_ylim([0, 1])
ax[1].set_ylim([0, 1])
plt.show()

In [None]:
# Number of observations assigned to each agglomerative cluster label.
u, count = np.unique(ac_result, return_counts=True)
dict(zip(u, count))

In [None]:
# Position (within labelled_df) of the column to break down per cluster.
col = 7
column_name = labelled_df.columns[col]
print(column_name)

# For every cluster label, print how often each value of the column occurs.
for label in np.unique(ac_result):
    in_cluster = labelled_df['Labels'] == label
    u, count = np.unique(labelled_df.loc[in_cluster, column_name], return_counts=True)
    print(f"{label} -", dict(zip(u, count)))

In [None]:
# Number of merge steps recorded by the fitted agglomerative model.
len(ac_model.children_)

In [None]:
# Agglomerative clustering into 6 clusters directly on the scaled features
# (default linkage and metric), keeping the merge distances.
ac_model = AgglomerativeClustering(n_clusters=6, compute_distances=True)
pred_ac = ac_model.fit_predict(new_df_scaled)

In [None]:
# Attach the 6-cluster labels to the scaled frame.
labels = pd.DataFrame(ac_model.labels_, columns=['Labels'])
ac_labeled_df = pd.concat([new_df_scaled, labels], axis=1)
# Shift the labels to start at 1 instead of 0.
ac_labeled_df['Labels'] += 1
ac_labeled_df

In [None]:
# Number of merge distances stored (available because compute_distances=True).
len(ac_model.distances_)

In [None]:
# Pairs of nodes merged at each step of the hierarchy.
ac_model.children_

### DBSCAN

Works best with noisy data with outliers and doesn't prefer spherical, globular, or elliptical clusters. It can cluster in any shape.\

How it works:\
`Epsilon`: It is considered radius around a given point\
`Minimum points`: It gives the minimum number of points that have to be present inside the Epsilon radius circle around a data point\
`Core point`: a data point has a number of points equal or more than the "minimum points" inside the radius of epsilon around the circle\
`Border point`: a data point does not have the minimum points required to make a core point but has at least one core point inside the epsilon radius around the points\
`Noise point`: a data point has no core points inside the epsilon radius around the point (DBSCAN rules out outliers)

Not a great unsupervised model for clustering this dataset.\

Noisy samples are given the label -1

In [None]:
# Earlier experiment on three scaled numeric columns, kept for reference:
# temp_df = pd.DataFrame(new_df_scaled[['Salary ($)', 'Distance Between Company and Home (KM)', 'Length of Service (Years)']], columns=['Salary ($)', 'Distance Between Company and Home (KM)', 'Length of Service (Years)'])
# DBSCAN on the precomputed Gower matrix with a neighbourhood radius of 0.1.
dbscan_model = DBSCAN(eps=0.1, metric='precomputed')
pred_dbscan = dbscan_model.fit_predict(gower_df)

In [None]:
# Number of observations per DBSCAN label (-1 marks noise).
u, count = np.unique(pred_dbscan, return_counts=True)
dict(zip(u, count))
## Since every observation is labelled -1 (noise) for very small values of eps, DBSCAN is not a good model for clustering this data

In [None]:
# Sweep the DBSCAN radius over (0, 1] on the precomputed Gower matrix and
# record the silhouette score for every eps that yields a scorable labelling.
sil_arr = []
x = np.linspace(0.0001, 1, 100)
x_ = []
for k in tqdm(x):
    dbscan_model = DBSCAN(eps=k, metric='precomputed')
    dbscan_result = dbscan_model.fit_predict(gower_df)
    try:
        # Score in the same (Gower) space that was clustered; the matrix is a
        # distance matrix, hence metric='precomputed'. Noise points (-1) are
        # treated by silhouette_score as one more label.
        score = silhouette_score(gower_df, dbscan_result, metric='precomputed')
    except ValueError:
        # silhouette_score rejects labellings with a single label (e.g. all
        # noise or one big cluster); skip those eps values. Any other error
        # is a real problem and is allowed to propagate.
        continue
    sil_arr.append(score)
    x_.append(k)

plt.plot(x_, sil_arr)
plt.show()

### Gaussian Mixtures
Does not use a distance measure, but applies a probability distribution around the cluster centers to work out the likelihood that a data point belongs to a given cluster.

In [None]:
# Two-component Gaussian mixture fitted on the Gower matrix.
# NOTE(review): the rows of the distance matrix are used as feature vectors
# here, and no random_state is set, so results can vary between runs.
gm_model = GaussianMixture(n_components=2)
gm_result = gm_model.fit_predict(gower_df)

In [None]:
# Distinct component labels assigned by the mixture model.
np.unique(gm_result)

In [None]:
# Silhouette score of the mixture labels, computed on the rows of the Gower matrix.
silhouette_score(gower_df, gm_result)

In [None]:
# Compare Gaussian-mixture covariance structures for 2..14 components on the
# Gower matrix of the resigned employees, tracking silhouette,
# Calinski-Harabasz, AIC and BIC for every fit.
# The 'tied' structure was disabled in the original analysis; its keys are
# kept (empty) so the result dictionaries retain their shape.
covariance_types = ('full', 'diag', 'spherical')
sil_arr = {'full': [], 'tied': [], 'diag': [], 'spherical': []}
calinski_arr = {'full': [], 'tied': [], 'diag': [], 'spherical': []}
aic_arr = {'full': [], 'tied': [], 'diag': [], 'spherical': []}
bic_arr = {'full': [], 'tied': [], 'diag': [], 'spherical': []}
x = np.arange(2, 15)
for k in tqdm(x):
    for cov_type in covariance_types:
        gm_model = GaussianMixture(n_components=k, covariance_type=cov_type, random_state=42)
        gm_result = gm_model.fit_predict(gower_oe_weighted_resigned_df)
        # Every metric is stored under the covariance type that produced it
        # (the 'diag' AIC/BIC used to be filed under 'tied').
        sil_arr[cov_type].append(silhouette_score(gower_oe_weighted_resigned_df, gm_result))
        calinski_arr[cov_type].append(calinski_harabasz_score(gower_oe_weighted_resigned_df, gm_result))
        aic_arr[cov_type].append(gm_model.aic(gower_oe_weighted_resigned_df))
        bic_arr[cov_type].append(gm_model.bic(gower_oe_weighted_resigned_df))

fig, ax = plt.subplots(2, 2, figsize=(20, 20))

panels = [
    (ax[0,0], 'Silhouette Score', sil_arr),
    (ax[0,1], 'Calinski Harabas Z Score', calinski_arr),
    (ax[1,0], 'AIC', aic_arr),
    (ax[1,1], 'BIC', bic_arr),
]
for axis, title, scores in panels:
    axis.title.set_text(title)
    for cov_type in covariance_types:
        axis.plot(x, scores[cov_type], label=cov_type)
    # ``plt.legend()`` would only annotate the last panel; label each one.
    axis.legend()

plt.show()

In [None]:
# Sweep the number of mixture components for several covariance types on
# gower_oe_weighted_df and record two clustering-quality scores per fit.
# Both dicts keep a 'tied' key, but 'tied' is not evaluated here, so its list
# stays empty.
sil_arr = {'full': [], 'tied': [], 'diag': [], 'spherical': []}
calinski_arr = {'full': [], 'tied': [], 'diag': [], 'spherical': []}
x = np.arange(2, 15)

# Evaluation order matters only for which model is left in gm_model /
# gm_result after the loop (the last one: 'spherical' at the largest k).
evaluated_cov_types = ('full', 'diag', 'spherical')

for k in tqdm(x):
    for cov_type in evaluated_cov_types:
        gm_model = GaussianMixture(n_components=k, covariance_type=cov_type, random_state=42)
        gm_result = gm_model.fit_predict(gower_oe_weighted_df)
        sil_arr[cov_type].append(silhouette_score(gower_oe_weighted_df, gm_result))
        calinski_arr[cov_type].append(calinski_harabasz_score(gower_oe_weighted_df, gm_result))

# One panel per score, one line per evaluated covariance type.
fig, ax = plt.subplots(2, 1, figsize=(20, 20))

score_panels = (
    (ax[0], 'Silhouette Score', sil_arr),
    (ax[1], 'Calinski-Harabasz Score', calinski_arr),
)
for axis, title, scores in score_panels:
    axis.set_title(title)
    for cov_type in evaluated_cov_types:
        axis.plot(x, scores[cov_type], label=cov_type)
    axis.legend()

plt.show()

Model Building

In [None]:
# Fit the chosen Gaussian mixture (4 components, full covariance) and keep the
# per-row cluster assignment in gm_result for the analysis cells below.
gm_model = GaussianMixture(covariance_type='full', n_components=4, random_state=42)
gm_result = gm_model.fit_predict(gower_oe_weighted_df)

# Results noted from earlier runs (presumably silhouette, Calinski-Harabasz):
#   2 components                    -> 0.07228,  67.78388
#   4 components, random state = 42 -> 0.156186, 225.3747
#   5 components                    -> 0.15744,  200.63134

final_silhouette = silhouette_score(gower_oe_weighted_df, gm_result)
final_calinski = calinski_harabasz_score(gower_oe_weighted_df, gm_result)
print(f"Silhouette Score : {final_silhouette}")
print(f"Calinski Score : {final_calinski}")

#### Observations (components = 4 | covariance type = full):

Categoric Variables:
- `Gender` - 1,3
- `Job Function_Research & Development`
- `Job Function_Sales`
- `Resign Status` - 2

Numeric Variables:
- Snake Plot
    - `Performance Rating` - 2
    - `Distance Between Company and Home` - 2
    - `Length of Service` - 2
    - `Job Satisfaction` - 2
- Heat Map
    - `Distance Between Company and Home` - 2
    - `Length of Service` - 2

#### Observations (components = 2 | covariance type = spherical)

Categoric Variables:
- None

Numeric Variables:
- Snake Plot
    - `BusinessTravel`
    - `Performance Rating`
- Heat Map
    - None

#### Observations (resigned | components = 2 | covariance type = full)

Categoric Variables:
- `Gender`

Numeric Variables:
- Snake Plot
    - `Age` - 1
    - `Length of Service` - 1
- Heat Map
    - `Length of Service` - 0

In [None]:
# One-hot encoded (0/1) columns, grouped by the feature they were derived from.
categories = (
    ['Gender_Female']
    + [f'Job Function_{function}' for function in ('Human Resources', 'Research & Development', 'Sales')]
    + [f'MaritalStatus_{status}' for status in ('Divorced', 'Married', 'Single')]
    + ['Resign Status_Yes']
)

# Columns treated as numeric in the cluster profiles below.
numeric = [
    'Age', 'BusinessTravel',
    'Distance Between Company and Home (KM)',
    'Education (1 is lowest, 5 is highest)',
    'Job Satisfaction (1 is lowest, 4 is highest)',
    'Salary ($)',
    'Performance Rating (1 is lowest, 4 is highest)',
    'Work Life Balance (1 is worst, 4 is best)',
    'Length of Service (Years)',
]

# Attach the cluster assignment as a 'Labels' column to both frames.
# NOTE(review): concat(axis=1) aligns on the index, so this assumes both
# frames carry a default 0..n-1 index in the same row order as gm_result -
# confirm.
labels = pd.DataFrame(gm_result, columns=["Labels"])
labelled_scaled_df, labelled_df = (
    pd.concat([frame, labels], axis=1)
    for frame in (new_df_scaled_oe_weighted_nfirst, new_df_oe_weighted_nfirst)
)

In [None]:
# For every category column: (column sum / row count) inside each cluster
# minus the same ratio over the whole data set, shown as a heatmap.
labelled_df_cat = labelled_df[categories + ['Labels']]

per_cluster = labelled_df_cat.groupby(['Labels'])
without_labels = labelled_df_cat.drop(columns=['Labels'])

cluster_count_proportion = per_cluster.sum() / per_cluster.count()
population_count_proportion = without_labels.sum() / without_labels.count()
relative_imp = cluster_count_proportion - population_count_proportion

plt.figure(figsize=(25, 8))
plt.title('Relative Importance of attributes')
sns.heatmap(relative_imp, annot=True, fmt='.2f', cmap='viridis')
plt.show()

In [None]:
# Snake plot: reshape the scaled numeric columns to long form (one row per
# sample/attribute pair) and draw one line per cluster label.
labelled_scaled_df_num = labelled_scaled_df[numeric + ['Labels']]
labelled_scaled_df_num_melt = labelled_scaled_df_num.melt(
    id_vars=['Labels'],
    value_vars=numeric,
    var_name='Attribute',
    value_name='Value',
)

plt.figure(figsize=(20, 5))
plt.title('Snake plot of standardized variables')
sns.lineplot(
    data=labelled_scaled_df_num_melt,
    x='Attribute',
    y='Value',
    hue='Labels',
    palette='viridis',
)
plt.xticks(rotation=90)
# Red reference line at 0 spanning from the first to the last attribute.
plt.hlines(y=0, xmin=0, xmax=len(numeric) - 1, colors='red')
plt.show()

In [None]:
# Relative deviation of each cluster's mean from the overall mean for every
# numeric column (cluster mean / population mean - 1), shown as a heatmap.
labelled_df_num = labelled_df[numeric + ['Labels']]

population_avg = labelled_df_num.drop(columns=['Labels']).mean()
cluster_avg = labelled_df_num.groupby(['Labels']).mean()
relative_imp = cluster_avg.div(population_avg) - 1

plt.figure(figsize=(25, 8))
plt.title('Relative Importance of attributes')
sns.heatmap(relative_imp, annot=True, fmt='.2f')
plt.show()

In [None]:
# Bivariate KDE of two columns (picked by position) drawn separately for each
# of the four cluster labels on a 2x2 grid.
labels = pd.DataFrame(gm_result, columns=['Labels'])
labelled_df = pd.concat([new_df_oe_weighted_nfirst, labels], axis=1)

# Positional indices of the columns shown on the x and y axes.
x, y = 15, 13
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

# ax.flat walks the grid row by row, so labels 0..3 land on
# ax[0,0], ax[0,1], ax[1,0], ax[1,1] in that order.
for label, axis in enumerate(ax.flat):
    axis.title.set_text(f'Label {label}')
    sns.kdeplot(
        data=labelled_df[labelled_df['Labels'] == label],
        x=labelled_df.columns[x],
        y=labelled_df.columns[y],
        palette='viridis',
        ax=axis,
    )

In [None]:
# Histogram of one column (picked by position) for each of the four cluster
# labels, one subplot per label.
x = 12
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

# Row-major traversal: labels 0..3 map to ax[0,0], ax[0,1], ax[1,0], ax[1,1].
for label, axis in enumerate(ax.flat):
    axis.title.set_text(f'Label {label}')
    members = labelled_df[labelled_df['Labels'] == label]
    sns.histplot(data=members, x=labelled_df.columns[x], ax=axis)

In [None]:
# Count plot of one column (picked by position) for each of the four cluster
# labels, one subplot per label.
x = 4
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

# Row-major traversal: labels 0..3 map to ax[0,0], ax[0,1], ax[1,0], ax[1,1].
for label, axis in enumerate(ax.flat):
    axis.title.set_text(f'Label {label}')
    members = labelled_df[labelled_df['Labels'] == label]
    sns.countplot(data=members, x=labelled_df.columns[x], ax=axis)

In [None]:
# Scatter of two columns (picked by position) per cluster label, using only
# the rows whose 'Resign Status_Yes' flag is 1.
# NOTE(review): this pairs gm_result with the resigned-only subset, so it
# presumably expects gm_result to come from a model fitted on resigned
# employees; if the lengths differ, concat pads the shorter side with NaN -
# confirm which model was fitted before running this cell.
labels = pd.DataFrame(gm_result, columns=['Labels'])
resigned_rows = new_df_oe_weighted_nfirst[new_df_oe_weighted_nfirst['Resign Status_Yes'] == 1]
labelled_df = pd.concat([resigned_rows.reset_index(drop=True), labels], axis=1)

# Positional indices of the columns shown on the x and y axes.
x, y = 15, 18
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

# Row-major traversal: labels 0..3 map to ax[0,0], ax[0,1], ax[1,0], ax[1,1].
for label, axis in enumerate(ax.flat):
    axis.title.set_text(f'Label {label}')
    members = labelled_df[labelled_df['Labels'] == label]
    axis.scatter(members.iloc[:, x], members.iloc[:, y])
    axis.set_xlabel(members.columns[x])
    axis.set_ylabel(members.columns[y])

In [None]:
# Number of rows assigned to each cluster label, as a {label: size} mapping.
u, count = np.unique(gm_result, return_counts=True)
{cluster: size for cluster, size in zip(u, count)}

In [None]:
# Number of cluster assignments currently held in gm_result.
len(gm_result)

In [None]:
# Number of rows in oe_weighted_df whose 'Resign Status' is 'Yes'.
len(oe_weighted_df.loc[oe_weighted_df['Resign Status'] == 'Yes'])

In [None]:
# For one column (picked by position), print how often each distinct value
# occurs inside every cluster.
col = 13
column_name = labelled_df.columns[col]
print(column_name)

for label in np.unique(gm_result):
    in_cluster = labelled_df['Labels'] == label
    u, count = np.unique(labelled_df.loc[in_cluster, column_name], return_counts=True)
    print(f"{label} -", dict(zip(u, count)))

In [None]:
# Mean of the column at position 15 over the rows assigned to cluster 3.
# Values noted by the author from earlier runs ("Salary std" per cluster):
#   cluster 0 - 3446.3785
#   cluster 1 - 4705.7629
#   cluster 2 - 6240.3782
#   cluster 3 - 4579.8788
np.mean(labelled_df.loc[labelled_df['Labels'] == 3, labelled_df.columns[15]])

In [None]:
# Print the mean and standard deviation of the column at position 15 for each
# of the four cluster labels.
target_column = labelled_df.columns[15]
for i in range(4):
    cluster_values = labelled_df.loc[labelled_df['Labels'] == i, target_column]
    avg = np.mean(cluster_values)
    std = np.std(cluster_values)
    print(f"Label {i} : Mean - {avg} | Std - {std}")