In [None]:
import math
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import collections
from scipy.stats.stats import pearsonr
import pandas as pd
import os
from datetime import date

from sklearn.cluster import KMeans, DBSCAN
from scipy.spatial.distance import pdist,  squareform
from tqdm.notebook import tqdm
from sklearn.metrics import silhouette_score, pairwise_distances # For Model evaluation
from sklearn.neighbors import NearestNeighbors

from pyclustering.cluster import cluster_visualizer_multidim
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

import seaborn as sns
import re

In [None]:
# Load the players table.
# NOTE(review): DATASET_DIR is built here but the CSV below is read from the
# current working directory, not from DATASET_DIR - confirm which is intended.
DATASET_DIR = os.path.join("dataset", "")
# index_col=0 makes the first CSV column the DataFrame index
df_players = pd.read_csv('players.csv', index_col=0, sep=',')

In [None]:
#df_players['ioc'] = df_players.ioc.astype("category").cat.codes

In [None]:
# Print the column names, non-null counts and dtypes of the loaded table
df_players.info()

In [None]:
# Feature subset used by the first exploratory clustering run
df_prova_org = df_players.loc[:, [
    'best_rank', 'best_rank_points',
    'best_of_3_match', 'best_of_5_match',
    'best_of_3_wins(%)', 'best_of_5_wins(%)',
    'num_win_bye', 'num_win_wo_ret', 'num_win_def',
    'num_wo_ret', 'num_def',
    'tot_minutes', 'height',
    'sv1st_win%', 'df%', 'ace_perc', 'bpS_perc',
    'nmatch', 'lmatch', 'wmatch',
]]
# Keep only the players with more than 100 matches
df_prova = df_prova_org[df_prova_org['nmatch'] > 100]

df_prova

In [None]:
# Distribution of the number of matches per player, bins restricted to 10-400
sns.histplot(
    data=df_players['nmatch'],
    binrange=(10, 400),
    bins="auto",
    kde=True,
    color="lightgreen",
)

In [None]:
# Distribution of the 'wmatch' column, bins restricted to 10-200
sns.histplot(
    data=df_players['wmatch'],
    binrange=(10, 200),
    bins="auto",
    kde=True,
    color="lightgreen",
)

In [None]:
# Distribution of the 'lmatch' column, bins restricted to 10-200
sns.histplot(
    data=df_players['lmatch'],
    binrange=(10, 200),
    bins="auto",
    kde=True,
    color="lightgreen",
)

In [None]:
# Performance features, restricted to rows whose ranking, playing time and
# serve statistics pass the validity checks below
df_performances_org = df_players.loc[
    (df_players['best_rank'] > 0)
    & (df_players['best_rank_points'] >= 0)
    & (df_players['tot_minutes'] > 0)
    & (df_players['ace_perc'] >= 0)
    & (df_players['bpS_perc'] >= 0),
    [
        'best_rank', 'best_rank_points', 'tot_minutes',
        'sv1st_win', 'sv2nd_win', 'df', 'ace_perc', 'bpS_perc',
        'nmatch',
        'best_of_3_match', 'best_of_3_wins',
        'best_of_5_match', 'best_of_5_wins',
        'n_tourney',
    ],
]
# Keep only the players with more than 100 matches
df_performances = df_performances_org[df_performances_org['nmatch'] > 100]

df_performances

In [None]:
# Annotated correlation matrix of the performance features
plt.figure(figsize=(15, 6))
sns.heatmap(data=df_performances.corr(), annot=True)

In [None]:
# Pairwise scatter plots of every performance feature
# NOTE(review): no `hue` is given here, so `palette` may have no visible effect - confirm.
sns.pairplot(data=df_performances, palette="Accent")

## KMeans

In [None]:
#df_prova = df_players[['best_rank', 'tot_minutes']]
#df_prova = df_prova[~df_prova['tot_minutes'].isna()]

In [None]:
# Standardise the selected features and cluster them with K-Means (4 clusters)
scaler = StandardScaler()
scaler.fit(df_prova.values)

kmeans = KMeans(n_clusters=4, n_init=4, max_iter=10)
kmeans.fit(scaler.transform(df_prova.values))

# A bare expression in the middle of a cell is evaluated but never displayed,
# so the cluster sizes are printed explicitly.
print(np.unique(kmeans.labels_, return_counts=True))

# Two of the clustered features, coloured by the assigned cluster label
plt.scatter(df_prova["best_rank"], df_prova["tot_minutes"], c=kmeans.labels_)
plt.show()

In [None]:
# Second K-Means run, on ranking features only (rows with valid values)
df_prova = df_players[['best_rank', 'best_rank_points']]
df_prova = df_prova[(df_prova['best_rank']>0) & (df_prova['best_rank_points']>=0)]

scaler = StandardScaler()
scaler.fit(df_prova.values)

kmeans = KMeans(n_clusters=4, n_init=4, max_iter=10)
kmeans.fit(scaler.transform(df_prova.values))

# A bare expression in the middle of a cell is evaluated but never displayed,
# so the cluster sizes are printed explicitly.
print(np.unique(kmeans.labels_, return_counts=True))

# The two clustered features, coloured by the assigned cluster label
plt.scatter(df_prova["best_rank"], df_prova["best_rank_points"], c=kmeans.labels_)
plt.show()

In [None]:
'''
f, axs = plt.subplots(nrows=6, ncols=6, figsize=(36,36))
plt.suptitle(("Visualization of the clustered data with respect to different feature spaces"), fontsize=28, fontweight='bold')
plot_id = 0
for i in range(len(cs_num.columns)):
    for j in range(i+1, len(cs_num.columns)):
        a, b = cs.columns[i], cs.columns[j]
        axs[int(plot_id/6)][plot_id%6].scatter(cs[a], cs[b], c=negative_sil_colors, s=20)
        for h in range(len(centers)):
            axs[int(plot_id/6)][plot_id%6].scatter(centers[h][cs.columns.get_loc(a)], centers[h][cs.columns.get_loc(b)], marker='o', c="white", alpha=1, s=200, edgecolor='k')
            axs[int(plot_id/6)][plot_id%6].scatter(centers[h][cs.columns.get_loc(a)], centers[h][cs.columns.get_loc(b)], marker='$%d$' % h, alpha=1, s=50, edgecolor='k')    
        axs[int(plot_id/6)][plot_id%6].set_title('Scatter( ' + a + ' , ' + b + ' ): K = ' + str(k), fontdict={'fontsize': 'x-large', 'fontweight' : 'bold'})
        axs[int(plot_id/6)][plot_id%6].set_xlabel(a, fontdict={'fontsize': 'x-large', 'fontweight' : 'bold'})
        axs[int(plot_id/6)][plot_id%6].set_ylabel(b, fontdict={'fontsize': 'x-large', 'fontweight' : 'bold'})
        plot_id = plot_id +1
plt.show()
'''

## DBSCAN

In [None]:
# Standardise the performance features.
# The resulting frame keeps the column names but gets a fresh default index.
scaler = StandardScaler().fit(df_performances)
scaled_array = scaler.transform(df_performances)
scaled_dataframe = pd.DataFrame(data=scaled_array, columns=df_performances.columns)

In [None]:
# Horizontal box plots of every scaled feature
sns.boxplot(data = scaled_dataframe, orient = "h")

In [None]:
# Summary statistics of the scaled features
scaled_dataframe.describe()

In [None]:
# First DBSCAN attempt on the scaled features
dbscan = DBSCAN(min_samples=5, eps=0.75)
dbscan.fit(scaled_dataframe)

In [None]:
# Label of every record and the number of records carrying each label
labels = dbscan.labels_
np.unique(labels, return_counts=True)

In [None]:
# Copy of the unscaled features with the DBSCAN label attached, for plotting
cleaned_dataframe = df_performances.assign(LABEL=labels)
sns.pairplot(data=cleaned_dataframe, hue="LABEL", palette="Accent")

In [None]:
# Square matrix of pairwise Euclidean distances between all records
# (pdist yields the condensed vector, squareform expands it)
dist = squareform(pdist(scaled_dataframe, metric='euclidean'))

In [None]:
# Range of neighbour ranks k to inspect, and one empty list of distances per k
kmin, kmax = 3, 50
kth_distances = {}
# note: after this loop the name k stays bound to kmax
for k in range(kmin, kmax + 1): # initialize k lists
    kth_distances[k] = []

In [None]:
# For every record, store the distance to its k-th nearest neighbour for each
# k in [kmin, kmax].
for d in dist:
    # Sort the row of the distance matrix once; the diagonal entry (distance 0
    # to the record itself) comes first, so position k holds the distance to
    # the k-th nearest neighbour.
    sorted_d = np.sort(d)
    for k in range(kmin, kmax + 1):
        # The lookup must happen inside the loop over k: doing it before the
        # loop reuses a stale k and fills every list with the same value.
        kth_distances[k].append(sorted_d[k])

In [None]:
# k-distance plot: one curve of sorted k-th neighbour distances for every k
plt.figure(figsize=(50, 20))
for k, distances in kth_distances.items():
    plt.plot(range(len(distances)), sorted(distances))

plt.xlabel('sorted distances', fontsize=25)
plt.ylabel('dist from k-th neighbor (eps)', fontsize=25)
plt.tick_params(axis='both', which='major', labelsize=25)
# leave a small margin below zero so the flat part of the curves is visible
plt.ylim(bottom=-0.25)
plt.grid()
plt.show()

#### Grid search for eps and min_samples

In [None]:
def get_metrics(eps, min_samples, dataset, iter_):
    """Fit DBSCAN with the given parameters and summarise the outcome.

    Parameters
    ----------
    eps, min_samples
        Passed unchanged to ``DBSCAN``.
    dataset
        Data the model is fitted on.
    iter_
        Accepted for the caller's bookkeeping; not used in the body.

    Returns
    -------
    tuple
        ``(noise_mean_distance, number_of_clusters)``. The first item is the
        mean distance (rounded to 3 decimals) between the noise points
        (label -1) and their neighbours as returned by a 6-neighbour query
        with the first column dropped, or ``None`` when there is no noise
        point. The second item is the number of distinct non-negative labels.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(dataset)
    cluster_labels = model.labels_
    is_noise = cluster_labels == -1

    noise_mean_distance = None
    if is_noise.any():
        knn = NearestNeighbors(n_neighbors=6).fit(dataset)
        distances, _ = knn.kneighbors(dataset)
        # The query is run on the fitted data itself, so the first column is
        # dropped (presumably each point matched with itself).
        noise_mean_distance = round(distances[is_noise, 1:].mean(), 3)

    # Labels below zero mark noise and are not counted as clusters
    number_of_clusters = len(set(cluster_labels[cluster_labels >= 0]))

    return noise_mean_distance, number_of_clusters

In [None]:
# Candidate eps and min_samples values for the DBSCAN grid search
eps_to_test = list(map(lambda value: round(value, 3), np.arange(0.4, 5, 0.3)))
min_samples_to_test = np.arange(4, 30, 2)
eps_to_test

In [None]:
# Frame for the mean distance of the noise points from their nearest
# neighbours: one row per eps, one column per min_samples, filled with zeros
results_noise = pd.DataFrame(
    np.zeros((len(eps_to_test), len(min_samples_to_test))),
    index=eps_to_test,
    columns=min_samples_to_test,
)

# Frame for the number of clusters found, with the same layout
results_clusters = pd.DataFrame(
    np.zeros((len(eps_to_test), len(min_samples_to_test))),
    index=eps_to_test,
    columns=min_samples_to_test,
)

In [None]:
#grid search

In [None]:
# Run DBSCAN for every (eps, min_samples) pair and record both metrics;
# i counts the combinations tried and is passed to get_metrics
i = 0

for eps in eps_to_test:
    for min_samples in min_samples_to_test:
        i += 1
        # Compute the metrics
        noise_metric, cluster_metric  = get_metrics(eps, min_samples, scaled_dataframe, i)
        # Store the results in the corresponding dataframes
        results_noise.loc[eps, min_samples] = noise_metric
        results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side heatmaps of the two grid-search metrics
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

sns.heatmap(results_noise, annot=True, cbar=False, ax=ax1)
ax1.set_title("Mean Noise Points Distance")
sns.heatmap(results_clusters, annot=True, cbar=False, ax=ax2)
ax2.set_title("Number of clusters")

# rows are eps values, columns are min_samples values in both frames
for axis in (ax1, ax2):
    axis.set_xlabel("min_samples")
    axis.set_ylabel("eps")

plt.tight_layout()
plt.show()

#### Choice of parameters

In [None]:
# DBSCAN with the parameters selected from the grid search, fitted in place
best_dbscan = DBSCAN(eps=1, min_samples=14).fit(scaled_dataframe)

# Label of every record and the number of records carrying each label
labels = best_dbscan.labels_
np.unique(labels, return_counts=True)

In [None]:
# Attach the selected model's labels to a copy of the unscaled features
cleaned_dataframe = df_performances.assign(LABEL=best_dbscan.labels_)
# Pairwise scatter plots coloured by cluster label
sns.pairplot(data=cleaned_dataframe, hue="LABEL", palette="Accent")

#### MinMaxScaler

In [None]:
# Rescale the performance features with MinMaxScaler.
# The resulting frame keeps the column names but gets a fresh default index.
scaler = MinMaxScaler().fit(df_performances)
scaled_array = scaler.transform(df_performances)
scaled_dataframe = pd.DataFrame(data=scaled_array, columns=df_performances.columns)

In [None]:
# First DBSCAN attempt on the min-max scaled features
dbscan = DBSCAN(min_samples=5, eps=0.75).fit(scaled_dataframe)
labels = dbscan.labels_
# number of records carrying each label
np.unique(labels, return_counts=True)

In [None]:
# Copy of the unscaled features with the DBSCAN label attached, for plotting
cleaned_dataframe = df_performances.assign(LABEL=labels)
sns.pairplot(data=cleaned_dataframe, hue="LABEL", palette="Accent")

In [None]:
# Square matrix of pairwise Euclidean distances between all records
# (pdist yields the condensed vector, squareform expands it)
dist = squareform(pdist(scaled_dataframe, metric='euclidean'))

In [None]:
# Range of neighbour ranks k to inspect, and one empty list of distances per k
kmin, kmax = 3, 50
kth_distances = {}
# note: after this loop the name k stays bound to kmax
for k in range(kmin, kmax + 1): # initialize k lists
    kth_distances[k] = []

In [None]:
# For every record, store the distance to its k-th nearest neighbour for each
# k in [kmin, kmax].
for d in dist:
    # Sort the row of the distance matrix once; the diagonal entry (distance 0
    # to the record itself) comes first, so position k holds the distance to
    # the k-th nearest neighbour.
    sorted_d = np.sort(d)
    for k in range(kmin, kmax + 1):
        # The lookup must happen inside the loop over k: doing it before the
        # loop reuses a stale k and fills every list with the same value.
        kth_distances[k].append(sorted_d[k])

In [None]:
# k-distance plot: one curve of sorted k-th neighbour distances for every k
plt.figure(figsize=(50, 20))
for k, distances in kth_distances.items():
    plt.plot(range(len(distances)), sorted(distances))

plt.xlabel('sorted distances', fontsize=25)
plt.ylabel('dist from k-th neighbor (eps)', fontsize=25)
plt.tick_params(axis='both', which='major', labelsize=25)
# leave a small margin below zero so the flat part of the curves is visible
plt.ylim(bottom=-0.25)
plt.grid()
plt.show()

In [None]:
# Candidate eps and min_samples values for the grid search on min-max scaled data
eps_to_test = list(map(lambda value: round(value, 3), np.arange(0.2, 0.6, 0.1)))
min_samples_to_test = np.arange(2, 30, 2)
eps_to_test

In [None]:
# Frame for the mean distance of the noise points from their nearest
# neighbours: one row per eps, one column per min_samples, filled with zeros
results_noise = pd.DataFrame(
    np.zeros((len(eps_to_test), len(min_samples_to_test))),
    index=eps_to_test,
    columns=min_samples_to_test,
)

# Frame for the number of clusters found, with the same layout
results_clusters = pd.DataFrame(
    np.zeros((len(eps_to_test), len(min_samples_to_test))),
    index=eps_to_test,
    columns=min_samples_to_test,
)

In [None]:
# Run DBSCAN for every (eps, min_samples) pair and record both metrics;
# i counts the combinations tried and is passed to get_metrics
i = 0

for eps in eps_to_test:
    for min_samples in min_samples_to_test:
        i += 1
        # Compute the metrics
        noise_metric, cluster_metric  = get_metrics(eps, min_samples, scaled_dataframe, i)
        # Store the results in the corresponding dataframes
        results_noise.loc[eps, min_samples] = noise_metric
        results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Side-by-side heatmaps of the two grid-search metrics
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))

sns.heatmap(results_noise, annot=True, cbar=False, ax=ax1)
ax1.set_title("Mean Noise Points Distance")
sns.heatmap(results_clusters, annot=True, cbar=False, ax=ax2)
ax2.set_title("Number of clusters")

# rows are eps values, columns are min_samples values in both frames
for axis in (ax1, ax2):
    axis.set_xlabel("min_samples")
    axis.set_ylabel("eps")

plt.tight_layout()
plt.show()

In [None]:
# DBSCAN with the parameters selected for the min-max scaled data, fitted in place
best_dbscan = DBSCAN(eps=0.2, min_samples=28).fit(scaled_dataframe)

# Label of every record and the number of records carrying each label
labels = best_dbscan.labels_
np.unique(labels, return_counts=True)

In [None]:
# Attach the selected model's labels to a copy of the unscaled features
cleaned_dataframe = df_performances.assign(LABEL=best_dbscan.labels_)
# Pairwise scatter plots coloured by cluster label
sns.pairplot(data=cleaned_dataframe, hue="LABEL", palette="Accent")

# Hierarchical

In this section we perform hierarchical clustering with the agglomerative (bottom-up) technique, using four linkage methods (single, complete, average and Ward) to compute the distances between clusters. The Euclidean distance is used as the metric between points.

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import AgglomerativeClustering

In [None]:
def count_cluster_elements(data, threshold, criterion='distance'):
    """Cut a linkage matrix into flat clusters and tally their sizes.

    Parameters
    ----------
    data
        Linkage matrix, forwarded to ``fcluster``.
    threshold
        Threshold forwarded to ``fcluster``.
    criterion
        Criterion forwarded to ``fcluster`` (``'distance'`` by default).

    Returns
    -------
    tuple
        ``(count, clusters)``: a dict mapping each cluster label to the number
        of observations carrying it, and the label array from ``fcluster``.
    """
    clusters = fcluster(data, threshold, criterion)
    count = {}
    for label in clusters:
        count[label] = count.get(label, 0) + 1
    return count, clusters

In [None]:
# Silhouette score of each linkage method, keyed by method name
silhouette_scores = dict()

### Euclidean - single (MIN)

In [None]:
# Single-linkage hierarchy on the performance features
threshold_value = 2850
link = linkage(df_performances, method='single', metric='euclidean')

plt.figure(figsize=(10, 5))
plt.title("Euclidean - single")
plt.tick_params(labelbottom=False)
# horizontal line at the height used as the cut threshold
plt.axhline(y=threshold_value, color="black")
# only the last 30 merged clusters are drawn
dend = dendrogram(
    link,
    p=30,
    truncate_mode='lastp',
    show_contracted=True,
    leaf_rotation=60,
    leaf_font_size=8,
)

In [None]:
# Cut the tree at the threshold, score the flat clustering and report sizes
count, clusters = count_cluster_elements(link, threshold_value)
silhouette_scores['single'] = score = silhouette_score(df_performances, clusters)
print("Clusters:", len(count), end=" - ")
# cluster sizes, largest first
print(sorted(count.values(), reverse=True))
print(f"Silhouette score: {score}")

### Euclidean - complete (MAX)

In [None]:
# Complete-linkage hierarchy on the performance features
threshold_value = 20000
link = linkage(df_performances, method='complete', metric='euclidean')

plt.figure(figsize=(10, 5))
plt.title("Euclidean - complete")
plt.tick_params(labelbottom=False)
# horizontal line at the height used as the cut threshold
plt.axhline(y=threshold_value, color="black")
# only the last 30 merged clusters are drawn
dend = dendrogram(
    link,
    p=30,
    truncate_mode='lastp',
    show_contracted=True,
    leaf_rotation=60,
    leaf_font_size=8,
)

In [None]:
# Cut the tree at the threshold, score the flat clustering and report sizes
count, clusters = count_cluster_elements(link, threshold_value)
silhouette_scores['complete'] = score = silhouette_score(df_performances, clusters)
print("Clusters:", len(count), end=" - ")
# cluster sizes, largest first
print(sorted(count.values(), reverse=True))
print(f"Silhouette score: {score}")

### Euclidean - average (AVG)

In [None]:
# Average-linkage hierarchy on the performance features
threshold_value = 11000
link = linkage(df_performances, method='average', metric='euclidean')

plt.figure(figsize=(10, 5))
plt.title("Euclidean - average")
plt.tick_params(labelbottom=False)
# horizontal line at the height used as the cut threshold
plt.axhline(y=threshold_value, color="black")
# only the last 30 merged clusters are drawn
dend = dendrogram(
    link,
    p=30,
    truncate_mode='lastp',
    show_contracted=True,
    leaf_rotation=60,
    leaf_font_size=8,
)

In [None]:
# Cut the tree at the threshold, score the flat clustering and report sizes
count, clusters = count_cluster_elements(link, threshold_value)
silhouette_scores['average'] = score = silhouette_score(df_performances, clusters)
print("Clusters:", len(count), end=" - ")
# cluster sizes, largest first
print(sorted(count.values(), reverse=True))
print(f"Silhouette score: {score}")

### Euclidean - Ward

In [None]:
# Ward-linkage hierarchy on the performance features
plt.figure(figsize=(10, 5))
plt.tick_params(labelbottom=False)
# The title must be a str: a bytes literal is rendered through str() and
# shows up on the figure as b'Euclidean - Ward'.
plt.title("Euclidean - Ward")
threshold_value = 75000
# horizontal line at the height used as the cut threshold
plt.axhline(y=threshold_value, color="black")
link = linkage(df_performances, method='ward', metric = 'euclidean')
# only the last 30 merged clusters are drawn
dend = dendrogram(link, truncate_mode='lastp', p=30, leaf_rotation=60, leaf_font_size = 8, show_contracted=True)

In [None]:
# Cut the tree at the threshold, score the flat clustering and report sizes
count, clusters = count_cluster_elements(link, threshold_value)
silhouette_scores['ward'] = score = silhouette_score(df_performances, clusters)
print("Clusters:", len(count), end=" - ")
# cluster sizes, largest first
print(sorted(count.values(), reverse=True))
print(f"Silhouette score: {score}")

In [None]:
# Plain-text table of the silhouette score obtained by each linkage method
print('Method\t\t Score\n')
for method, method_score in silhouette_scores.items():
    print(method + '\t\t' + str(method_score))

# XMEANS

In [None]:
# X-Means starts from a single k-means++ centre and may grow up to max_n_clusters
max_n_clusters = 10
amount_initial_centers = 1

initial_centers = kmeans_plusplus_initializer(df_performances, amount_initial_centers).initialize()
xmeans_instance = xmeans(df_performances, initial_centers, kmax=max_n_clusters)
# run the clustering (splitting presumably driven by the Bayesian Information
# Criterion, as the original note states - confirm against pyclustering defaults)
xmeans_instance.process()

centers = xmeans_instance.get_centers()
clusters = xmeans_instance.get_clusters()

# size of every cluster found
print(list(map(len, clusters)))

In [None]:
# display allocated clusters
visualizer = cluster_visualizer_multidim();
visualizer.append_clusters(clusters, df_performances.values.tolist())
visualizer.append_cluster(centers, None, marker = '*', markersize=4)
#visualizer.show()
visualizer.show(pair_filter=[[0, 1], [0, 2], [0, 3]])

In [None]:
#for i in range(0, len(df_performances.columns), 1):
#    for j in range(0, len(df_performances.columns), 1):
#        if j<i:
#            visualizer.show(pair_filter=[[j, i]])

In [None]:
#plot with sns library

In [None]:
# Turn the list of per-cluster row positions into one label per row
labels = np.zeros(len(df_performances), dtype=int)
for cluster_id, members in enumerate(clusters):
    for row in members:
        labels[row] = cluster_id

In [None]:
# One distinct colour per cluster
palette_n = sns.color_palette("hls", len(clusters))
palette_n

In [None]:
# Attach the X-Means labels to a copy of the unscaled features
cleaned_dataframe = df_performances.assign(LABEL=labels)
# Pairwise scatter plots coloured by cluster label
sns.pairplot(data=cleaned_dataframe, hue="LABEL", palette=palette_n)

# TODO: another optional algorithm