In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing, metrics
from sklearn.cluster import KMeans, AgglomerativeClustering

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the dataset CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a path relative to the notebook.
csv_path = "/home/felipe/Documents/uf/IA/prova/epl/epl_1819.csv"
train_df = pd.read_csv(csv_path, encoding='utf-8')

# Per-column count of missing values (displayed as the cell's output).
train_df.isna().sum()

Team                             0
category                         0
general_league_position          0
finance _live_games_televised    0
finance _tv_revenue              0
general_matches_played           0
general_won                      0
general_draw                     0
general_lost                     0
attack_scored                    0
defence_goals_conceeded          0
general_goal_difference          0
general_points                   0
general_squad_size               0
general_squad_average_age        0
general_squad_foreigners         0
finance _team_market             0
finance _market_average          0
attack_passes                    0
attack_passes_through            0
attack_passes_long               0
attack_passes_back               0
attack_crosses                   0
attack_corners_taken             0
attack_shots                     0
attack_shots_on_target           0
attack_goals_headed              0
attack_goals_penalty             0
attack_goals_box    

## Limpeza dos dados

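A base não possui células vazias, porém duas colunas possuem valores categóricos; elas são removidas na célula abaixo, mantendo-se apenas as colunas numéricas, já que a silhueta e os agrupamentos são calculados com distância euclidiana.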

In [3]:
# Keep only the numeric columns so the clustering estimators and the
# euclidean silhouette score below receive purely numeric input.
# Uses the public select_dtypes API rather than the private
# DataFrame._get_numeric_data helper, which is not part of pandas' documented
# interface.
# NOTE(review): "number" does not select boolean columns; this assumes the
# frame has none that should be kept -- confirm against the CSV.
train_df = train_df.select_dtypes(include="number")

### Normalizar dados

In [4]:
# Build the normalized copy of the data used by the "NORMALIZED DATA" runs.
# NOTE(review): per the scikit-learn docs, Normalizer rescales each *row*
# (sample) to unit norm; it does not put the *columns* on a common scale.
# Confirm this is the intended normalization (vs. StandardScaler/MinMaxScaler).
row_normalizer = preprocessing.Normalizer()
norm_df = row_normalizer.fit_transform(train_df)

### K-Means

In [5]:
# Hyper-parameter grids for the K-Means experiment.
# k_values is also read by the hierarchical-clustering cell further down.
iters = [1, 10, 100]
k_values = [2, 5, 10]

# (header, data) pairs: the same sweep runs on the numeric frame and on the
# normalized array, so it is written once instead of being copy-pasted.
# The header strings are printed verbatim.
kmeans_datasets = [
    ("----------- BRUTE DATA -----------", train_df),
    ("\n----------- NORMALIZED DATA -----------", norm_df),
]

for header, data in kmeans_datasets:
    print(header)

    for k in k_values:
        print("K =", k)
        for i in iters:
            # n_init is pinned explicitly: its default changed across
            # scikit-learn releases and warnings are silenced in this notebook,
            # so an implicit default could change the scores without notice.
            # NOTE(review): 10 is the historical default -- confirm it matches
            # the version that produced the recorded output.
            kmeans_model = KMeans(n_clusters=k, max_iter=i, n_init=10, random_state=0)

            # fit_predict returns the same assignments stored in labels_.
            labels = kmeans_model.fit_predict(data)
            print("silhouette with max_iter = {i} :".format(i=i), metrics.silhouette_score(data, labels, metric='euclidean'))

        print("--------------------------------------------------------------------------------")

----------- BRUTE DATA -----------
K = 2
silhouette with max_iter = 1 : 0.7775605395450415
silhouette with max_iter = 10 : 0.7775605395450415
silhouette with max_iter = 100 : 0.7775605395450415
--------------------------------------------------------------------------------
K = 5
silhouette with max_iter = 1 : 0.5068576391996034
silhouette with max_iter = 10 : 0.5068576391996034
silhouette with max_iter = 100 : 0.5068576391996034
--------------------------------------------------------------------------------
K = 10
silhouette with max_iter = 1 : 0.4795901498429158
silhouette with max_iter = 10 : 0.4795901498429158
silhouette with max_iter = 100 : 0.4795901498429158
--------------------------------------------------------------------------------

----------- NORMALIZED DATA -----------
K = 2
silhouette with max_iter = 1 : 0.6260318291234148
silhouette with max_iter = 10 : 0.6260318291234148
silhouette with max_iter = 100 : 0.6260318291234148
--------------------------------------------

### Hierárquico (Agglomerative Clustering)

In [6]:
# Linkage criteria compared for agglomerative (hierarchical) clustering.
linkages = ['ward', 'complete', 'average', 'single']

# Run the same k x linkage sweep on the numeric frame, then on the normalized
# array. Reads k_values from the K-Means cell. Banner strings are printed
# verbatim.
for banner, samples in (
    ("----------- BRUTE DATA -----------", train_df),
    ("\n----------- NORMALIZED DATA -----------", norm_df),
):
    print(banner)

    for k in k_values:
        print("K =", k)
        for link in linkages:
            ac_model = AgglomerativeClustering(n_clusters=k, linkage=link)
            # fit_predict returns the same assignments stored in labels_.
            ac_labels = ac_model.fit_predict(samples)
            print("silhouette with linkage = {i} :".format(i=link), metrics.silhouette_score(samples, ac_labels, metric='euclidean'))
        print("--------------------------------------------------------------------------------\n")

----------- BRUTE DATA -----------
K = 2
silhouette with linkage = ward : 0.7775605395450415
silhouette with linkage = complete : 0.7488503743292658
silhouette with linkage = average : 0.7775605395450415
silhouette with linkage = single : 0.7775605395450415
--------------------------------------------------------------------------------

K = 5
silhouette with linkage = ward : 0.5068576391996034
silhouette with linkage = complete : 0.5068576391996034
silhouette with linkage = average : 0.5068576391996034
silhouette with linkage = single : 0.4322618292576014
--------------------------------------------------------------------------------

K = 10
silhouette with linkage = ward : 0.4795901498429158
silhouette with linkage = complete : 0.4795901498429158
silhouette with linkage = average : 0.4795901498429158
silhouette with linkage = single : 0.3194578189454153
--------------------------------------------------------------------------------


----------- NORMALIZED DATA -----------
K = 2
si

## Conclusão

#### Comparação

Dado que a métrica de silhueta descreve o quanto os membros do _cluster_ são semelhantes, foi possível perceber que os dois algoritmos agruparam melhor as classes quando foram considerados 2 _clusters_ como saída, com ambos tendo pontuação máxima de aproximadamente 0.78 (0.7776).

Ao variar o número de _clusters_ (K), nota-se que 2 é o melhor valor para K, baseado na silhueta. 

Variar o valor máximo de iterações no K-Means entre 1, 10 e 100 não alterou a pontuação de silhueta.

Variando o _linkage_ no algoritmo Hierárquico nota-se que a pontuação de silhueta varia de forma que para cada valor de K há tipos de _linkage_ que conseguem agrupar melhor os dados.

#### Normalizando

Normalizando os dados, a pontuação de silhueta diminuiu para K = 2 e K = 10, porém aumentou para K = 5. 