In [None]:
# install sompy (use our forked version - the original one has visualization bugs)
#!pip install -U git+https://github.com/joaopfonseca/SOMPY.git

In [None]:
from IPython.display import YouTubeVideo

from os.path import join
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier

import sompy
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView

## Import preprocessed data

In [None]:
# Read the preprocessed dataset stored under ../data
data_path = join('..', 'data', 'tugas_preprocessed.csv')
df = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the loaded dataset
df.head()

In [None]:
# List every column name; the name prefixes are used below to group the features
df.columns

In [None]:
# Partition the column names into three groups according to their prefix
starts_with_x = df.columns.str.startswith('x')
starts_with_pc = df.columns.str.startswith('PC')

non_metric_features = df.columns[starts_with_x]
pc_features = df.columns[starts_with_pc]
# everything that carries neither prefix is treated as a metric feature
metric_features = df.columns[~(starts_with_x | starts_with_pc)]

## Self-organizing maps
What is a SOM? How does it work? What is it used for?

### How is it computed?

In [None]:
# Embed the YouTube video with id 'k7DK5fnJH94' in the cell output
YouTubeVideo('k7DK5fnJH94')

### Characteristics:
- Grid shape needs to be set a priori
- Results depend on the initialization
- Fitting a SOM can be computationally expensive
- Capable of finding the global optimum (in theory, provided the learning rate (LR) decays to 0)
- Visualization tool for high-dimensional data

### Additional analyses/tutorials
- [Air Flights](https://github.com/sevamoo/SOMPY/blob/master/sompy/examples/AirFlights_hexagonal_grid.ipynb)
- [Visualizations on toy datasets](https://gist.github.com/sevamoo/035c56e7428318dd3065013625f12a11)

### How to apply Self-Organizing Maps?

In [None]:
# sompy exposes no random-seed argument, so NumPy's global seed is fixed
# here before the (random) initialization takes place
np.random.seed(42)

# Small 10x10 map for a quick demonstration; (50, 50) is the larger alternative
som_builder = sompy.SOMFactory()
sm = som_builder.build(
    df[metric_features].values,
    lattice='hexa',
    training='batch',
    initialization='random',
    mapsize=(10, 10),
    component_names=metric_features,
)
sm.train(
    n_job=4,
    verbose='info',
    train_rough_len=100,
    train_finetune_len=100,
)

## Visualizing data with SOMs

### Component planes
What do they represent? What kinds of information do they contain?

Analyse these plots from the following perspectives:
- Feature importance
- Feature correlation
- Data distribution
- Outlier detection

In [None]:
# Visualizing the component planes (codebook values of each feature)
sns.set()
view2D = View2D(50, 50, "", text_size=10)
view2D.show(sm, col_sz=3, what='codebook')
# leave room at the top of the figure for the overall title
plt.subplots_adjust(top=0.90)
# Title corrected from "Component Plates" so it agrees with the section name
plt.suptitle("Component Planes", fontsize=20)
plt.show()

### U-matrix
Why is it useful?

In [None]:
# Create the U-matrix view
u = sompy.umatrix.UMatrixView(
    50,
    50,
    'umatrix',
    show_axis=True,
    text_size=8,
    show_text=True,
)

# Compute the U-matrix values and show them as the cell output
UMAT = u.build_u_matrix(sm, row_normalized=False, distance=1)
UMAT

In [None]:
# Here you have U-matrix plus its render
UMAT = u.show(
    sm, 
    distance2=1, 
    row_normalized=False, 
    show_data=True, 
    contooor=False, 
    blob=False
)

In [None]:
# Hits map of the trained SOM, annotated per node
# (presumably the number of observations mapped to each node - confirm in sompy)
vhts = BmuHitsView(12, 12, "Hits Map")
vhts.show(
    sm,
    cmap="Blues",
    labelsize=12,
    onlyzeros=False,
    anotate=True,
)
plt.show()

## Clustering SOMs - A hybrid approach

**TODO:** change this title to something better if you like; I could not come up with a better one.

**TODO:** please add a description here.

In [None]:
# sompy exposes no random-seed argument, so NumPy's global seed is fixed
# here before the (random) initialization takes place
np.random.seed(42)

# Larger 50x50 map. Note that this SOM did not converge - the number of
# training iterations is kept low because of the class time constraint
som_builder = sompy.SOMFactory()
sm = som_builder.build(
    df[metric_features].values,
    lattice='hexa',
    training='batch',
    initialization='random',
    mapsize=(50, 50),
    component_names=metric_features,
)
sm.train(
    n_job=4,
    verbose='info',
    train_rough_len=100,
    train_finetune_len=100,
)

In [None]:
# Inspect the node vectors of the trained SOM
sm.get_node_vectors()

In [None]:
# TODO: consider a plt.subplots figure with the clusters and the U-matrix side
# by side to ease interpretation. Give it a try, but do not spend much time on
# it: judging from sompy's code, doing this the usual matplotlib way would
# require changing a lot of their code.

u = sompy.umatrix.UMatrixView(
    50,
    50,
    'umatrix',
    show_axis=True,
    text_size=8,
    show_text=True,
)

umatrix_options = dict(
    distance2=1,
    row_normalized=False,
    show_data=False,
    contooor=False,  # Change this to True to visualize isomorphic curves
)
UMAT = u.show(sm, **umatrix_options)

In [None]:
# Group the SOM nodes into 4 clusters with k-means
kmeans = KMeans(
    n_clusters=4,
    random_state=42,
    n_init=20,
    init='k-means++',
)
nodeclus_labels = sm.cluster(kmeans)

# Draw the map with the resulting cluster assignment
hits = HitMapView(12, 12, "Clustering", text_size=10, cmap=plt.cm.jet)
hits.show(
    sm,
    cmap="Pastel1",
    labelsize=7,
    onlyzeros=False,
    anotate=True,
)

plt.show()

In [None]:
# Group the SOM nodes into 4 clusters with Ward hierarchical clustering.
# This overwrites the node labels produced by the k-means cell.
hierclust = AgglomerativeClustering(linkage='ward', n_clusters=4)
nodeclus_labels = sm.cluster(hierclust)

# Draw the map with the resulting cluster assignment
hits = HitMapView(12, 12, "Clustering", text_size=10, cmap=plt.cm.jet)
hits.show(
    sm,
    cmap="Pastel1",
    labelsize=7,
    onlyzeros=False,
    anotate=True,
)

plt.show()

### Final SOM Clustering solution

In [None]:
# Put the SOM node vectors in a table, one column per metric feature,
# together with the cluster label assigned to each node
nodes = sm.get_node_vectors()

df_nodes = pd.DataFrame(nodes, columns=metric_features).assign(label=nodeclus_labels)
df_nodes

In [None]:
# Final cluster solution: propagate the node clusters to the observations.
# A 1-nearest-neighbour classifier fitted on the SOM nodes gives every
# observation the cluster label of its closest node.
knn = KNeighborsClassifier(n_neighbors=1)

# fit using SOM nodes and cluster labels
knn.fit(nodes, nodeclus_labels)

# Predict on the plain array (.values): the classifier was fitted on an array
# without feature names, so passing a DataFrame makes scikit-learn warn about
# mismatching feature names. This also mirrors how the SOM received its data.
final_labels = knn.predict(df[metric_features].values)
final_labels

In [None]:
# Attach the final cluster label to every observation and profile each
# cluster through the mean of its columns
cluster_labels = pd.Series(final_labels, name='labels')
df_concat = pd.concat([df, cluster_labels], axis=1)
df_concat.groupby('labels').mean()

## Analysing the appropriateness of our solution

**TODO:** change this in whatever way you find most appropriate; this was the first thing that came to mind.

In [None]:
# using R^2
def get_ss(df):
    """Return the sum of squares of `df`, added up over all of its columns.

    For each column the sum of squared deviations from the mean is recovered
    as variance * (count - 1); the per-column values are then summed.
    """
    degrees_of_freedom = df.count() - 1
    ss_per_column = df.var() * degrees_of_freedom
    return np.sum(ss_per_column)

# Evaluate R^2 on the metric features only - the features the SOM and the
# clusters were built from. Using every column of df would also count the
# 'x*' and 'PC*' columns, which played no part in the clustering.
cluster_features = list(metric_features)

sst = get_ss(df[cluster_features])  # get total sum of squares
# compute ssw for each cluster label; selecting the columns after the groupby
# keeps the 'labels' key out of the frames handed to get_ss
ssw_labels = df_concat.groupby(by='labels')[cluster_features].apply(get_ss)
ssb = sst - np.sum(ssw_labels)  # remember: SST = SSW + SSB
r2 = ssb / sst
r2