In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils import *
from distance import jaccard
from ksets import *
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors
from plotly.subplots import make_subplots
from sklearn.manifold import TSNE, MDS
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

In [3]:
# Load the dataset file and unpack the three values the loader returns.
# Only `votes` is consumed by the cells below.
# NOTE(review): presumably `votes` is a voters x projects one-hot matrix
# ("ohe" in the loader name) - confirm against utils.load_pb_ohe.
meta, projects, votes = load_pb_ohe('data/poland_warszawa_2023_srodmiescie.pb')

In [4]:
# Distances of `votes` against itself; the result is later passed to MDS as a
# precomputed dissimilarity matrix.
# NOTE(review): presumably a square pairwise Jaccard distance matrix with one
# row/column per row of `votes` - confirm against distance.jaccard.
jaccard_distances = jaccard(votes, votes)

In [5]:
# 2-D t-SNE embedding of the raw vote matrix.
# t-SNE is stochastic: pin random_state so that Restart & Run All reproduces
# the same layout (and therefore the same saved figure) on every run.
embeded_tsne = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate='auto',
    init='pca',
    random_state=0,
).fit_transform(votes)

In [None]:
# 2-D metric MDS on the L2-normalised vote rows (Euclidean dissimilarities are
# computed by MDS itself).
# MDS starts from a random configuration: pin random_state so the expensive
# embedding is reproducible across kernel restarts.
embeded_mds_euclid = MDS(
    n_components=2,
    normalized_stress='auto',
    n_jobs=-1,
    random_state=0,
).fit_transform(normalize(votes, norm='l2'))
# this takes 5-7 minutes on my machine

In [None]:
# 2-D metric MDS driven by the precomputed `jaccard_distances` matrix instead
# of distances derived from the feature vectors.
# MDS starts from a random configuration: pin random_state so the expensive
# embedding is reproducible across kernel restarts.
embeded_mds_jaccard = MDS(
    n_components=2,
    normalized_stress='auto',
    n_jobs=-1,
    dissimilarity='precomputed',
    random_state=0,
).fit_transform(jaccard_distances)
# this takes 5-7 minutes on my machine

In [None]:
# 2-D PCA projection of the raw vote matrix.
# With the default svd_solver='auto' scikit-learn may pick the randomized
# solver for large inputs; pinning random_state keeps the projection
# deterministic in that case and is a no-op for the exact solvers.
embeded_pca = PCA(n_components=2, random_state=0).fit_transform(votes)

In [None]:
# K-means labels (8 clusters, best of 10 initialisations) computed on the
# L2-normalised vote rows.
# Centroid initialisation is random: pin random_state so the labels - and the
# colours they map to in the plots - are stable between runs.
clusters_kmeans = KMeans(
    n_clusters=8,
    n_init=10,
    random_state=0,
).fit(normalize(votes, norm='l2')).labels_

In [None]:
# Cluster `votes` with the project's kswaps routine and keep element [1] of
# its result; that value is used below as the cluster ids colouring the
# "KSets" column of the figure.
# NOTE(review): presumably 8 is the number of clusters (it matches the KMeans
# cell) and 100 is an iteration/swap budget - confirm against ksets.kswaps.
# NOTE(review): if kswaps is randomised it should be seeded for
# reproducibility; its signature is not visible from this notebook.
clusters_ksets = kswaps(8, votes, 100)[1]

In [None]:
def make_graph_object(arr: np.ndarray, cluster_ids):
    """Build a scatter trace of 2-D points coloured by cluster id.

    Parameters
    ----------
    arr : np.ndarray
        Point coordinates; column 0 is used as x and column 1 as y.
    cluster_ids : iterable of int
        One integer cluster id per point, used to pick the marker colour.

    Returns
    -------
    go.Scatter
        A markers-only trace with hover information disabled.
    """
    palette = plotly.colors.qualitative.Plotly
    # The qualitative palette is finite. Wrap ids around its length so that a
    # clustering with more clusters than colours reuses colours instead of
    # raising KeyError, as the previous dict lookup did.
    marker_colors = [palette[int(cid) % len(palette)] for cid in cluster_ids]
    return go.Scatter(
        x=arr[:, 0],
        y=arr[:, 1],
        mode='markers',
        marker=dict(color=marker_colors),
        hoverinfo='none',
    )

In [None]:
def visualise(embeddings, idss, **kwargs):
    """Plot one or more 2-D embeddings coloured by one or more clusterings.

    Parameters
    ----------
    embeddings : np.ndarray or list of np.ndarray
        A single embedding, or a list of embeddings (one subplot row each).
    idss : sequence of int or list of such sequences
        A single set of cluster ids, or a list of them (one subplot column
        each).
    **kwargs
        Forwarded to ``make_subplots`` (e.g. ``row_titles``,
        ``column_titles``). ``shared_yaxes`` defaults to ``True`` but may be
        overridden here. Ignored when a single, non-list embedding and a
        single, non-list set of ids are given.

    Returns
    -------
    go.Figure
        A single-trace figure when neither argument is a list; otherwise a
        ``len(embeddings)`` x ``len(idss)`` grid of scatter plots with the
        legend hidden.
    """
    # Neither argument is a list: a plain figure is enough, no subplot grid.
    if not isinstance(embeddings, list) and not isinstance(idss, list):
        return go.Figure(data=make_graph_object(embeddings, idss))
    # Promote a lone embedding / id set so the grid loop can treat both alike.
    if not isinstance(embeddings, list):
        embeddings = [embeddings]
    if not isinstance(idss, list):
        idss = [idss]
    # Keep the shared y-axis default, but let callers override it instead of
    # hitting "got multiple values for keyword argument 'shared_yaxes'".
    kwargs.setdefault('shared_yaxes', True)
    fig = make_subplots(rows=len(embeddings), cols=len(idss), **kwargs)
    # Subplot rows/cols are 1-based in plotly.
    for row, embedding in enumerate(embeddings, start=1):
        for col, ids in enumerate(idss, start=1):
            # add_trace(row=, col=) replaces the deprecated append_trace.
            fig.add_trace(make_graph_object(embedding, ids), row=row, col=col)
    fig.update_layout(showlegend=False)
    return fig


In [None]:
# Grid of scatter plots: one row per embedding method, one column per
# clustering method.
fig = visualise(
    embeddings=[embeded_tsne, embeded_mds_euclid, embeded_mds_jaccard, embeded_pca],
    idss=[clusters_kmeans, clusters_ksets],
    row_titles=["TSNE", "MDS Euclid", "MDS Jaccard", "PCA"],
    column_titles=["Euclid KMeans", "KSets"],
)
# Static export: disable hover interaction and fix the canvas size before
# writing the image to disk.
fig.update_layout(hovermode=False, height=2400, width=1600)
fig.write_image("clusters.png")