# Plotting visualizations and prototypes that seem to make sense

In [None]:
import numpy as np
from matplotlib import rc
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import sys
import torch

sys.path.append("./")
sys.path.append("../../")

from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering

from clustering.cluster import cluster_dataset, cluster_similarity_matrix, create_prototypes
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from revenue.data import RevenueDataset

from representations.representations import calculate_pca, calculate_sarima, calculate_tsne, calculate_embedded_id, calculate_representation
from similarities.similarities import calculate_similarity_matrix
from clustering.cluster import create_prototypes

In [11]:
# Switch Matplotlib's "text.usetex" setting on for every figure drawn below.
plt.rcParams["text.usetex"] = True
def _axes_style(frame_visible):
    """Build the rcParams overrides shared by both plot styles.

    The right/top spines and the grid are always off; ``frame_visible``
    toggles the left/bottom spines together with their tick marks.
    """
    return {
        "axes.spines.left": frame_visible,
        "axes.spines.right": False,
        "axes.spines.bottom": frame_visible,
        "axes.spines.top": False,
        "axes.grid": False,
        "xtick.bottom": frame_visible,
        "ytick.left": frame_visible,
    }


# Style that keeps the left/bottom axes and ticks.
ts_style = _axes_style(True)
# Style with every spine and tick hidden (used via plt.style.context).
cluster_style = _axes_style(False)

In [12]:
# Both variants are read from the same files over the same date window;
# only the `data_scale` flag differs between them.
_revenue_kwargs = {
    "file_path": "revenue/data/processed_companies.csv",
    "meta_path": "revenue/data/comp_sect_meta.csv",
    "start_date": "2007-01-01",
    "end_date": "2017-01-01",
}
dataset_scaled = RevenueDataset(data_scale=True, **_revenue_kwargs)
dataset_unscaled = RevenueDataset(data_scale=False, **_revenue_kwargs)

# Keep only the `X` data of each dataset as a plain NumPy array
# (squeeze -> detach -> numpy; presumably `X` is a torch tensor — confirm).
dataset_scaled, dataset_unscaled = (
    dataset_scaled.X.squeeze().detach().numpy(),
    dataset_unscaled.X.squeeze().detach().numpy(),
)

# Last expression of the cell: shows both array shapes as the cell output.
dataset_scaled.shape, dataset_unscaled.shape

Dimension of X :  torch.Size([15229, 1, 40])
Dimension of Y :  torch.Size([15229, 1, 40])
Dimension of X :  torch.Size([15229, 1, 40])
Dimension of Y :  torch.Size([15229, 1, 40])


(array([-0.77477473, -0.77477473, -0.77477473, -0.77477473, -0.77477473,
        -0.77477473, -0.77477473, -0.77477473, -0.77477473, -0.77477473,
        -0.77477473, -0.77477473, -0.77477473, -0.77477473, -0.77477473,
        -0.77477473, -0.77477473, -0.77477473, -0.77477473, -0.77477473,
        -0.51213926, -0.51213926, -0.51213926,  0.01313177,  0.01313177,
         0.01313177,  0.01313177,  0.2757673 ,  0.8010383 ,  0.5384028 ,
         0.8010383 ,  1.0636739 ,  1.0636739 ,  2.1142159 ,  1.3263093 ,
         1.8515804 ,  1.0636739 ,  1.8515804 ,  1.8515804 ,  2.3768513 ],
       dtype=float32),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  3.,  3.,  3.,
         3.,  4.,  6.,  5.,  6.,  7.,  7., 11.,  8., 10.,  7., 10., 10.,
        12.], dtype=float32),
 (15229, 40),
 (15229, 40))

In [13]:
# Experiment configuration read by the clustering cell below.

# Representation: forwarded to calculate_representation();
# `scaled_representation` picks dataset_scaled over dataset_unscaled.
representation = "raw"
scaled_representation = True
num_components = 10

# Metric handed to calculate_similarity_matrix().
similarity = "correlation"

# Which estimator to build and how many clusters it is asked for.
algorithm = "KMeans"
num_clusters = 100

In [14]:
# --- Clustering method -------------------------------------------------------
# `dist_or_sim_or_feat` records what the chosen estimator is fitted on:
#   "feat" -> the representation itself,
#   "dist" / "sim" -> a matrix from calculate_similarity_matrix() requested
#                     with that same `dist_or_sim` value.
#
# The algorithm name is compared with `==`. The previous
# `algorithm in ("Agglomerative")` was a substring test on a plain string
# (the parentheses do not make a tuple), so e.g. "" or "Spectral" silently
# selected a branch.
if algorithm == "KMeans":
    method = KMeans(n_clusters=num_clusters)
    dist_or_sim_or_feat = "feat"
elif algorithm == "Agglomerative":
    # NOTE(review): newer scikit-learn releases rename `affinity` to `metric`
    # for AgglomerativeClustering — confirm against the installed version.
    method = AgglomerativeClustering(
        n_clusters=num_clusters, affinity="precomputed", linkage="average"
    )
    dist_or_sim_or_feat = "dist"
elif algorithm == "Spectral clustering":
    method = SpectralClustering(
        n_clusters=num_clusters, affinity="precomputed"
    )
    dist_or_sim_or_feat = "sim"
else:
    # Fail here with a clear message instead of a NameError on `method` below.
    raise ValueError(
        f"Unknown clustering algorithm {algorithm!r}; expected 'KMeans', "
        "'Agglomerative' or 'Spectral clustering'."
    )

# --- Representation ----------------------------------------------------------
# Only the input array depends on `scaled_representation`; every other
# argument is shared, so the call is written once.
# NOTE(review): dataset="electricity" is passed although the arrays come from
# RevenueDataset — confirm this is the intended value.
rep_input = dataset_scaled if scaled_representation else dataset_unscaled
rep = calculate_representation(
    rep_input,
    representation=representation,
    num_components=num_components,
    dataset="electricity",
)

# --- Similarity --------------------------------------------------------------
# D is what the estimator is fitted on; S is the matrix kept for later use
# (the plotting cell thresholds exp(-S)).
if dist_or_sim_or_feat == "feat":
    D = rep
    S = calculate_similarity_matrix(rep, metric=similarity, dist_or_sim="dist")
else:
    D = calculate_similarity_matrix(rep, metric=similarity, dist_or_sim=dist_or_sim_or_feat)
    S = D

# --- Clustering --------------------------------------------------------------
clusters = method.fit_predict(D)

# Cluster sizes keyed by the labels that actually occur (sorted ascending).
# Counting via np.unique does not assume the labels are exactly 0..k-1.
labels, counts = np.unique(clusters, return_counts=True)
cluster_dist = {int(label): int(count) for label, count in zip(labels, counts)}
for k, v in cluster_dist.items():
    print(f"{k:2} : {v}")

# Sample index -> assigned cluster label.
cluster_dict = dict(enumerate(clusters))

ia 127624.23
start iteration
done sorting
end inner loop
Iteration 10, inertia 127468.48
start iteration
done sorting
end inner loop
Iteration 11, inertia 127369.945
start iteration
done sorting
end inner loop
Iteration 12, inertia 127268.4
start iteration
done sorting
end inner loop
Iteration 13, inertia 127199.25
start iteration
done sorting
end inner loop
Iteration 14, inertia 127153.32
start iteration
done sorting
end inner loop
Iteration 15, inertia 127119.06
start iteration
done sorting
end inner loop
Iteration 16, inertia 127089.03
start iteration
done sorting
end inner loop
Iteration 17, inertia 127065.03
start iteration
done sorting
end inner loop
Iteration 18, inertia 127032.42
start iteration
done sorting
end inner loop
Iteration 19, inertia 127001.375
start iteration
done sorting
end inner loop
Iteration 20, inertia 126975.51
start iteration
done sorting
end inner loop
Iteration 21, inertia 126953.62
start iteration
done sorting
end inner loop
Iteration 22, inertia 126936.5

In [18]:
# Project both versions of the dataset onto 10 principal components; the
# t-SNE cells below consume these arrays.
# The second name used to be misspelled `X_unsscaled_pca`, which left
# `X_unscaled_pca` (the name the t-SNE cells read) undefined.
X_scaled_pca = PCA(n_components=10).fit_transform(dataset_scaled)
X_unscaled_pca = PCA(n_components=10).fit_transform(dataset_unscaled)

In [None]:
# 2-component t-SNE of the PCA-reduced scaled data (verbose=1).
tsne_scaled = TSNE(n_components=2, verbose=1)
emb_scaled = tsne_scaled.fit_transform(X_scaled_pca)

In [None]:
# 2-component t-SNE of the PCA-reduced unscaled data.
tsne_unscaled = TSNE(n_components=2)
emb_unscaled = tsne_unscaled.fit_transform(X_unscaled_pca)

In [16]:
# Same embeddings through the project's calculate_tsne() helper; both calls
# share their settings.
tsne_settings = {"num_components": 2, "verbose": 1}
emb_scaled = calculate_tsne(X_scaled_pca, **tsne_settings)
emb_unscaled = calculate_tsne(X_unscaled_pca, **tsne_settings)

MemoryError: Unable to allocate 885. MiB for an array with shape (115953606,) and data type float64

In [17]:
with plt.style.context(cluster_style):
    # One colour value per sample: the cluster label stored for each index.
    colors = list(cluster_dict.values())

    # Square figure whose single axes fills the whole canvas.
    fig = plt.figure(figsize=(4.77/2, 4.77/2))
    ax = fig.add_axes([0., 0., 1., 1.])
    ax.scatter(
        emb_scaled[:, 0],
        emb_scaled[:, 1],
        c=colors,
        cmap="tab10",
        edgecolors="black",
        s=15,
        linewidth=0.3,
    )
    ax.set_xticks([])
    ax.set_yticks([])

    # Connect every pair (i < j) with exp(-S[i, j]) above 0.92. Only the
    # strict upper triangle is kept, so each pair is considered once.
    strong_pairs = (np.abs(np.triu(np.exp(-S), k=1)) > 0.92)
    first_idx, second_idx = np.nonzero(strong_pairs)

    # One line segment per selected pair, drawn beneath the points (zorder=0).
    segments = [
        [emb_scaled[i, :], emb_scaled[j, :]]
        for i, j in zip(first_idx, second_idx)
    ]
    lc = LineCollection(segments, zorder=0)
    ax.add_collection(lc)

    fig.tight_layout()
    #plt.savefig(f"Figures/revenue_proposed_clustering_edges.pdf", bbox_inches="tight")
    #plt.savefig(f"Figures/revenue_proposed_clustering_.pdf", bbox_inches="tight")
    plt.show()

NameError: name 'emb_scaled' is not defined