# Visualizing embeddings from the shapes dataset

We want to visualize the embeddings learned on the shapes dataset and see how they relate to each time series' shape, period, and noise type.

In [1]:
# Path prefix of the trained model to visualize. The saved arguments and the
# dataset description are loaded from files derived from this prefix.
# Alternative runs are kept below (commented out) with a note on how their
# embedding plots turned out; uncomment exactly one MODEL_PATH line.

# Produces a very nice plot.
#MODEL_PATH = "shapes/models/test_viz_res_long"

# "pre" embeddings: also a nice plot.
#MODEL_PATH = "shapes/models/test_pre"

# Nice plot.
#MODEL_PATH = "shapes/models/test_viz_res_long_200_50ep"

# Very nice plot.
#MODEL_PATH = "shapes/models/test_viz_res_long_200"

# 10-dimensional embedding space (reduced to 2-D with PCA further down).
MODEL_PATH = "shapes/models/test_dim_10"

In [2]:
from shapes.model import TCN
from shapes.data import ShapeDataset
import torch
import pandas as pd
from torch.utils.data import DataLoader
import pickle
import numpy as np

In [3]:
# Load the training arguments and the per-series metadata saved next to the model.
# NOTE(review): this join yields "<MODEL_PATH>__args.pkl" (double underscore);
# presumably that matches how the training script names the file -- confirm.
args_path = "_".join([MODEL_PATH, "_args.pkl"])
df_path = "_".join([MODEL_PATH, "df.csv"])

# SECURITY: pickle.load executes arbitrary code from the file; only open
# argument files produced by your own training runs.
# The context manager closes the file even if unpickling raises.
with open(args_path, "rb") as f:
    args = pickle.load(f)

df = pd.read_csv(df_path)
df.columns = ["ts_id", "shape", "noise", "period"]
# Represent missing entries (e.g. series without noise) as None instead of NaN.
df = df.replace({np.nan: None})
df.head(), args, len(df)

(   ts_id     shape   noise  period
 0      0    square    None      10
 1      1    square    None      10
 2      2    square  matern       2
 3      3  triangle     iid      10
 4      4  triangle     iid      20,
 Namespace(bias=True, clip=False, dropout=0.2, embed='post', embedding_dim=10, epochs=100, h_batch_size=32, kernel_size=7, length_rolling=50, leveledinit=False, log_interval=20, lr=0.0005, mean=10.0, model_save_path='shapes\\models\\test_dim_10', num_layers=5, num_rolling_periods=1, num_workers=0, one_hot_id=False, print=True, res_block_size=32, stride=1, tenacity=100, time_covariates=False, train_end='2014-12-16', train_start='2012-01-01', type_res_blocks='erik', v_batch_size=32, var=1.0, writer_path='shapes\\runs\\test_dim_10'),
 50)

In [4]:
# Rebuild the network with the hyperparameters it was trained with, then
# restore the trained weights on the CPU.
channel_sizes = [args.res_block_size] * args.num_layers  # one entry per layer
model = TCN(
    num_layers=args.num_layers,
    in_channels=1,
    out_channels=1,
    residual_blocks_channel_size=channel_sizes,
    kernel_size=args.kernel_size,
    num_embeddings=len(df),  # one embedding per row of the metadata frame
    embedding_dim=args.embedding_dim,
    embed=args.embed,
)
state_dict = torch.load(MODEL_PATH, map_location="cpu")
model.load_state_dict(state_dict)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()


TCN(
  (tcn): TemporalConvolutionalNetwork(
    (net): Sequential(
      (0): ResidualBlock(
        (dcc1): DilatedCausalConv(1, 32, kernel_size=(7,), stride=(1,))
        (drop1): Dropout(p=0.2, inplace=False)
        (dcc2): DilatedCausalConv(32, 32, kernel_size=(7,), stride=(1,))
        (drop2): Dropout(p=0.2, inplace=False)
        (res_conv): Conv1d(1, 32, kernel_size=(1,), stride=(1,))
      )
      (1): ResidualBlock(
        (dcc1): DilatedCausalConv(32, 32, kernel_size=(7,), stride=(1,), dilation=(2,))
        (drop1): Dropout(p=0.2, inplace=False)
        (dcc2): DilatedCausalConv(32, 32, kernel_size=(7,), stride=(1,), dilation=(2,))
        (drop2): Dropout(p=0.2, inplace=False)
      )
      (2): ResidualBlock(
        (dcc1): DilatedCausalConv(32, 32, kernel_size=(7,), stride=(1,), dilation=(4,))
        (drop1): Dropout(p=0.2, inplace=False)
        (dcc2): DilatedCausalConv(32, 32, kernel_size=(7,), stride=(1,), dilation=(4,))
        (drop2): Dropout(p=0.2, inplace=Fa

In [5]:

# Build the dataset described by `df` and iterate it one series at a time,
# in order, so every batch corresponds to exactly one row.
# NOTE(review): the receptive-field expression uses 2**(num_layers-1); confirm
# it matches the dilation schedule of the TCN implementation.
receptive_field = 1 + 2 * (args.kernel_size - 1) * 2 ** (args.num_layers - 1)

# The saved argument Namespace may not contain `N` (the one displayed for this
# model does not), so `args.N` would raise AttributeError. Fall back to 50 when
# the attribute is missing or falsy.
n_series = getattr(args, "N", None) or 50

dataset = ShapeDataset(
    df=df,
    receptive_field=receptive_field,
    h_batch=0,
    N=n_series,
)
dataloader = DataLoader(dataset=dataset, batch_size=1, num_workers=0, shuffle=False)

Dimension of X :  torch.Size([50, 1, 100])
Dimension of Y :  torch.Size([50, 1, 100])


We obtain the embeddings by passing each series' row index through the model's embedding layer.

In [6]:
# Collect the learned embedding vector of every series, keyed by its row index.
embeddings = {}
for batch in dataloader:
    # Only the fourth batch element (the row index) is needed here. The other
    # elements are deliberately not unpacked into x / y, because those names
    # are used later for the plot coordinates.
    idx_row = batch[3]
    # batch_size is 1, so idx_row holds a single index and .item() is valid.
    embeddings[idx_row.item()] = model.embedding(idx_row).detach().numpy().flatten()

In [7]:
# Dimensionality of one (flattened) embedding vector.
len(embeddings[1])

10

In [19]:
# Reduce the embeddings to 2-D with PCA when the embedding space has more than
# two dimensions, so they can be drawn in a scatter plot.
# After this cell `embeddings` maps the same keys to 2-D points; running the
# cell again is then a no-op because the dimension check fails.
emb_keys = list(embeddings.keys())

if emb_keys and len(embeddings[emb_keys[0]]) > 2:
    # Local import: scikit-learn is only needed on this branch.
    from sklearn.decomposition import PCA

    pca = PCA(n_components=2)
    emb_mat = np.array([embeddings[key] for key in emb_keys])
    emb_pca = pca.fit_transform(emb_mat)

    # Re-attach the ORIGINAL keys to the projected rows. Keying by position
    # (0..n-1) would silently mismatch rows in the later merge on ts_id
    # whenever the keys are not exactly 0..n-1 in iteration order.
    embeddings = dict(zip(emb_keys, emb_pca))
    

In [20]:
# Share of the total variance captured by each of the two principal components.
# Requires `pca`, which only exists if the PCA reduction above actually ran.
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)

[0.26188716 0.13722205]


In [9]:
# Split the 2-D points into parallel lists: keys, first and second coordinate.
embs = list(embeddings.keys())
x = [embeddings[ts_id][0] for ts_id in embs]
y = [embeddings[ts_id][1] for ts_id in embs]

In [10]:
# Attach the 2-D coordinates to the metadata frame, matching rows on ts_id.
dfc = pd.DataFrame({"ts_id": embs, "x": x, "y": y})
# Drop x/y left over from a previous run of this cell first; otherwise the
# merge would create suffixed x_x / x_y / y_x / y_y columns and the plotting
# cells, which expect plain "x" and "y", would fail.
df = pd.merge(df.drop(columns=["x", "y"], errors="ignore"), dfc, on="ts_id")

In [11]:
# Build a separate frame for plotting in which missing values (None) become the
# literal string "None", so the "no noise" case is an ordinary category value.
# `df` itself keeps its None entries.
# NOTE(review): relies on pandas accepting None as a replace key -- confirm on
# the pandas version in use.
df_plot = df.replace({None: "None"})
df_plot.head()

Unnamed: 0,ts_id,shape,noise,period,x,y
0,0,square,,10,-0.298825,0.397975
1,1,square,,10,-0.26009,0.294993
2,2,square,matern,2,0.123725,-0.351231
3,3,triangle,iid,10,0.753907,-0.183832
4,4,triangle,iid,20,-0.318734,0.073934


In [12]:
import plotly.express as px

# Quick overview plot: marker symbol encodes the shape, colour the noise type
# and marker size the period.
fig = px.scatter(df_plot, x="x", y="y", color="noise", symbol="shape", size="period")
fig.show()

In [13]:
# Lookup tables that translate dataset attributes into plotly marker properties.
# Shape name -> marker symbol.
shape_to_symbol = {
    "square": "square",
    "triangle": "triangle-up",
    "sine": "circle",
}
# Noise type -> marker colour ("None" is the label for series without noise).
noise_to_color = {
    "iid": "red",
    "matern": "blue",
    "None": "green",
}
# Combined string mapping, in the same key order as the two tables above.
replace_dict_str = {**shape_to_symbol, **noise_to_color}
# Period -> marker size.
replace_dict_int = dict([(2, 10), (10, 20), (20, 30)])

In [14]:
# Reverse lookups: marker property -> original dataset attribute.
inv_replace_str = dict(zip(replace_dict_str.values(), replace_dict_str.keys()))
inv_replace_int = dict(zip(replace_dict_int.values(), replace_dict_int.keys()))
inv_replace_str, inv_replace_int

({'square': 'square',
  'triangle-up': 'triangle',
  'circle': 'sine',
  'red': 'iid',
  'blue': 'matern',
  'green': 'None'},
 {10: 2, 20: 10, 30: 20})

In [15]:
# Derive the marker columns: each new column is a source column translated
# through the matching lookup table.
marker_columns = [
    ("symbols", "shape", replace_dict_str),
    ("color", "noise", replace_dict_str),
    ("size", "period", replace_dict_int),
]
for target_col, source_col, mapping in marker_columns:
    df_plot[target_col] = df_plot[source_col].replace(mapping)
df_plot.head()

Unnamed: 0,ts_id,shape,noise,period,x,y,symbols,color,size
0,0,square,,10,-0.298825,0.397975,square,green,20
1,1,square,,10,-0.26009,0.294993,square,green,20
2,2,square,matern,2,0.123725,-0.351231,square,blue,10
3,3,triangle,iid,10,0.753907,-0.183832,triangle-up,red,20
4,4,triangle,iid,20,-0.318734,0.073934,triangle-up,red,30


In [17]:
import plotly.graph_objects as go

# Per-point hover payload: one row of (shape, noise, period) for every marker.
custom_df = np.stack((df_plot['shape'], df_plot['noise'], df_plot['period']), axis=-1)

# Hover text built from the three customdata fields above.
hover_text = (
    "Shape : %{customdata[0]}<br>"
    "Noise: %{customdata[1]}<br>"
    "Period: %{customdata[2]}"
)

# Marker appearance comes from the derived symbols / color / size columns.
marker_style = dict(
    symbol=df_plot["symbols"],
    color=df_plot["color"],
    size=df_plot["size"],
    line=dict(color="midnightblue", width=2),
)

scatter = go.Scatter(
    x=df_plot['x'],
    y=df_plot['y'],
    mode='markers',
    marker=marker_style,
    customdata=custom_df,
    hovertemplate=hover_text,
)

fig = go.Figure(data=scatter)
fig.update_layout(title='Embeddings')
fig.show()