In [None]:
import os
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

In [None]:
# Use seaborn's white-grid style for all figures drawn below.
sns.set_style(style='whitegrid')

In [None]:
# Root directory that holds one sub-folder per evaluated experiment.
# The location can be overridden with the MAPYOURCITY_EMBEDDINGS_ROOT
# environment variable so the notebook is not tied to one user's home
# directory; without the variable the original location is used.
dev_embeddings_root = os.environ.get(
    'MAPYOURCITY_EMBEDDINGS_ROOT',
    '/home/k/k202141/rootgit/AI4EO-MapYourCity/logs/evaluations/runs/',
)
experiment = 'multimodal_swin_05-06_A'

# List the entries stored for this experiment, sorted by name.
np.sort(os.listdir(os.path.join(dev_embeddings_root, experiment)))

In [None]:
# Sub-folder of the experiment whose predictions and embeddings are analysed.
fold = '2024-05-06_16-01-08'
dev_embeddings_path = os.path.join(
    os.path.join(dev_embeddings_root, experiment),
    fold,
)

# Peek at the first five entries of that folder.
os.listdir(dev_embeddings_path)[:5]

In [None]:
# Location of the training data of the building-age dataset.
data_root = '../data/AI4EO-MapYourCity/v1/building-age-dataset/train/data/'

# Sample tables of the dev split and of the test set.
dev_df = pd.read_csv(
    filepath_or_buffer='../data/AI4EO-MapYourCity/splits/dev-set.csv'
)
test_df = pd.read_csv(
    filepath_or_buffer='../data/AI4EO-MapYourCity/v1/building-age-dataset/test/test-set.csv'
)

In [None]:
# Join the stored dev predictions of this run with the dev sample table
# (inner join on the sample id `pid`).
this_dev_df = pd.read_csv(
    os.path.join(dev_embeddings_path, 'dev_predictions.csv')
).merge(dev_df, on='pid')
this_dev_df.head()

In [None]:
# Join the stored test predictions of this run with the test sample table
# (inner join on the sample id `pid`).
this_test_df = pd.read_csv(
    os.path.join(dev_embeddings_path, 'test_predictions.csv')
).merge(test_df, on='pid')
this_test_df.head()

## TSNE


### Read dev embeddings

In [None]:
%%time
dev_embeddings = {'topview':[], 'streetview':[], 'sentinel2':[]}

for i, pid in enumerate(dev_df.pid.values):
    for key in dev_embeddings:
        dev_embeddings[key].append(np.load(os.path.join(dev_embeddings_path, f'{pid}_{key}.npy')))

for key in dev_embeddings:
    dev_embeddings[key] = np.asarray(dev_embeddings[key]).squeeze()

In [None]:
%%time
test_embeddings = {'topview':[], 'streetview':[], 'sentinel2':[]}
pids = {'topview':[], 'streetview':[], 'sentinel2':[]}
is_valid_streetview = []

for i, pid in enumerate(test_df.pid.values):
    for key in test_embeddings:
        pf = os.path.join(dev_embeddings_path, f'{pid}_{key}.npy')
        if not os.path.exists(pf):
            continue
        test_embeddings[key].append(np.load(pf))
        if key == 'streetview':
            is_valid_streetview.append(i)
        pids[key].append(pid)

for key in test_embeddings:
    test_embeddings[key] = np.asarray(test_embeddings[key]).squeeze()

### Create TSNE

In [None]:
# Number of dev samples; the first `ndev` rows of every concatenated
# dev+test array built below belong to the dev set.
ndev = dev_embeddings['topview'].shape[0]

In [None]:
# Stack dev and test embeddings per modality (dev rows first) and create one
# t-SNE model per modality.
# A fixed random_state makes the projection, and therefore the class centers,
# K-means clusters and plots derived from it, reproducible across kernel
# restarts; without it every run yields a different layout.
embeddings = {}
tsne = {}

for key in dev_embeddings:
    embeddings[key] = np.concatenate([dev_embeddings[key], test_embeddings[key]])
    tsne[key] = TSNE(random_state=0)
  

In [None]:
%%time  
tsne_comp = {}
for key in tsne:
    tsne_comp[key] = tsne[key].fit_transform(embeddings[key])
    print(key, tsne_comp[key].shape)

In [None]:
labels = {}
centers = {}

for key, comp in tsne_comp.items():
    print(key)

    # Every row starts as class 7 (placeholder for the test set); the first
    # `ndev` rows are then overwritten with the dev labels.
    modality_labels = np.full(len(comp), 7.0)
    modality_labels[:ndev] = dev_df.label.values
    labels[key] = modality_labels

    # Mean t-SNE position of each class 0..6, one row per class.
    class_centers = np.zeros([7, 2])
    for cls in range(7):
        members = np.flatnonzero(modality_labels == cls)
        for axis in range(2):
            class_centers[cls, axis] = np.mean(comp[members, axis])
    centers[key] = class_centers

The following plots show the density of the TSNE test samples (gray) and the labeled dev set samples (colored).

In [None]:
# One plotting frame per modality: the two t-SNE components (c1, c2) and the
# integer class label (l0).
dfs = {
    key: pd.DataFrame({
        'c1': comp[:, 0],
        'c2': comp[:, 1],
        'l0': labels[key].astype(int),
    })
    for key, comp in tsne_comp.items()
}

In [None]:
# Topview: density of the test samples (gray) drawn below the labeled dev
# samples (colored points).
ax = plt.gca()
sns.kdeplot(data=dfs['topview'].iloc[ndev:], x='c1', y='c2', hue='l0',
            palette='gray', alpha=0.6, zorder=150, ax=ax)
sns.scatterplot(data=dfs['topview'].iloc[:ndev], x='c1', y='c2', hue='l0',
                palette='muted', alpha=1.0, zorder=180, ax=ax)
ax.set_aspect('equal')

In [None]:
# Streetview: density of the test samples (gray) drawn below the labeled dev
# samples (colored points).
ax = plt.gca()
sns.kdeplot(data=dfs['streetview'].iloc[ndev:], x='c1', y='c2', hue='l0',
            palette='gray', alpha=0.6, zorder=150, ax=ax)
sns.scatterplot(data=dfs['streetview'].iloc[:ndev], x='c1', y='c2', hue='l0',
                palette='muted', alpha=1.0, zorder=180, ax=ax)
ax.set_aspect('equal')

In [None]:
# Sentinel-2: density of the test samples (gray) drawn below the labeled dev
# samples (colored points).
ax = plt.gca()
sns.kdeplot(data=dfs['sentinel2'].iloc[ndev:], x='c1', y='c2', hue='l0',
            palette='gray', alpha=0.6, zorder=150, ax=ax)
sns.scatterplot(data=dfs['sentinel2'].iloc[:ndev], x='c1', y='c2', hue='l0',
                palette='muted', alpha=1.0, zorder=180, ax=ax)
ax.set_aspect('equal')

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

In [None]:
# K-means per modality, seeded with the per-class mean positions so that
# cluster i starts at the center of class i.
kmeans = {}
kpreds = {}
for key, center in centers.items():
    dev_comp = tsne_comp[key][:ndev]
    test_comp = tsne_comp[key][ndev:]

    # With an explicit array of initial centers only one initialisation is
    # meaningful; n_init=1 states this and avoids the sklearn warning about
    # repeated runs being ignored.
    kmeans[key] = KMeans(n_clusters=len(center), init=center, n_init=1)
    kmeans[key].fit(dev_comp)

    # Cluster assignments made by K-means for the dev and the test samples.
    kpreds[key] = {'dev': kmeans[key].predict(dev_comp),
                   'test': kmeans[key].predict(test_comp)}

In [None]:
# Mean per-class recall of the K-means clusters on the dev set.
# confusion_matrix expects (y_true, y_pred); with normalize='true' each row
# (true class) sums to one, so the diagonal holds the per-class recall.
# The true labels therefore have to be passed first.
for key in ('streetview', 'topview', 'sentinel2'):
    cm = confusion_matrix(dev_df.label.values, kpreds[key]['dev'], normalize='true')
    print(f'{key:<10}', cm.diagonal().mean())

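Dev-set t-SNE components coloured by true label (left column) and by K-means cluster (right column), one row per modality.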

In [None]:
# One row per modality: dev samples coloured by true label (left) and by
# K-means cluster (right).
fig, ax = plt.subplots(3, 2, sharex=True, sharey=True, figsize=(10, 15))

for (left, right), (key, comp) in zip(ax, tsne_comp.items()):
    dev_x = comp[:ndev, 0]
    dev_y = comp[:ndev, 1]

    left.scatter(dev_x, dev_y, c=labels[key][:ndev], cmap='rainbow')
    left.set_title(f'True labels {key}')

    right.scatter(dev_x, dev_y, c=kpreds[key]['dev'], cmap='rainbow')
    right.set_title(f'Clusters {key}')

In [None]:
# Locate one test sample inside the streetview t-SNE map and compare the
# model's predicted class with the K-means cluster it falls into.
fig, ax = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10, 4))

key = 'streetview'
query_pid = 'bd8jhrdavd'
comp = tsne_comp[key]
print(len(comp))

# Position of the sample among the loaded streetview test embeddings
# (raises ValueError if it has no streetview embedding); test rows follow the
# `ndev` dev rows in the concatenated t-SNE array.
k = pids[key].index(query_pid)
kk = k + ndev

# Fetch the prediction row by pid rather than by position: the row order of
# the merged prediction table is not guaranteed to match test_df.
row = this_test_df.loc[this_test_df.pid == query_pid].iloc[0]
print(row)
plabel = row.predicted_label

ax[0].scatter(comp[:ndev, 0], comp[:ndev, 1], c=labels[key][:ndev], alpha=0.2, cmap='rainbow')
img = ax[1].scatter(comp[:ndev, 0], comp[:ndev, 1], c=kpreds[key]['dev'], alpha=0.2, cmap='rainbow')
plt.colorbar(img, ax=ax)

# Mark the selected test sample in both panels.
for panel in ax:
    panel.plot(comp[kk, 0], comp[kk, 1], marker='x', color='k')

print(f'Predicted class: {plabel} / Clustered class: {kpreds[key]["test"][k]}')

ax[0].set_title(f'True labels {key}')
ax[1].set_title(f'Clusters {key}')

## Cosine Similarity

In [None]:
import torch
from torch.nn import functional as F

In [None]:
# Leave-one-out nearest-neighbour vote on the dev set (topview embeddings):
# each dev sample gets the most frequent label among its `nn` most
# cosine-similar *other* dev samples.
nn = 13  # number of neighbours taking part in the vote

dev_topview = dev_embeddings['topview']
dev_labels = labels['topview'][:ndev]

new_classes = []

for k in range(len(dev_df.pid.values)):
    # Boolean mask that removes the query sample itself from the candidates.
    ix = np.ones(len(dev_topview)).astype(bool)
    ix[k] = False

    tensor_embed1 = torch.Tensor(dev_topview[ix])
    tensor_embed2 = torch.Tensor(dev_topview[k]).unsqueeze(0)
    cos_sim = F.cosine_similarity(tensor_embed1, tensor_embed2).numpy()
    sort_ix = np.argsort(cos_sim)[::-1]

    # `sort_ix` indexes the masked candidate array, so the labels must be
    # masked the same way; indexing the unmasked labels would shift every
    # neighbour at position >= k by one sample.
    neighbour_labels = dev_labels[ix][sort_ix][:nn]

    classes, counts = np.unique(neighbour_labels, return_counts=True)
    sort_ix2 = np.argsort(counts)[::-1]
    new_classes.append(classes[sort_ix2][0])

In [None]:
# Pair every dev pid with the class chosen by the nearest-neighbour vote.
df = pd.DataFrame({'pid': dev_df.pid.values, 'new_classes': new_classes})

In [None]:
# Add the voted classes to the dev prediction table (inner join on `pid`).
new_df = this_dev_df.merge(df, on='pid')

In [None]:
# Mean per-class recall of the nearest-neighbour vote (m1) and of the model
# predictions (m2) on the dev set.
# confusion_matrix expects (y_true, y_pred); normalize='true' normalises each
# true-class row, so the true labels have to be passed first.
m1 = confusion_matrix(new_df['label'], new_df['new_classes'], normalize='true').diagonal().mean()
m2 = confusion_matrix(new_df['label'], new_df['predicted_label'], normalize='true').diagonal().mean()

In [None]:
# m1: score computed from `new_classes`; m2: score computed from `predicted_label`.
print(m1, m2)

In [None]:
# Nearest-neighbour vote for the test set (topview embeddings): each test
# sample gets the most frequent label among its `nn` most cosine-similar dev
# samples.
tensor_embed1 = torch.Tensor(dev_embeddings['topview'])
nn = 13  # number of neighbours taking part in the vote

# Dev labels, aligned row by row with `tensor_embed1` (hoisted out of the loop).
dev_labels = labels['topview'][:ndev]

new_classes = []

for k in range(len(pids['topview'])):
    tensor_embed2 = torch.Tensor(test_embeddings['topview'][k]).unsqueeze(0)
    # `cos_sim` of the last test sample stays available after the loop for
    # later inspection.
    cos_sim = F.cosine_similarity(tensor_embed1, tensor_embed2).numpy()
    sort_ix = np.argsort(cos_sim)[::-1]

    classes, counts = np.unique(dev_labels[sort_ix][:nn], return_counts=True)
    sort_ix2 = np.argsort(counts)[::-1]
    new_classes.append(classes[sort_ix2][0])

In [None]:
# Pair every test pid that has a topview embedding with its voted class.
df = pd.DataFrame({'pid': pids['topview'], 'new_classes': new_classes})

In [None]:
# Add the voted classes to the test prediction table (inner join on `pid`).
new_df = this_test_df.merge(df, on='pid')

In [None]:
# Number of test samples on which the model prediction and the
# nearest-neighbour vote agree.
new_df['predicted_label'].eq(new_df['new_classes']).sum()

In [None]:
# Distribution of the cosine similarities held in `cos_sim`, i.e. the values
# computed in the most recently executed similarity loop iteration.
plt.hist(x=cos_sim, bins=50);