In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

from tqdm import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
import time
from collections import defaultdict
from scipy.spatial.distance import squareform, pdist, cdist
from sklearn.decomposition import PCA

import numpy as np
from numba import njit, jit
from numba import types
from numba.typed import Dict
from scipy.cluster.hierarchy import linkage
from matplotlib.colors import BASE_COLORS
from sklearn.neighbors import NearestNeighbors
from sequence import (
    generate_dna_sequences,
    sequence_distance,
    evaluate_embeddings
)
from linkage_based_embeddings import (
    build_single_linkage_simplex_strength_matrix,
    single_linkage_mds_from_condensed,
    maximal_linkage_mds_from_condensed
)


In [None]:
# Parameter sweep: for every (n_components, num_starts, num_mutations)
# combination, call generate_dna_sequences, embed the sequences with both
# linkage-based MDS variants, score each embedding with evaluate_embeddings,
# and store everything in `all_results` keyed by the parameter triple.
all_results = {}
for n_components in [2, 5]:
    for num_starts in [100, 200]:
        for num_mutations in [10, 20]:
            # perf_counter is a monotonic, high-resolution clock, so the
            # elapsed time below cannot be skewed by system clock changes
            # (which time.time() is subject to).
            start = time.perf_counter()

            base_sequence_ids, sequences = generate_dna_sequences(
                num_mutations=num_mutations,
                num_starts=num_starts)
            # pdist returns the condensed (flattened upper-triangle) form,
            # which is what the *_from_condensed embedders are given.
            condensed_raw_distance_matrix = pdist(sequences, metric=sequence_distance)

            # Key insertion order matters: it is the order the summary is
            # printed in at the end of the iteration.
            results = {
                "base_sequence_ids": base_sequence_ids,
                "n_components": n_components,
                "num_starts": num_starts,
                "num_mutations": num_mutations,
            }

            # NOTE: the "*_embeddings" keys hold the embeddings themselves,
            # while the "*_from_condensed" keys hold the evaluate_embeddings
            # output for those embeddings (not the embeddings).
            results["maximal_linkage_mds_embeddings"] = maximal_linkage_mds_from_condensed(
                condensed_raw_distance_matrix, n_components=n_components)
            results["maximal_linkage_mds_from_condensed"] = evaluate_embeddings(
                embedding_sequence=results["maximal_linkage_mds_embeddings"],
                num_mutations=num_mutations)

            results["single_linkage_mds_embeddings"] = single_linkage_mds_from_condensed(
                condensed_raw_distance_matrix, n_components=n_components)
            results["single_linkage_mds_from_condensed"] = evaluate_embeddings(
                embedding_sequence=results["single_linkage_mds_embeddings"],
                num_mutations=num_mutations)

            all_results[(n_components, num_starts, num_mutations)] = results

            # Per-iteration summary: skip the bulky entries (embeddings and
            # the per-sequence id list) and print only parameters and scores.
            print("============")
            for key, value in results.items():
                if "embeddings" not in key and "base_sequence_ids" not in key:
                    print(key, value)
            # Elapsed seconds for this parameter combination.
            print(time.perf_counter() - start)


In [None]:

def plot_reduced_embeddings(base_sequence_ids, reduced_embeddings):
    """Scatter the first two embedding columns on the current pyplot axes.

    Each distinct value in ``base_sequence_ids`` is assigned a random RGB
    colour (drawn from ``np.random``, so colours differ between calls unless
    the global NumPy seed is fixed), and every point is coloured by its id.
    The axis limits are then widened by 50% of the absolute value of each
    axis' minimum / maximum coordinate.

    Parameters
    ----------
    base_sequence_ids : iterable of hashable
        One id per row of ``reduced_embeddings``; used only to pick colours.
    reduced_embeddings : array-like
        Indexed as ``[:, 0]`` and ``[:, 1]``, so it must be 2-D with at
        least two columns; only those two columns are plotted.
    """
    # Build the id -> colour table, one random RGB triple per distinct id.
    color_by_id = {}
    for entity_id in set(base_sequence_ids):
        color_by_id[entity_id] = np.random.random(3)
    point_colors = [color_by_id[entity_id] for entity_id in base_sequence_ids]

    plt.scatter(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        c=point_colors,
    )

    # Per-column extremes; the padding is proportional to the magnitude of
    # the extreme coordinate itself, not to the data range.
    lower = np.min(reduced_embeddings, axis=0)
    upper = np.max(reduced_embeddings, axis=0)
    x_limits = (lower[0] - abs(lower[0] * 0.5), upper[0] + abs(upper[0] * 0.5))
    y_limits = (lower[1] - abs(lower[1] * 0.5), upper[1] + abs(upper[1] * 0.5))
    plt.xlim(x_limits)
    plt.ylim(y_limits)

    
    
# Side-by-side comparison of the two 2-D embeddings produced for the
# (n_components=2, num_starts=100, num_mutations=10) run of the sweep.
result = all_results[(2, 100, 10)]

plt.figure(figsize=(15, 8))

# (panel title, key of the embedding inside `result`) for each subplot.
# Both panels share the same layout and axis labels, so they are drawn by
# one loop rather than two copy-pasted stanzas.
panels = [
    ("Metric Multidimensional Scaling", "maximal_linkage_mds_embeddings"),
    ("Single Linkage Scaling", "single_linkage_mds_embeddings"),
]
for panel_index, (panel_title, embeddings_key) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.title(panel_title, fontsize=16)
    plot_reduced_embeddings(result["base_sequence_ids"], result[embeddings_key])
    plt.xlabel("X Component of 2-Dimensional Embedding", fontsize=16)
    plt.ylabel("Y Component of 2-Dimensional Embedding", fontsize=16)

