In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from tqdm import tqdm
import annoy
import random
import numpy as np
import pandas as pd
import seaborn as sns
import time
from collections import defaultdict
from sklearn.manifold import spectral_embedding, MDS
from scipy.spatial.distance import squareform, pdist, cdist
from sklearn.decomposition import PCA

import numpy as np
from numba import njit, jit
from numba import types
from numba.typed import Dict
from scipy.cluster.hierarchy import linkage
from matplotlib.colors import BASE_COLORS
from helpers import mds, write_embedding_to_text_file, write_embedding_to_two_text_files, is_numeric
from sklearn.neighbors import NearestNeighbors
from sequence import *
from linkage_based_embeddings import *

def plot_reduced_embeddings(entity_ids, reduced_embeddings):
    """Scatter-plot the first two embedding columns, colored per entity.

    Draws on the current pyplot axes. Every distinct value in ``entity_ids``
    receives its own randomly drawn RGB color, and the axis limits are widened
    past the data's extremes.

    Args:
        entity_ids: One label per row of ``reduced_embeddings``; labels are
            used as dictionary keys for the color lookup.
        reduced_embeddings: 2-D array; column 0 is plotted on the x axis and
            column 1 on the y axis.
    """
    # One random RGB triple per distinct entity label.
    palette = {}
    for label in set(entity_ids):
        palette[label] = np.random.random(3)
    point_colors = [palette[label] for label in entity_ids]

    xs = reduced_embeddings[:, 0]
    ys = reduced_embeddings[:, 1]
    plt.scatter(x=xs, y=ys, c=point_colors)

    # Padding is half the magnitude of each extreme (not of the data range),
    # so an extreme that sits at 0 receives no padding on that side.
    lower = np.min(reduced_embeddings, axis=0)
    upper = np.max(reduced_embeddings, axis=0)
    x_limits = (lower[0] - abs(lower[0] * 0.5), upper[0] + abs(upper[0] * 0.5))
    y_limits = (lower[1] - abs(lower[1] * 0.5), upper[1] + abs(upper[1] * 0.5))
    plt.xlim(x_limits)
    plt.ylim(y_limits)


In [None]:
# DNA Simulation Dataset
def generate_condensed_raw_distance_matrix(num_mutations, num_modifications, num_starts, sequence_length):
    """Simulate mutated DNA sequences and return their pairwise distances.

    A single random A/C/G/T base sequence is drawn. ``num_starts`` start
    sequences are derived from it by applying ``mutate_sequence``
    ``num_modifications`` times each, and ``generate_mutation_chain`` is then
    called once per start sequence.

    Args:
        num_mutations: Forwarded to ``generate_mutation_chain``; also the
            number of entity ids recorded per start sequence.
        num_modifications: Number of successive ``mutate_sequence`` calls used
            to derive each start sequence from the shared base.
        num_starts: Number of start sequences (distinct entities).
        sequence_length: Length of the random base sequence.

    Returns:
        Tuple ``(entity_ids, condensed_distances)``: ``entity_ids`` is a list
        holding the index of the start sequence for each generated sequence,
        and ``condensed_distances`` is the condensed pairwise distance vector
        produced by ``scipy.spatial.distance.pdist`` with
        ``sequence_distance`` as the metric.
    """
    base = np.random.choice(["A", "C", "G", "T"], sequence_length)

    start_sequences = []
    for _ in tqdm(range(num_starts)):
        modified_sequence = list(base)
        # `_` here: the original reused `i` and shadowed the outer loop index.
        for _ in range(num_modifications):
            modified_sequence = mutate_sequence(modified_sequence)
        # Add a leading axis so the rows can be stacked into one 2-D array.
        start_sequences.append(np.array(modified_sequence)[None, ...])
    base_sequences = np.vstack(start_sequences)

    entity_ids = []
    raw_embeddings = []
    # enumerate(tqdm(...)) rather than tqdm(enumerate(...)): enumerate hides
    # the length, which would leave the progress bar without a total.
    for start_index, start_sequence in enumerate(tqdm(base_sequences)):
        # assumes generate_mutation_chain returns num_mutations sequences per
        # call, so ids stay aligned with raw_embeddings - TODO confirm
        entity_ids += [start_index] * num_mutations
        raw_embeddings += generate_mutation_chain(
            base_sequence=start_sequence, num_mutations=num_mutations)

    return entity_ids, pdist(raw_embeddings, metric=sequence_distance)

# Number of successive mutate_sequence applications used to derive each start
# sequence from the shared random base (passed as `num_modifications`).
num_modifications = 100
# Length of the random A/C/G/T base sequence (passed as `sequence_length`).
sequence_length = 1000


In [None]:
# Sweep over embedding dimensionality and dataset size. Every setting gets a
# freshly generated dataset, which is embedded with both linkage-based MDS
# variants; each embedding is then scored with evaluate_embeddings.
all_results = []

# (key for the embeddings, key for their evaluation, embedding function)
linkage_variants = (
    ("maximal_linkage_mds_embeddings", "maximal_linkage_mds_from_condensed",
     maximal_linkage_mds_from_condensed),
    ("single_linkage_mds_embeddings", "single_linkage_mds_from_condensed",
     single_linkage_mds_from_condensed),
)

for drop_first in [False]:  # single-value sweep; not read inside the body
    for n_components in [2, 5]:
        for num_starts in [100, 200]:
            for num_mutations in [10, 20]:
                start = time.time()
                entity_ids, condensed_raw_distance_matrix = generate_condensed_raw_distance_matrix(
                    num_mutations=num_mutations,
                    num_modifications=num_modifications,
                    num_starts=num_starts,
                    sequence_length=sequence_length,
                )

                results = {
                    "entity_ids": entity_ids,
                    "n_components": n_components,
                    "num_starts": num_starts,
                    "num_mutations": num_mutations,
                }
                for embeddings_key, evaluation_key, embed in linkage_variants:
                    results[embeddings_key] = embed(
                        condensed_raw_distance_matrix, n_components=n_components)
                    results[evaluation_key] = evaluate_embeddings(
                        embedding_sequence=results[embeddings_key], num_mutations=num_mutations)

                print("============")
                # Print only the scalar settings and evaluation entries; the
                # bulky embeddings and the id list are skipped.
                for key, value in results.items():
                    if "embeddings" in key or "entity_ids" in key:
                        continue
                    print(key, value)
                all_results.append(results)
                # Wall-clock seconds spent on this setting.
                print(time.time() - start)


In [None]:
# Side-by-side scatter plots of both embeddings for one sweep setting.
# With the sweep order used to fill all_results, index 2 is the run with
# n_components=2, num_starts=200, num_mutations=10.
result = all_results[2]

plt.figure(figsize=(15, 8))
panels = (
    ("Metric Multidimensional Scaling with Maximal Linkage", 'maximal_linkage_mds_embeddings'),
    ("Metric Multidimensional Scaling with Single Linkage", 'single_linkage_mds_embeddings'),
)
for panel_index, (panel_title, embeddings_key) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.title(panel_title)
    plot_reduced_embeddings(result["entity_ids"], result[embeddings_key])
    plt.xlabel("X Component of 2-Dimensional Embedding")
    plt.ylabel("Y Component of 2-Dimensional Embedding")

# Render explicitly so the cell does not end on a bare expression whose
# Text(...) repr would be echoed into the notebook output.
plt.show()

