In [None]:
import pandas as pd
import numpy as np
from collections import Counter

#Importing gensim libraries for d2v embedding
from gensim.models.doc2vec import Doc2Vec

#sklearn libraries for PCA and feature standardization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


#sklearn library for t-SNE
from sklearn.manifold import TSNE

#Importing scipy libraries for hierarchical clustering
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

# Importing matplotlib and seaborn for graphics.
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
%matplotlib inline

import seaborn as sns
# Global seaborn styling applied to every figure drawn below:
# 'darkgrid' axes style, the 'muted' colour palette, and the "notebook"
# plotting context with fonts scaled by 1.5 and a line width of 2.5.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})

In [None]:
# functions to create scatter plots

def PCA_scatter(x,y,tag,sp):
    """Scatter-plot two principal components, coloured by ``tag``.

    Parameters
    ----------
    x, y : sequence of float
        Coordinates of every point (PC1 and PC2), addressable by position.
    tag : sequence
        One label per point. Only labels that occur more than 3 times are
        drawn; each gets its own colour and a legend entry.
    sp : sequence of str
        One species description per point. Points whose description contains
        the module-level string ``cov`` are overlaid with a black 'x'.

    Notes
    -----
    Reads the module-level names ``pca`` (a fitted PCA, used for the axis
    labels) and ``cov``; both must exist before the function is called.
    The figure is written to 'plots/cov2vec_pca_ep200_v300.png'.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # One pass over the labels: sorted unique labels, the code of each point's
    # label, and how often each label occurs.
    tag_uniq, tag_codes, tag_counts = np.unique(
        tag, return_inverse=True, return_counts=True)

    # Divide by the number of labels (not number - 1): this avoids a
    # ZeroDivisionError when there is a single label and keeps the first and
    # last labels apart, because the cyclic hsv map ends on almost the same
    # red it starts with.
    n_tags = len(tag_uniq)
    palette_uniq = [plt.cm.hsv(i / float(n_tags)) for i in range(n_tags)]

    f, ax = plt.subplots(1,figsize=(25, 25))
    ax.set_xlabel('PC1 Explained variance: %.2f percent'% (pca.explained_variance_ratio_[0]*100), fontsize = 15)
    ax.set_ylabel('PC2 Explained variance: %.2f percent'% (pca.explained_variance_ratio_[1]*100), fontsize = 15)

    for i, v in enumerate(tag_uniq):
        if tag_counts[i] > 3:
            members = np.flatnonzero(tag_codes == i)
            # `color=` rather than `c=`: an RGBA tuple passed as `c` is read as
            # four scalar values to colormap when the group has exactly 4 points.
            ax.scatter(x[members], y[members], color=palette_uniq[i], label=v)

    # Mark with an 'x' every point whose species description mentions `cov`.
    # Drawn once: repeating the identical overlay stacks the markers and
    # defeats the requested transparency.
    cov_points = np.flatnonzero([cov in sp[j] for j in range(len(sp))])
    ax.scatter(x[cov_points], y[cov_points], marker='x', color='black', alpha=0.2)

    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    # The legend is anchored outside the axes; a tight bounding box keeps it
    # inside the saved image.
    f.savefig('plots/cov2vec_pca_ep200_v300.png', bbox_inches='tight')

    return f, ax

def TSNE_scatter(x,y,tag,p, sp):
    """Scatter-plot a 2-D t-SNE embedding, coloured by ``tag``.

    Parameters
    ----------
    x, y : sequence of float
        Coordinates of every point, addressable by position.
    tag : sequence
        One label per point; every distinct label gets its own colour and a
        legend entry.
    p : object
        Interpolated into the output file name ('plots/cov2vec_tsne{p}.png');
        not used for anything else here.
    sp : sequence of str
        One species description per point. Points whose description contains
        the module-level string ``cov`` are overlaid with a black 'x'.

    Notes
    -----
    Reads the module-level name ``cov``, which must exist before the call.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Sorted unique labels plus, for every point, the code of its label.
    tag_uniq, tag_codes = np.unique(tag, return_inverse=True)

    # Divide by the number of labels (not number - 1): this avoids a
    # ZeroDivisionError when there is a single label and keeps the first and
    # last labels apart, because the cyclic hsv map ends on almost the same
    # red it starts with.
    n_tags = len(tag_uniq)
    palette_uniq = [plt.cm.hsv(i / float(n_tags)) for i in range(n_tags)]

    f, ax = plt.subplots(1,figsize=(35, 25))

    for i, v in enumerate(tag_uniq):
        members = np.flatnonzero(tag_codes == i)
        # `color=` rather than `c=`: an RGBA tuple passed as `c` is read as
        # four scalar values to colormap when the group has exactly 4 points.
        ax.scatter(x[members], y[members], color=palette_uniq[i], label=v)

    # Mark with an 'x' every point whose species description mentions `cov`.
    # Drawn once instead of once per unique species (the overlay is the same
    # set of points every time).
    cov_points = np.flatnonzero([cov in sp[j] for j in range(len(sp))])
    ax.scatter(x[cov_points], y[cov_points], marker='x', color='black')

    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    f.tight_layout()
    # The legend is anchored outside the axes; a tight bounding box keeps it
    # inside the saved image.
    f.savefig(f'plots/cov2vec_tsne{p}.png', bbox_inches='tight')

    return f, ax

In [None]:
# Read the sequence/metadata table and keep only the rows whose sequence
# starts with methionine ('M').
dataset = (
    pd.read_csv('data/orf1ab_df_seq_meta.csv')
    .loc[lambda frame: frame['Sequence'].astype(str).str.startswith('M')]
)
dataset

In [None]:
# Species name used by the scatter-plot helpers to flag matching rows.
cov = "Severe acute respiratory syndrome coronavirus 2"

# Pull the working columns out of `dataset` by position.
# NOTE(review): positions 1, 2, 7 and 10 are presumably the record id,
# sequence, host and species description — confirm against the CSV header.
ids = dataset.iloc[:, 1].values
seqs = dataset.iloc[:, 2].values
host = dataset.iloc[:, 7].values
hosts = list(host)
sp = dataset.iloc[:, 10].values

# For each description keep the text that follows the first "[" and drop its
# final character.
gb_t2 = [description.split("[")[1][:-1] for description in sp]

In [None]:
# Load the trained Doc2Vec model from disk.
model = Doc2Vec.load('models/orf2vec_overlap_v300_ep200_k4_w5.d2v')
# NOTE(review): `x` is re-read from the model at the top of the PCA cell
# below, so this assignment is redundant on a top-to-bottom run.
# NOTE(review): `docvecs.vectors_docs` looks like the gensim 3.x spelling —
# presumably `model.dv.vectors` on gensim >= 4; confirm against the installed
# version.
x= model.docvecs.vectors_docs

## Principal component analysis (PCA)

In [None]:
# Standardise every embedding dimension (zero mean, unit variance) before PCA.
x = StandardScaler().fit_transform(model.docvecs.vectors_docs)

# Seeded like the t-SNE step further down, so that the projection is
# reproducible across runs if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=0)
principalComponents = pca.fit_transform(x)

In [None]:
# Wrap the two projected components in a labelled DataFrame.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# `dataset` was row-filtered after loading, so its index has gaps, while
# `principalDf` is numbered 0..n-1. Concatenating along columns aligns on index
# labels, so the Host column must be renumbered first; otherwise hosts land on
# the wrong rows and extra all-NaN component rows are appended.
finalDf = pd.concat(
    [principalDf, dataset[['Host']].reset_index(drop=True)],
    axis=1,
)
x = finalDf.iloc[:, 0].values
y = finalDf.iloc[:, 1].values

In [None]:
# Draw PC1 vs PC2 coloured by the bracketed tag extracted from each species
# description; rows matching `cov` get an 'x' overlay.
PCA_scatter(x, y, gb_t2, sp)

## Applying t-distributed stochastic neighbor embedding (t-SNE)

In [None]:
X = model.docvecs.vectors_docs

# Use one name for the perplexity everywhere: the TSNE constructor and the
# plotting call previously referred to an undefined `p`, which raises a
# NameError on a fresh kernel.
perplexity = 30
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
X_tsne = tsne.fit_transform(X)

# Embedding as a DataFrame indexed by record id.
df = pd.DataFrame(X_tsne, index=ids, columns=['x', 'y'])
x = X_tsne[:, 0]
y = X_tsne[:, 1]

# The perplexity is passed through so it ends up in the saved file name.
TSNE_scatter(x, y, gb_t2, perplexity, sp)

## Visualizing hierarchical clustering tree 

In [None]:
X = model.docvecs.vectors_docs

# Ward linkage is only defined for Euclidean distances, and `linkage` ignores
# `metric` when it is handed a condensed distance matrix. To keep the cosine
# geometry, scale every vector to unit length: the Euclidean distance between
# unit vectors equals sqrt(2 * cosine distance), so the ordering of pairwise
# distances is unchanged while Ward becomes valid.
norms = np.linalg.norm(X, axis=1, keepdims=True)
X_unit = X / np.where(norms == 0, 1, norms)  # leave all-zero rows untouched
dm = pdist(X_unit, 'euclidean')
method = 'ward'
Z = linkage(dm, method=method, optimal_ordering=False)


plt.figure(figsize=(50, 50))
dendrogram(Z,
            orientation='top',
            labels=hosts,
            distance_sort='descending')
# Save before show(): the figure can be cleared once it has been displayed.
plt.savefig('plots/dendro_ep200_v300.png')
plt.show()