In [None]:
#Importing gensim libraries for d2v embedding
from gensim.models.doc2vec import Doc2Vec

import numpy as np
import pandas as pd

from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

from ete3 import Tree

In [None]:
#Define a function to get Newick format from a Tree
def getNewick(node, newick, parentdist, leaf_names):
    """Recursively serialise a binary cluster tree into a Newick string.

    The string is assembled back-to-front: every call prepends its own
    fragment to the text accumulated so far in ``newick``.

    Parameters
    ----------
    node : object
        Tree node exposing ``is_leaf()``, ``get_left()``, ``get_right()``,
        ``id`` and ``dist`` (in this notebook: the root returned by
        ``scipy.cluster.hierarchy.to_tree``).
    newick : str
        Newick text built so far; pass ``""`` for the root call.
    parentdist : float
        ``dist`` of the parent node (for the root call, the root's own
        ``dist``). The branch length is ``parentdist - node.dist``.
    leaf_names : sequence
        Labels indexed by leaf ``id``.

    Returns
    -------
    str
        Newick fragment for ``node`` followed by ``newick``. For a root call
        on an internal node this is a complete tree ending in ``");"``.
        Branch lengths are written with two decimals, and the right subtree
        is emitted before the left one.
    """
    # Leaf: "<label>:<branch length>" goes in front of the accumulated text.
    if node.is_leaf():
        return "{}:{:.2f}{}".format(
            leaf_names[node.id], parentdist - node.dist, newick
        )

    # Internal node: first write the text that closes this clade. The root
    # call (empty accumulator) terminates the whole tree instead of emitting
    # a branch length.
    if len(newick) == 0:
        tail = ");"
    else:
        tail = "):{:.2f}{}".format(parentdist - node.dist, newick)

    # Children are prepended left first, then right, so the right child ends
    # up first in the final string.
    own_height = node.dist
    with_left = getNewick(node.get_left(), tail, own_height, leaf_names)
    with_both = getNewick(node.get_right(), "," + with_left, own_height, leaf_names)
    return "(" + with_both

In [None]:
# Read the ORF1ab sequence/metadata table from disk.
dataset = pd.read_csv('data/orf1ab_df_seq_meta.csv')

# Keep only the rows whose sequence begins with 'M' (Methionine); the column
# is cast to str first so non-string entries (e.g. NaN) are simply filtered out.
starts_with_met = dataset['Sequence'].astype(str).str.startswith('M')
dataset = dataset.loc[starts_with_met]

# Values of the second column (position 1) of the filtered table.
# presumably the accession identifiers, judging by the variable name - verify
# against the CSV header.
Accession = dataset.iloc[:, 1].values

In [None]:
# Hyper-parameter grid: a saved Doc2Vec model is loaded for every
# (vector size, epochs) combination below.
vector_sizes = [10,50,100,200,300]
epochs = [5,10,20,50,100,200]

# Result grid (rows: vector sizes, columns: epochs), pre-filled with 1.0 and
# overwritten cell by cell with the normalised Robinson-Foulds distance.
df = pd.DataFrame(
    np.ones((len(vector_sizes), len(epochs))),
    index=vector_sizes,
    columns=epochs,
)

# Loading the ClustalOmega tree used as the reference topology.
t2 = Tree("data/tree.out")

# Leaf labels for the embedding-derived trees, indexed by leaf id.
# NOTE(review): the original cell read an undefined name `ids`; `Accession`
# (built in the data-loading cell) is assumed to be the intended labels and to
# be in the same order as the model's document vectors - confirm.
leaf_names = Accession

# Linkage criterion, identical for every model.
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances, while cosine distances are supplied here - confirm this is intended.
method = 'ward'

for vector_size in vector_sizes:
    for epoch in epochs:
        # Load the d2v model for this grid point and take its document vectors.
        # NOTE(review): `docvecs.vectors_docs` looks like the gensim 3.x
        # attribute path - confirm against the installed gensim version.
        model = Doc2Vec.load(f'models/orf2vec_overlap_v{vector_size}_ep{epoch}_k4_w5.d2v')
        X = model.docvecs.vectors_docs

        # Condensed pairwise cosine-distance matrix between document vectors.
        dm = pdist(X, 'cosine')

        # Hierarchical clustering on the precomputed distances. No `metric`
        # is passed: linkage ignores it when given a condensed matrix.
        Z = linkage(dm, method=method, optimal_ordering=False)

        # Convert the linkage matrix to a tree and then to an ete3 Tree via
        # its Newick representation.
        tree = hierarchy.to_tree(Z, False)
        t1 = Tree(getNewick(tree, "", tree.dist, leaf_names))

        # Robinson-Foulds comparison of the two topologies; the first two
        # items of the result are the RF distance and its maximum.
        rf = t2.robinson_foulds(t1)

        # Normalised RF, stored as a float (not a formatted string) so the
        # grid keeps a numeric dtype. A zero maximum yields NaN instead of
        # raising ZeroDivisionError.
        ratio = round(rf[0] / rf[1], 2) if rf[1] else float("nan")

        # Write only the cell belonging to this (vector_size, epoch) pair.
        df.loc[vector_size, epoch] = ratio

In [None]:
# Show the results table as this cell's output.
df

In [None]:
# Save the results table to disk as CSV.
df.to_csv('data/results_k4.csv')