In [None]:
import sys
from scipy import fft
import numpy as np
from multiprocessing import Pool
from scipy.stats import pearsonr
# import Bio
from Bio.Phylo.TreeConstruction import *
from functools import partial
import pywt
# import os
from statistics import median, mean
from one_dimensional_num_mapping import *
# plotting
from mpl_toolkits import mplot3d
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn import preprocessing as pe
import h5py
import os
import shutil
import pathlib
import io 

In [None]:
# MLDSP main script imports

In [None]:
#Classification.py
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict

## Creates and trains various classifiers (linear-discriminant, linear svm, quadratic svm, fine knn) on input sequences and labels
## Parameters:
# - dismat (array): distance matrix between sequences; rows are selected with the fold index arrays, so it must support integer-array indexing
# - alabels (list): labels corresponding to individual sequences (same order as the rows of dismat)
# - folds (int): number of folds to use when splitting dataset
# - total (int): total number of sequences (currently unused; only referenced by the commented-out extra classifiers)
# - saveModels (bool): whether or not to save models to file (currently unused)
## Return:
# - avgAccuracy (float): mean accuracy across all models
# - meanModelAccuracies (dictionary): mean accuracy per model
# - aggregatedCMatrix (dictionary): aggregated confusion matrix per model
# - misclassifiedIdx (dictionary): misclassified indices per model
# @profile(sort_by='cumulative', lines_to_print=10, strip_dirs=True)
def classify_dismat(dismat, alabels, folds, total, saveModels=False):

    # NOTE(review): the split is neither shuffled nor stratified; if the rows of dismat are
    # grouped by class, some folds may train without ever seeing a class - confirm this is intended.
    kf = KFold(n_splits=folds, shuffle=False, random_state=None)
    model_names = {'LinearDiscriminant':LinearDiscriminantAnalysis() #matlab doesn't specify what solver it uses, orginal code used (shrinkage) gamma=0 
                   ,'LinearSVM':SVC(kernel='linear',cache_size=1000,decision_function_shape='ovo'), 'QuadSVM':SVC(kernel='poly', degree=2,cache_size=1000,decision_function_shape='ovo'),
                   'KNN':KNeighborsClassifier(n_neighbors=1, leaf_size=50, metric='euclidean', weights='uniform', algorithm='brute')
                   }
    # use 2 additional classifiers if <= 2000 sequences
    # if total <= 2000:
    #     model_names['SubspaceDiscriminant'] = SVC(kernel='rbf')
    #     model_names['SubspaceKNN'] = None

    accuracies = defaultdict(list) # dictionary with key: modelname, value: list containing accuracies
    confMatrixDict = defaultdict(list) # dictionary with key: modelname, value: list containing confusion matrices (raw arrays)
    misclassifiedIdx = defaultdict(list) # dictionary with key: modelname, value: list containing indices/sequences of dismat that have been misclassifed

    # The set of class labels is the same for every fold and model, so compute it once;
    # passing it explicitly keeps every confusion matrix the same shape so they can be summed.
    class_labels = list(np.unique(alabels))

    # Loop through each model
    for modelName, model in model_names.items():
        print(model)
        # Create pipeline model (the SVMs and KNN get standardised inputs, LDA does not)
        if modelName in ['LinearSVM', 'QuadSVM', 'KNN']:
            pipeModel = make_pipeline(StandardScaler(), model)
        else:
            pipeModel = make_pipeline(model)

        # fold_number is kept separate from every inner loop variable so the progress
        # print below reports the fold, not the last prediction index
        for fold_number, (train_index, test_index) in enumerate(kf.split(dismat, alabels), start=1):
            X_train = dismat[train_index]
            X_test = dismat[test_index]
            y_train = [alabels[idx] for idx in train_index]
            y_test = [alabels[idx] for idx in test_index]

            # Fit the pipeline model
            pipeModel.fit(X_train, y_train)
            prediction = pipeModel.predict(X_test)
            # Compute and store accuracy of model
            fold_accuracy = accuracy_score(y_test, prediction)
            accuracies[modelName].append(fold_accuracy)
            print(fold_accuracy)
            # Generate and store confusion matrix
            fold_cmatrix = confusion_matrix(y_test, prediction, labels=class_labels, normalize=None)
            confMatrixDict[modelName].append(fold_cmatrix)

            # Store indices (of dismat) of misclassified sequences
            for position, (predicted, actual) in enumerate(zip(prediction, y_test)):
                # if prediction incorrect, add to list of misclassified indices for the model
                if predicted != actual:
                    misclassifiedIdx[modelName].append(test_index[position])
            print(fold_number)

    # For each model, calculate mean of accuracies across all folds & sum all confusion matrices across all folds
    meanModelAccuracies = {} # key: modelName, value: mean accuracy value for model
    aggregatedCMatrix = {} # key: modelName, value: summed Confusion Matrix for model
    for modelName in accuracies:
        meanModelAccuracies[modelName] = np.mean(accuracies.get(modelName))
        aggregatedCMatrix[modelName] = np.sum(confMatrixDict.get(modelName), axis=0)

    # Mean accuracy value across all classifiers
    avgAccuracy = sum(meanModelAccuracies.values()) / len(meanModelAccuracies)

    return avgAccuracy, meanModelAccuracies, aggregatedCMatrix, dict(misclassifiedIdx)

# Builds a ConfusionMatrixDisplay from a raw confusion-matrix array, draws it and returns it
def displayConfusionMatrix(confMatrix, alabels):
    # tick labels are the distinct class labels in sorted order (np.unique sorts)
    class_names = list(np.unique(alabels))
    display_obj = ConfusionMatrixDisplay(
        confusion_matrix=confMatrix,
        display_labels=class_names,
    )
    # draw the matrix; the raw array stays available as display_obj.confusion_matrix
    display_obj.plot(cmap='Blues', colorbar=False)
    return display_obj

In [None]:
#preprocessing.py
import os
import random
import pandas as pd
from Bio import SeqIO
from collections import Counter

def preprocessing(data_set, max_clust_size, metadata):
    """Load every FASTA file in a directory plus a metadata table of cluster labels.

    Each file in ``data_set`` is parsed with BioPython (multi-sequence FASTA
    files are supported) and all records are merged into one in-memory
    dictionary. Sub-directories and hidden files (names starting with ``.``,
    e.g. macOS ``.DS_Store``) are skipped because they are not FASTA files.

    Parameters:
    data_set: path of the directory holding the FASTA files.
    max_clust_size: currently unused; kept so existing callers keep working.
    metadata: path of a header-less delimited text file whose first column is
        the sequence id and whose second column is the cluster label (the
        delimiter is sniffed). NOTE(review): more than two columns would make
        ``cluster_dict`` a nested dictionary - confirm the expected layout.

    Returns:
    seq_dict: dictionary mapping record id -> SeqRecord, in sorted-file order.
        A record id repeated in a later file overwrites the earlier record;
        a duplicate inside one file makes ``SeqIO.to_dict`` raise ValueError.
    total_seq: number of sequences in ``seq_dict``.
    cluster_dict: dictionary mapping sequence id -> cluster label (from metadata).
    cluster_stats: Counter mapping cluster label -> number of metadata rows.

    Ids present in the FASTA files but missing from the metadata are printed
    (not removed), so the caller can detect them.
    """
    # Dictionary to store SeqIO records
    seq_dict = {}

    # Iterate through all fasta files
    for f in sorted(os.listdir(data_set)):
        file = os.path.join(data_set, f)
        # only regular, non-hidden files can be FASTA inputs
        if f.startswith('.') or not os.path.isfile(file):
            continue
        with open(file) as handle:
            # SeqIO.index_db() is read only & can't multiprocess, SeqIO.index doesnt take file handle for multi-file, SeqIO_to_dict is all in memory, SeqIO.parse you only get to iterate over data once
            seq_dict.update(SeqIO.to_dict(SeqIO.parse(handle, "fasta")))

    # dictionary with Accession ID as keys and cluster name as values.
    # sep=None needs the python parsing engine; naming it explicitly avoids the
    # ParserWarning pandas emits when it has to fall back from the C engine.
    cluster_dict = pd.read_csv(metadata, header=None, index_col=0, sep=None, engine='python').squeeze("columns").to_dict()
    cluster_stats = Counter(cluster_dict.values())

    # sequences that have no label in the metadata
    bad_keys = [key for key in seq_dict if key not in cluster_dict]

    print(bad_keys)
    #Might affect order of labels & lead to mislabelling
    # for item in bad_keys:
    #     seq_dict.pop(item)

    total_seq = len(seq_dict)
    return seq_dict, total_seq, cluster_dict, cluster_stats


def old_preprocessing(data_set, max_clust_size):
    """Legacy loader for datasets laid out as ``data_set/<cluster>/<fasta files>``.

    Every sub-directory name of ``data_set`` is used as a cluster label and
    every file inside it is indexed as FASTA.

    Parameters:
    data_set: path of the top-level dataset directory.
    max_clust_size: currently unused (the sub-sampling code is commented out).

    Returns:
    seq_dict: dictionary mapping record id -> SeqRecord for all clusters.
    total_seq: number of sequences in ``seq_dict``.
    cluster_dict: dictionary mapping record id -> cluster (directory) name.
    cluster_stats: dictionary mapping cluster name -> number of FILES in that
        cluster directory (not the number of sequences).
    """
    # Dictionary to store SeqIO
    seq_dict = {}
    cluster_names = sorted(os.listdir(data_set))
    # dictionary with Accession ID as keys and cluster name as values
    cluster_dict = {}
    # number of files in each cluster
    cluster_stats = {}
    # Iterate over each cluster (top level directories)
    for cluster in cluster_names:
        # get path for cluster as str
        cluster_path = os.path.join(data_set, cluster)
        # get names of files in cluster as list of str
        file_name = sorted(os.listdir(cluster_path))
        cluster_stats.update({cluster: len(file_name)})
        # Iterate over each file in the cluster
        for file in file_name:
            # get path for each file in cluster as str
            file_path = os.path.join(cluster_path, file)
            # SeqIO.index doesnt take file handle to index multiple seqs to a
            # single dict, so each file is indexed on its own and merged.
            seqs = SeqIO.index(file_path, "fasta"
            #,key_function=get_accession
            )
            try:
                # dict.update() reads every record through the index, so the
                # records end up in memory and the index can be closed afterwards
                seq_dict.update(seqs)
                # Generate second dictionary for cluster info
                for accession_id in seqs.keys():
                    cluster_dict.update({accession_id: cluster})
            finally:
                # release the file handle held by the index
                seqs.close()
        # if len(temp_dict) >= max_clust_size:
        #     subset = dict(random.sample(temp_dict.items(), max_clust_size))
        #     print(len(subset))
        #     seq_dict.update(subset)
        # else:
        #     seq_dict.update(temp_dict)
        # for accession_id in seq_dict.keys():
        #     cluster_dict.update({accession_id: cluster}) 
    total_seq = len(seq_dict)
    # (the former `del temp_dict` / `del seqs` raised UnboundLocalError for an
    # empty dataset and are not needed: locals are released on return)
    return seq_dict, total_seq, cluster_dict, cluster_stats

In [None]:
#visualisation.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE
from skbio.stats.ordination import pcoa

def dimReduction(data, n_dim, method):
    """
    Reduce an nxm 2d-array to n_dim dimensions with the chosen technique.

    :param np.array data: input data to be transformed (for 'mds' it is passed
        to skbio's pcoa, i.e. it is treated as a distance matrix)
    :param int n_dim: dimensions to reduce to
    :param str method: which method to use (either 'pca', 'mds', or 'tsne')
    :return: nxn_dim transformed data. 'pca' and 'tsne' return the output of
        fit_transform; 'mds' returns the ``samples`` attribute of the pcoa
        result (presumably a pandas DataFrame rather than np.array - TODO confirm)
    :raises ValueError: if method is not one of the supported names (previously
        the function silently returned None in that case)
    """
    if method == 'pca':
        pca = PCA(n_components=n_dim)
        return pca.fit_transform(data)
    #Not working should be same mds algorithm as matlab
    if method == 'mds':
        mds = pcoa(data, number_of_dimensions=n_dim)
        return mds.samples
    if method == 'tsne':
        # no random_state is set, so the embedding can differ between runs
        tsne = TSNE(n_components=n_dim)
        return tsne.fit_transform(data)
    raise ValueError(
        "Unknown dimensionality reduction method " + repr(method)
        + "; expected 'pca', 'mds' or 'tsne'"
    )
        

In [None]:
#cgr.py
import numpy as np

def cgr(chars, order, k):
    """Count k-mers of a sequence on a Chaos Game Representation grid.

    With order='ACGT' the layout is: C top-left, G top-right, A bottom-left,
    T bottom-right.

    Keyword arguments:
    chars: sequence to plot
    order: the four characters of the alphabet; order[2]/order[3] select the
           right half, order[0]/order[3] select the bottom half
    k: value of k-mer; the grid has 2^k rows and 2^k columns

    Returns a (2^k, 2^k) numpy array of counts. Any character that matches
    none of order[0], order[2], order[3] receives no offset, i.e. it is placed
    as if it were the top-left base.
    """
    half = 2**(k-1)
    # rows run top to bottom and columns left to right (array layout, not plot axes)
    counts = np.zeros((2**k, 2**k))
    # the walk starts in the middle of the grid
    col = half
    row = half

    for position, base in enumerate(chars):
        # halve the column (moves towards the left edge) ...
        col = int(col/2)
        # ... and jump into the right half for the two right-hand bases
        if base == order[2] or base == order[3]:
            col += half
        # halve the row (moves towards the top edge) ...
        row = int(row/2)
        # ... and jump into the bottom half for the two bottom bases
        if base == order[0] or base == order[3]:
            row += half
        # only positions that complete a full k-mer are counted
        if position + 1 >= k:
            counts[row][col] += 1
    return counts

In [None]:
#helpers.py
import math, sys
import numpy as np
from statistics import median, mean
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt

def length_calc(seq_list):
    """Calculate length statistics for a collection of sequences.

    Keyword arguments:
    seq_list: a list of sequences (anything supporting len())

    Returns a tuple (max_len, min_len, mean_len, med_len).
    Raises ValueError if seq_list is empty (from max()).
    """
    # Materialise the lengths: map() is a one-shot iterator, so reusing it for
    # min/mean/median after max() would see an already exhausted iterator.
    len_list = [len(seq) for seq in seq_list]
    max_len = max(len_list)
    min_len = min(len_list)
    mean_len = mean(len_list)
    med_len = median(len_list)

    return max_len, min_len, mean_len, med_len

def inter_cluster_dist(clsuter,unique_clusters,distance_matrix, cluster_num):
    """Average pairwise distance between every pair of clusters.

    Keyword arguments:
    clsuter: cluster label of every sequence, in the row/column order of
             distance_matrix (list or array)
    unique_clusters: the distinct cluster labels; only the first cluster_num
                     entries are used
    distance_matrix: square matrix of pairwise sequence distances
    cluster_num: number of clusters to compare

    Returns a (cluster_num, cluster_num) symmetric numpy array whose [i, j]
    entry is the mean of all distances between members of cluster i and
    members of cluster j. The diagonal is left at 0. If either cluster has no
    members the entry is NaN.
    """
    labels = np.asarray(clsuter)
    dist = np.asarray(distance_matrix)
    # row/column positions of the members of each cluster
    members = [np.flatnonzero(labels == unique_clusters[h]) for h in range(cluster_num)]

    avg_dist = np.zeros((cluster_num, cluster_num))
    for i in range(cluster_num):
        for j in range(i+1, cluster_num):
            # sub-matrix of distances: rows = cluster i, columns = cluster j
            dT = dist[np.ix_(members[i], members[j])]
            avg_dist[i, j] = dT.mean() if dT.size else np.nan
            avg_dist[j, i] = avg_dist[i, j]
    return avg_dist

# def mds(dMatPath):
#     """
#     Takes the path to an nxn distance matrix. Performs classical multidimensional scaling and returns an nx5 coordinate matrix, where each row
#     corresponds to one of the input sequences in a 5-dimensional Euclidean space. It also produces a 3D plot of the first three dimensions (very rough testing; points still need to be colour-coded by cluster label, etc.).
#     """

#     # to integrate with your code you can just change it to take input the distance matrix itself instead of the path.
    
#     dMat = np.loadtxt(dMatPath)

#     eigValues, eigVectors = np.linalg.eig(dMat)
#     idx = eigValues.argsort()[::-1][0:5]  
#     selEigValues = eigValues[idx]
#     selEigVectors = eigVectors[:,idx]

#     if False in (selEigValues > 0):
#         print("First 5 largest eigenvalues are not all positive. Exiting..")
#         sys.exit(-1)

#     selEigVectors = np.array(selEigVectors)

#     diagValues = []
#     for i in range(len(selEigValues)):
#         diagValues.append(math.sqrt(eigValues[i]))
        
#     diag = np.diag(diagValues)
#     points = np.dot(selEigVectors,diag)

#     minmaxScalingKameris = []
#     for i in range(5):
#         minmaxScalingKameris.append([ min(points[:,i]), max(points[:,i]) ])

#     scaledPoints = []
#     for i in range(len(dMat)):
#         scaledPoints.append([0, 0, 0, 0, 0])
#         for j in range(5):
#             scaledPoints[i][j] = 2.0 *(points[i][j] - minmaxScalingKameris[j][0]) / ( minmaxScalingKameris[j][1] - minmaxScalingKameris[j][0]) - 1

#     scaledPoints = np.array(scaledPoints) 

#     fig = plt.figure()
#     ax = plt.axes(projection='3d')

#     x = scaledPoints[:,0]
#     y = scaledPoints[:,1]
#     z = scaledPoints[:,2]

#     ax.scatter(x, y, z, c='r', marker='o')

#     fig.show()

#     # return scaled data with first 5 dimensions

#     return scaledPoints

In [None]:
#Main script functions
def compute_cgr(seq_index, seq_dict, keys, method_num, k_val, Result_path):
    """Build the CGR of one sequence, save it, and return its Fourier spectrum.

    seq_index: position of the sequence in keys
    seq_dict: sequence database; seq_dict[id].seq holds the sequence
    keys: list of sequence ids (accession numbers)
    method_num: 14 = CGR, 15 = PuPyCGR, 16 = 1DPuPyCGR
    k_val: k-mer size passed to cgr()
    Result_path: results directory; the CGR is written below its 'Num_rep' folder

    Returns (abs_fft_output, fft_output, cgr_output): the flattened magnitude
    spectrum, the complex FFT (taken along axis 0) and the CGR itself.
    """
    accession = keys[seq_index]
    sequence = str(seq_dict[accession].seq)
    # purine/pyrimidine variants collapse the alphabet: G -> A and C -> T
    if method_num in (15, 16):
        sequence = sequence.replace('G', 'A').replace('C', 'T')
    cgr_raw = cgr(sequence, 'ACGT', k_val)  # shape:[2^k, 2^k]
    # 1DPuPyCGR keeps only the last (bottom) row of the grid
    cgr_output = cgr_raw[-1, :] if method_num == 16 else cgr_raw
    # may not be appropriate to take by column
    fft_output = fft.fft(cgr_output, axis=0)
    abs_fft_output = np.abs(fft_output.flatten())
    out_name = 'cgr'+'k='+str(k_val)+'_'+str(seq_index)
    np.save(os.path.join(Result_path,'Num_rep', out_name), cgr_output)
    return abs_fft_output, fft_output, cgr_output  # flatted into 1d array

def one_dimensional_num_mapping_wrapper(seq_index, method, seq_dict, keys, med_len):
    """Map one sequence to numbers, normalise its length and return its spectrum.

    seq_index: position of the sequence in keys
    method: callable turning a sequence string into a numeric sequence
    seq_dict: sequence database; seq_dict[id].seq holds the sequence
    keys: list of sequence ids (accession numbers)
    med_len: target length (median sequence length); may be a non-integer
             such as 12.5 when the number of sequences is even

    Returns (abs_fft_output, fft_output): the flattened magnitude spectrum and
    the complex FFT of the length-normalised numeric sequence.
    """
    # Use one integer target for both truncation and padding. Previously the
    # truncation used round(med_len) while the padding used int(med_len - len),
    # so a fractional median produced spectra of different lengths.
    target_len = int(round(med_len))
    # normalize sequences to median sequence length of cluster
    seq_new = str(seq_dict[keys[seq_index]].seq)
    if len(seq_new) > target_len:
        seq_new = seq_new[:target_len]
    num_seq = method(seq_new)
    if len(num_seq) < target_len:
        pad_width = target_len - len(num_seq)
        # pywt.pad extends both ends; dropping the left extension keeps the
        # original start and leaves an antisymmetric extension on the right
        num_seq = pywt.pad(num_seq, pad_width, 'antisymmetric')[pad_width:]
    fft_output = fft.fft(num_seq)
    abs_fft_output = np.abs(fft_output.flatten())
    return abs_fft_output, fft_output

# def compute_pearson_coeffient(x, y, i, j):
#     r = pearsonr(x, y)[0]
#     normalized_r = (1-r)/2
#     return normalized_r

# def compute_pearson_coeffient_wrapper(abs_fft_output_list):
#     distance_matrix = (1-np.corrcoef(abs_fft_output_list))/2
#     return(distance_matrix)

# def compute_pearson_coeffient_wrapper(i, j, abs_fft_output_list):
#     # print(abs_fft_output_list)
#     # x = abs_fft_output_list[i]
#     # y = abs_fft_output_list[indices[1]]
#     # print(x)
#     # print(y)
#     return compute_pearson_coeffient(abs_fft_output_list[i], abs_fft_output_list[j],i,j)

def phylogenetic_tree(triangle_matrix, names=None, out=None):
    """Build a UPGMA tree from a distance matrix and write it in Newick format.

    triangle_matrix: distances in the layout expected by Bio.Phylo's
        DistanceMatrix (presumably lower-triangular nested lists - TODO confirm)
    names: sequence names, one per matrix row. Defaults to the module-level
        `keys` list so existing one-argument calls keep working.
    out: writable text stream for the Newick string. Defaults to the
        module-level `tree_print` if one exists, otherwise standard output
        (`tree_print` is not defined anywhere in the visible notebook).

    Returns the Newick string that was written.
    """
    if names is None:
        names = keys
    if out is None:
        # print(file=None) falls back to sys.stdout
        out = globals().get('tree_print')
    distance_matrix = DistanceMatrix(list(names), triangle_matrix)
    constructor = DistanceTreeConstructor()
    # neighbour joining tree not currently output
    # nj_tree = constructor.nj(distance_matrix)
    # neighbour_joining_tree = nj_tree.format('newick')
    upgma = constructor.upgma(distance_matrix)
    upgma_tree = upgma.format('newick')
    print(upgma_tree, file=out)
    # can add code here for visualization with matplotlib
    return upgma_tree
    # can add code here for visualization with matplotlib

In [None]:
# User data setup
# import cProfile, pstats
# profiler = cProfile.Profile()
# profiler.enable()
## User set up, should be changed to argparse
# NOTE(review): absolute, machine-specific paths; should become arguments / a configurable data directory
data_set='/Users/dolteanu/local_documents/Coding/MLDSP_dev_git/data/Primates/fastas'
metadata = '/Users/dolteanu/local_documents/Coding/MLDSP_dev_git/data/Primates/metadata.csv'
Run_name = 'Primates'
# data_set = "/Volumes/NVME-ssd/Gisaid data/Gisaid data 01:11:22/hcov_global_2022-01-09_23-30/Testing/Fastas"
# metadata = "/Volumes/NVME-ssd/Gisaid data/Gisaid data 01:11:22/hcov_global_2022-01-09_23-30/Testing/test_metadata.csv"
# Run_name = "Nextstrain>20"
Result_path = pathlib.Path(os.path.join('./Results',Run_name))
# Every run starts from an empty results directory: previous results for this run name are deleted
if os.path.exists(Result_path):
    shutil.rmtree(Result_path)
os.makedirs(os.path.join(Result_path,"Num_rep",'abs_fft'))
# Dictionary order & names dependent for downstream execution 
methods_list = {1: num_mapping_PP, 2: num_mapping_Int, 3: num_mapping_IntN, 4: num_mapping_Real, 5: num_mapping_Doublet, 6: num_mapping_Codons, 7: num_mapping_Atomic,
                8: num_mapping_EIIP, 9: num_mapping_AT_CG, 10: num_mapping_justA, 11: num_mapping_justC, 12: num_mapping_justG, 13: num_mapping_justT, 14: 'cgr',15: 'PuPyCGR', 16: '1DPuPyCGR'}
# Change method number referring the variable above (between 1 and 16)
method_num = 14
k_val = 4  # used only for CGR-based representations(if methodNum=14,15,16)
## End of user set up


## Not currently implemented, for future development
# test_set = None 
# seq_to_test = 0
# min_seq_len = 0
max_clust_size = 10000000
# frags_per_seq = 1
method = methods_list.get(method_num)
seq_dict, total_seq, cluster_dict, cluster_stats = preprocessing(data_set,max_clust_size, metadata)
print(cluster_stats)

# Could be parallelized in the future

# variable holding all the keys (accession numbers) for corresponding clusters
keys = list(seq_dict.keys())
# values = list(cluster_dict.values())
# Labels are looked up per sequence id, so they follow the order of keys.
# Fail with a readable message when a sequence has no metadata entry
# (a plain lookup would raise a bare KeyError for the first such id).
missing_labels = [key for key in keys if key not in cluster_dict]
if missing_labels:
    raise KeyError(f"{len(missing_labels)} sequence(s) have no cluster label in the metadata file, e.g. {missing_labels[:5]}")
labels = [cluster_dict[key] for key in keys]
seqs_length = [len(seq_dict[key].seq) for key in keys]
med_len = median(seqs_length)
# the value is the median, so label it as such
print('Median length: ' + str(med_len))
fft_output_list = []
abs_fft_output_list = []
cgr_output_list = []
seq_new_list = []
seq_list = []

In [None]:
#CGR or numerical representation & fourier transform 
# import cProfile, pstats
# profiler = cProfile.Profile()
# profiler.enable()
print('Generating numerical sequences, applying DFT, computing magnitude spectra .... \n')
# for seq_index in range(total_seq):
#     if method_num == 14 or method_num == 15 or method_num == 16:
#         abs_fft_output, fft_output, cgr_output = compute_cgr(seq_index,seq_dict,keys,method_num,k_val)
#         abs_fft_output_list.append(abs_fft_output)
#         fft_output_list.append(fft_output)
#         cgr_output_list.append(cgr_output)
#         # seq_new_list.append(seq_new)
#         # seq_list.append(seq)
#     else:
#         abs_fft_output, fft_output, seq_new, seq = one_dimensional_num_mapping_wrapper(seq_index)
#         fft_output_list.append(fft_output)
#         abs_fft_output_list.append(abs_fft_output)
#         seq_new_list.append(seq_new)
#         seq_list.append(seq)

# Start from empty result lists so re-running this cell does not append a
# second copy of every spectrum.
fft_output_list = []
abs_fft_output_list = []
cgr_output_list = []

# The context manager shuts the worker processes down when the block ends;
# pool.map() is blocking, so all results have been collected by then.
with Pool() as pool:
    if method_num in (14, 15, 16):
        # compute_cgr requires Result_path (it saves each CGR); it was missing
        # from the partial, which made every worker call raise TypeError
        cgr_worker = partial(compute_cgr, seq_dict=seq_dict, keys=keys, method_num=method_num, k_val=k_val, Result_path=Result_path)
        for abs_fft_output, fft_output, cgr_output in pool.map(cgr_worker, range(total_seq)):
            abs_fft_output_list.append(abs_fft_output)
            fft_output_list.append(fft_output)
            cgr_output_list.append(cgr_output)
    else:
        mapping_worker = partial(one_dimensional_num_mapping_wrapper, method = method, seq_dict=seq_dict, keys=keys, med_len=med_len)
        for abs_fft_output, fft_output in pool.map(mapping_worker, range(total_seq)):
            abs_fft_output_list.append(abs_fft_output)
            fft_output_list.append(fft_output)
# profiler.disable()
#     # with open('profile2.txt','w') as p:
# stats = pstats.Stats(profiler)
# stats.strip_dirs()
# stats.sort_stats('cumtime')
# # stats.dump_stats('./profile2.prof')
# stats.print_stats(30)
# stats.print_callers()


In [None]:
# Preview the CGR of the first sequence. CGRs only exist for the CGR-based
# methods (14-16); for the 1-D mappings the list is empty and nothing is drawn.
if cgr_output_list:
    # method 16 stores a single row (1-D); matshow needs a 2-D array
    plt.matshow(np.atleast_2d(cgr_output_list[0]),cmap=cm.gray_r)
    plt.xticks([])
    plt.yticks([])

In [None]:
#Distance matrix
print('Building distance matrix')

# # hdf5 implementation
# with h5py.File("distmat"+Run_name+".hdf5", "w") as dist_mat_file:
#     distance_matrix = dist_mat_file.create_dataset("dist_mat", (total_seq, total_seq))
#     for i in range(total_seq):
#         for j in range(total_seq):
#             distance_matrix[i,j] = compute_pearson_coeffient_wrapper(i,j,abs_fft_output_list)
# reading file if already made
# # rand = h5py.File('/Users/dolteanu/local_documents/Coding/MLDSP_dev_git/distmat.hdf5', 'r')
# # distance_matrix = rand['dist_mat']

# #parallel implementation
# for normalized_r, i, j in pool.starmap(partial(compute_pearson_coeffient_wrapper, abs_fft_output_list=abs_fft_output_list), ((i, j) for i in range(total_seq) for j in range(total_seq)),chunksize=total_seq):
#     distance_matrix[i,j] = normalized_r
# distance_matrix = compute_pearson_coeffient_wrapper(abs_fft_output_list)
# # Numpy implementation
# Pearson correlation between every pair of magnitude spectra ...
distance_matrix = np.corrcoef(abs_fft_output_list)
# ... mapped to a distance: r = 1 -> 0, r = -1 -> 1
distance_matrix = (1 - distance_matrix) / 2
np.save(os.path.join(Result_path, 'dist_mat'), distance_matrix)

In [None]:
# Molecular distance map
print('Scaling & data visualisation')
# integer-encode the cluster labels so they can drive the point colours
le = pe.LabelEncoder()
labs = le.fit_transform(labels)
# project the distance matrix onto its first three principal components
scaled_distance_matrix = dimReduction(distance_matrix, n_dim=3, method='pca')
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(
    scaled_distance_matrix[:, 0],
    scaled_distance_matrix[:, 1],
    scaled_distance_matrix[:, 2],
    c=labs,
)
fig.savefig(os.path.join(Result_path, 'MoDmap.png'))

In [None]:
# ML classification
print('Performing classification .... \n')
# 10-fold cross-validation, capped at the number of sequences for tiny datasets
folds = min(10, total_seq)
mean_accuracy, accuracy, cmatrix, misClassifiedDict = classify_dismat(distance_matrix, labels, folds, total_seq)
# accuracy,avg_accuracy, clNames, cMat
# accuracies = [accuracy, avg_accuracy];
# report the real number of classifiers instead of a hard-coded (and wrong) count
print(f'Classification accuracy {len(accuracy)} classifiers\n', accuracy)
print('**** Processing completed ****\n')