In [10]:
import librtd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
import tqdm

from Bio import SeqIO

# Read FASTA-formatted .txt files and calculate the RTD (return time distribution) metric

In [180]:
def rtd_metric(folder_path, kmer, reverse_complement, pairwise):
    """Build a table of return-time-distribution (RTD) metrics for a folder.

    Every ``*.txt`` file directly inside ``folder_path`` is parsed as FASTA,
    and each record's sequence is passed to
    ``librtd.return_time_distribution``. One row is produced per FASTA record.

    Parameters
    ----------
    folder_path : str or path-like
        Directory containing the FASTA-formatted ``.txt`` files.
    kmer : int
        k-mer size, forwarded to ``librtd.return_time_distribution``.
    reverse_complement : bool
        Forwarded unchanged to ``librtd.return_time_distribution``.
    pairwise : bool
        Forwarded unchanged to ``librtd.return_time_distribution``.

    Returns
    -------
    pandas.DataFrame
        One row per sequence (index labels start at 1) and one column per key
        of the mapping returned by ``librtd.return_time_distribution``.

    Raises
    ------
    ValueError
        If no FASTA record is found in any ``*.txt`` file of ``folder_path``.
    """
    # No '**' component in the pattern, so glob's `recursive` flag would have
    # no effect and is omitted. Sorting makes the row order reproducible
    # instead of depending on the filesystem's listing order.
    files = sorted(glob.glob(f"{folder_path}/*.txt"))

    # Collect one mapping per sequence and build the frame once at the end:
    # rows are aligned by key (not by position), and the frame is not grown
    # row by row.
    records = []
    for seq_file in files:
        # Context manager guarantees the handle is closed after parsing.
        with open(seq_file) as handle:
            for fasta in SeqIO.parse(handle, 'fasta'):
                rtd = librtd.return_time_distribution(
                    str(fasta.seq), kmer, reverse_complement, pairwise
                )
                records.append(dict(rtd))

    if not records:
        raise ValueError(
            f"No FASTA records found in any '*.txt' file under {folder_path}"
        )

    # Keep the 1-based row labels that earlier versions of this function used.
    return pd.DataFrame(records, index=pd.RangeIndex(1, len(records) + 1))

In [228]:
# RTD feature table for every sequence in the Basidiomycota folder
# (kmer=2, reverse_complement off, pairwise on).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
Basidiomycota = rtd_metric(
    folder_path="/Users/davidchen/Documents/GitHub/rtd/Fungi/Basidiomycota",
    kmer=2,
    reverse_complement=False,
    pairwise=True,
)

In [229]:
# RTD feature table for every sequence in the Pezizomycotina folder
# (kmer=2, reverse_complement off, pairwise on).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
Pezizomycotina = rtd_metric(
    folder_path="/Users/davidchen/Documents/GitHub/rtd/Fungi/Pezizomycotina",
    kmer=2,
    reverse_complement=False,
    pairwise=True,
)

In [230]:
# RTD feature table for every sequence in the Saccharomycotina folder
# (kmer=2, reverse_complement off, pairwise on).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
Saccharomycotina = rtd_metric(
    folder_path="/Users/davidchen/Documents/GitHub/rtd/Fungi/Saccharomycotina",
    kmer=2,
    reverse_complement=False,
    pairwise=True,
)

In [231]:
# Stack the three feature tables into one frame with a fresh 0-based index.
metrics = pd.concat(
    [Basidiomycota, Pezizomycotina, Saccharomycotina],
    ignore_index=True,
)

In [232]:
# Label each row by its source table:
# 1 = Basidiomycota, 2 = Pezizomycotina, 3 = Saccharomycotina.
# Relies on `metrics` keeping the rows in that concatenation order.
metrics['Class'] = np.repeat(
    [1, 2, 3],
    [len(Basidiomycota), len(Pezizomycotina), len(Saccharomycotina)],
).tolist()

In [233]:
# Show the combined feature table; the last column ('Class') holds the labels.
metrics

Unnamed: 0,TC_rc_std,TA_rc_std,CT_rc_std,CG_rc_std,AT_rc_mean,CA_rc_std,TT_rc_std,GT_rc_std,GC_rc_std,TG_rc_mean,...,GA_rc_mean,TT_rc_mean,AA_rc_std,CT_rc_mean,GA_rc_std,AC_rc_mean,GC_rc_mean,CC_rc_std,GT_rc_mean,Class
0,24.978759,6.111414,18.984465,70.397293,8.287659,24.889063,13.054061,20.355571,50.536820,24.730135,...,21.460889,14.494313,12.276219,20.036142,21.159872,21.644916,48.895252,48.665950,21.176851,1
1,23.389764,6.035017,16.794521,133.680091,7.445828,21.552743,11.319110,22.292830,65.726512,24.900537,...,23.421240,11.387218,9.548308,18.459556,22.713124,20.688596,54.861842,57.050188,23.610936,1
2,32.336123,10.077081,25.709064,52.387013,12.010172,25.579567,29.825278,26.405804,31.460881,24.536401,...,25.474032,26.436740,31.019771,23.985995,25.666019,22.685348,31.069163,37.658378,24.015551,1
3,33.040260,6.169171,26.477173,141.219397,8.506214,35.040340,13.617322,26.834271,67.832943,31.383849,...,25.633142,11.782344,13.370596,22.985982,25.744616,27.694194,47.967105,59.539987,26.805354,1
4,21.448042,6.262199,16.491826,62.597828,8.780216,19.778036,12.532370,19.955096,48.627651,22.031419,...,20.084990,13.744035,12.823560,17.418328,19.656395,19.490452,41.966592,49.664521,19.314249,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,29.941244,4.736982,22.222098,131.606910,6.167007,33.901874,21.119875,31.341400,61.991267,27.346337,...,25.864865,17.666453,21.035720,22.734238,25.610395,24.990576,51.410256,57.872757,27.621872,3
220,50.801494,4.186607,53.265960,200.576048,4.790397,76.872876,9.914388,53.842450,195.813618,43.118280,...,38.823360,9.645173,10.400188,44.756437,49.444854,44.307692,125.908935,93.570000,42.103330,3
221,50.535984,4.904978,69.073790,161.855622,4.769671,78.199718,11.148499,50.053810,174.738098,43.192288,...,45.684292,10.384256,12.857125,52.100939,58.126396,46.428339,105.224816,73.167419,38.909630,3
222,43.076688,5.304593,23.960713,68.358312,7.005028,20.891791,56.390187,21.478125,69.614863,24.393240,...,31.250364,24.199822,45.277189,21.311454,44.213965,18.038086,51.589447,34.498025,20.641493,3


# Machine Learning Proof of Concept

In [192]:
from sklearn.model_selection import cross_validate
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import tree

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier


In [226]:
def benchmark(metrics, cv=10, random_state=0):
    """Print the mean cross-validated accuracy of several classifiers.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Feature table whose LAST column holds the class label; every other
        column is used as a feature.
    cv : int, default 10
        Forwarded to ``cross_validate`` as its ``cv`` argument.
    random_state : int, default 0
        Seed given to every estimator that accepts one, so repeated runs on
        the same table print the same accuracies.

    Returns
    -------
    None
        Results are printed, one ``"<name>: <mean accuracy>"`` line per model.
    """
    scoring = ['accuracy']

    # Slice features and labels once instead of once per model.
    features = metrics.iloc[:, :-1]
    labels = metrics.iloc[:, -1]

    # Insertion order defines the order of the printed lines.
    # KNeighborsClassifier has no random_state parameter.
    models = {
        "SVC": svm.SVC(kernel='linear', C=1, random_state=random_state),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "SGD": SGDClassifier(loss="hinge", penalty="l2", max_iter=100,
                             random_state=random_state),
        "Decision Tree": tree.DecisionTreeClassifier(random_state=random_state),
        "Adaboost": AdaBoostClassifier(n_estimators=100,
                                       random_state=random_state),
    }

    for label, model in models.items():
        scores = cross_validate(model, X=features, y=labels,
                                scoring=scoring, cv=cv)
        print(label + ": " + str(scores['test_accuracy'].mean()))


# K=1, RC=False, Pairwise=False

In [204]:
# NOTE(review): hidden state — this cell's execution count (204) predates the
# visible benchmark() definition (In [226]) and the visible `metrics` cells
# (In [228]-[233], built with kmer=2, pairwise=True). The recorded output also
# contains an "MLP" line that the visible benchmark() does not print.
# Rebuild `metrics` with kmer=1, reverse_complement=False, pairwise=False
# before re-running to reproduce this section.
benchmark(metrics)

SVC: 0.8436758893280633
KNN: 0.8128458498023715
SGD: 0.7634387351778656
Decision Tree: 0.817391304347826
Adaboost: 0.7909090909090909
MLP: 0.46403162055335956


# K=2, RC=False, Pairwise=False

In [227]:
# NOTE(review): hidden state — this cell ran (In [227]) before the visible
# `metrics` cells (In [228]-[233]), which use pairwise=True. Rebuild `metrics`
# with kmer=2, reverse_complement=False, pairwise=False before re-running to
# reproduce this section.
benchmark(metrics)

SVC: 0.884387351778656
KNN: 0.7772727272727272
SGD: 0.7284584980237153
Decision Tree: 0.8794466403162057
Adaboost: 0.8077075098814228


# K=3, RC=False, Pairwise=False

In [218]:
# NOTE(review): hidden state — this cell's execution count (218) predates the
# visible benchmark() definition (In [226]) and the visible `metrics` cells
# (In [228]-[233], built with kmer=2, pairwise=True). The recorded output also
# contains an "MLP" line that the visible benchmark() does not print.
# Rebuild `metrics` with kmer=3, reverse_complement=False, pairwise=False
# before re-running to reproduce this section.
benchmark(metrics)

SVC: 0.9021739130434783
KNN: 0.7810276679841898
SGD: 0.7849802371541502
Decision Tree: 0.8885375494071145
Adaboost: 0.7185770750988143
MLP: 0.46403162055335956


# K=2, RC=False, Pairwise=True

In [234]:
# Evaluates the `metrics` table built by the rtd_metric cells in this notebook
# (kmer=2, reverse_complement=False, pairwise=True; In [228]-[233]).
benchmark(metrics)

SVC: 0.807905138339921
KNN: 0.7770750988142292
SGD: 0.7005928853754941
Decision Tree: 0.8934782608695653
Adaboost: 0.8480237154150198


# K=1, RC=True, Pairwise=False

In [196]:
# NOTE(review): hidden state — this cell's execution count (196) predates the
# visible benchmark() definition (In [226]) and the visible `metrics` cells
# (In [228]-[233], built with kmer=2, reverse_complement=False, pairwise=True).
# The recorded output also contains an "MLP" line that the visible benchmark()
# does not print. Rebuild `metrics` with kmer=1, reverse_complement=True,
# pairwise=False before re-running to reproduce this section.
benchmark(metrics)

SVC: 0.8395256916996047
KNN: 0.7907114624505929
SGD: 0.7146245059288537
Decision Tree: 0.8446640316205534
Adaboost: 0.8395256916996049
MLP: 0.46403162055335956
