In [268]:
import pandas as pd
import numpy as np
import Bio as bio
import seaborn as sns

In [269]:
dummygenes = pd.read_csv('dummygenes.csv')
dummygenes.index = dummygenes["0"]
dummygenes.drop(columns = '0', inplace = True)

In [387]:
genomes = pd.read_csv('./genomes.csv')
genomes.index = genomes["0"]
genomes.drop(columns = ["0", "Unnamed: 0"], inplace = True)
genomes.columns

Index(['sequences'], dtype='object')

from scipy.cluster.hierarchy import ward
import matplotlib.pyplot as plt

#Determine ward distances between genetic sequences
linkage_array = ward(dummygenes[0::])

#Plot the dendrogram depicting evolutionary distance:
dendrogram(linkage_array)
plt.xlabel("Sample index")
plt.ylabel("Cluster distance")

In [270]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=42).fit(dummygenes[0::])
klabels = pd.DataFrame(index = dummygenes.index, data = kmeans.labels_)

In [332]:
#Function to classify.
#Takes either two strings or two lists of strings.

from Bio import SeqIO
import pandas as pd
def GenomeFormatter(genomes):
    index = genomes.index
    #Values replaced with dummies:
    
    #list of valid inputs:
    atcg = ["A", "T", "C", "G"]
    #Splitting into separate columns:
    genomes = pd.DataFrame(genomes['Sequence'].apply(lambda x: x+("0" * (31775-len(x)))))
    genomes = pd.DataFrame(genomes['Sequence'].apply(lambda x: list(x)))
    genomes = pd.DataFrame(genomes['Sequence'].to_list())
    
    #dropping "noise" from sequencing:
    genomes[~genomes.isin(atcg)] = None
    genomes.fillna(0, inplace = True)
    
    #replacing each nucleotide with its respective dummy value:
    for n in range(0, len(atcg)):
        genomes.replace(atcg[n], (n+1), inplace = True)
    
    #Filling extra space in with kmeans.expected_n_features with 0.
    #TODO: truncate if it's too long.
    genomes.index = index
    appended = pd.concat([dummygenes.head(0), genomes]).fillna(0)
    return genomes

In [333]:
def FastaReader(fastafile):
    from Bio import SeqIO
    with open(fastafile) as fasta_file:  # Will close handle cleanly
        identifiers = []
        lengths = []
        seqs = []
        for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
            identifiers.append(seq_record.id)
            seqs.append(seq_record.upper().seq)
        df = pd.DataFrame(identifiers)
        df["Sequence"] = seqs
        df.index = df[0]
        df.drop(columns = 0, inplace = True)
    return df

In [409]:
#Genome comparison without Numpy - slow but functional
import difflib
def genomecompare (y):
    tree = genomes['sequences'].apply(lambda x: difflib.SequenceMatcher(None, str(y), str(x)).ratio())
    return pd.DataFrame(tree)
#.sort_values(0, ascending = False)

In [410]:
#Takes dataframe from CoronaClass and returns dataframe of cluster 
#TODO: Return 5 closest matches.
#Takes a filepath containing fasta-formatted genomes and returns classifications for each:
def CoronaClassifier(filepath):
    #Reading file from fasta format into dataframe
    df = FastaReader(filepath)
    
    #Comparing genome against previously found samples:
    tree = genomecompare(str(df["Sequence"][0]))
    
    #Formatting for Kmeans:
    formatted = GenomeFormatter(df)
    
    #Classify input genomes:
    result = kmeans.predict(formatted.apply(lambda x: list(x)))

    return result, tree

In [411]:
#CoronaClassifier('./test.txt')
genomecompare(str(FastaReader('./test.txt')["Sequence"][0]))
#genomecompare(FastaReader('./test.txt')["Sequence"])

Unnamed: 0_level_0,sequences
0,Unnamed: 1_level_1
AC_000192.1,0.000000
AF095702.1,0.000000
AF220295.1,0.000125
AF353511.1,0.000000
AF391541.1,0.000125
...,...
MT831673.1,0.000000
MT831674.1,0.000000
MT831675.1,0.000000
MT831676.1,0.000000


In [None]:
result, tree = CoronaClassifier('./test.txt')

In [None]:
result

In [None]:
tree