In [16]:
from Bio import SeqIO
import pandas as pd

In [41]:
def getKmers(sequence, size=6):
    """Return every overlapping k-mer of ``sequence``, lower-cased.

    Parameters
    ----------
    sequence : str
        The sequence to slice.
    size : int, default 6
        Length of each k-mer.

    Returns
    -------
    list of str
        The k-mers in order of their start position; empty when
        ``sequence`` is shorter than ``size``.
    """
    kmers = []
    window_count = len(sequence) - size + 1
    for start in range(window_count):
        window = sequence[start:start + size]
        kmers.append(window.lower())
    return kmers

def get_hex_df(file_name, virus):
    """Load a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    file_name : str or path-like
        Path of the FASTA file to read.
    virus : str
        Label written to the ``virus`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``names`` (record id), ``sequences`` (the
        sequence as a plain string), ``virus`` (the label passed in) and
        ``hexamers`` (the lower-cased overlapping 6-mers produced by
        ``getKmers``, joined with single spaces).
    """
    names = []
    sequences = []

    # Use a context manager so the file handle is closed when parsing ends.
    # The records are consumed inside the block because SeqIO.parse yields
    # them as it reads from the handle.
    with open(file_name) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            names.append(record.id)
            sequences.append(str(record.seq))

    sequences_df = pd.DataFrame({'names': names, 'sequences': sequences})
    sequences_df['virus'] = virus

    # One space-separated string per sequence, so each hexamer can later be
    # treated as a whitespace-delimited token.
    sequences_df['hexamers'] = [' '.join(getKmers(seq)) for seq in sequences]

    return sequences_df

## Proof-of-Concept: Supervised Learning with FluA Genomes

In [42]:
# Parse the H1N1 FASTA file and tag every record with the 'H1N1' label.
h1n1 = get_hex_df(file_name='flu/H1N1.fa', virus='H1N1')

In [43]:
# Preview the first rows to sanity-check the parsed ids, sequences and hexamer strings.
h1n1.head()

Unnamed: 0,names,sequences,virus,hexamers
0,CY083910,GGAAAACAAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCT...,H1N1,ggaaaa gaaaac aaaaca aaacaa aacaaa acaaaa caaa...
1,CY063606,AAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCTATATACA...,H1N1,aaaagc aaagca aagcaa agcaac gcaaca caacaa aaca...
2,CY083776,AAACAAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCTATA...,H1N1,aaacaa aacaaa acaaaa caaaag aaaagc aaagca aagc...
3,CY073725,AAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCTATATACA...,H1N1,aaaagc aaagca aagcaa agcaac gcaaca caacaa aaca...
4,CY062691,AAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCTATATACA...,H1N1,aaaagc aaagca aagcaa agcaac gcaaca caacaa aaca...


In [44]:
# Parse the H3N2 FASTA file and tag every record with the 'H3N2' label.
h3n2 = get_hex_df(file_name='flu/H3N2.fa', virus='H3N2')

In [45]:
# Preview the first rows to sanity-check the parsed ids, sequences and hexamer strings.
h3n2.head()

Unnamed: 0,names,sequences,virus,hexamers
0,CY113229,TAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTTT...,H3N2,taattc aattct attcta ttctat tctatt ctatta tatt...
1,CY112460,TAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTTT...,H3N2,taattc aattct attcta ttctat tctatt ctatta tatt...
2,CY114553,GGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTAC...,H3N2,ggggat gggata ggataa gataat ataatt taattc aatt...
3,KY694983,ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCG...,H3N2,atgaag tgaaga gaagac aagact agacta gactat acta...
4,MN299965,ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCG...,H3N2,atgaag tgaaga gaagac aagact agacta gactat acta...


In [46]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to stack frames. ignore_index=True gives the combined
# frame a fresh 0..n-1 index instead of repeating each input's row labels.
combined = pd.concat([h1n1, h3n2], ignore_index=True)

In [50]:
# Features are the space-separated hexamer strings; the target is the subtype label.
X, y = combined['hexamers'], combined['virus']

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
# Every hexamer is a whitespace-separated token in X, so ngram_range=(4,4)
# counts runs of four consecutive hexamers.
# NOTE(review): the vocabulary is fitted on all rows here, before the
# train/test split, so it also contains terms seen only in test rows —
# confirm this is acceptable for the evaluation.
cv = CountVectorizer(ngram_range=(4,4))
X_vec = cv.fit_transform(X)

In [54]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.20, random_state=26
)

In [56]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes over the n-gram counts; alpha is the additive
# smoothing parameter.
classifier = MultinomialNB(alpha=0.1)
# Fit on the training split only; as the last expression, this also displays
# the fitted estimator.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.1)

In [57]:
# Predict a subtype label for every row of the held-out test split.
y_pred = classifier.predict(X_test)

In [58]:
import sklearn.metrics as metrics

In [60]:
# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision with recall and takes the support counts from the
# predictions instead of the true labels.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        H1N1       1.00      1.00      1.00       556
        H3N2       1.00      1.00      1.00       283

    accuracy                           1.00       839
   macro avg       1.00      1.00      1.00       839
weighted avg       1.00      1.00      1.00       839

