# Classifying DNA sequences using various machine learning algorithms
The goal of this notebook is to identify whether a sequence of DNA is a promoter (+) or not (-) using machine learning algorithms: K nearest neighbors, Gaussian Process, Decision Tree, Random Forest, Neural Network, AdaBoost, Naive Bayes, Support Vector Machine (with various kernels). We will be evaluating the performance of these classifiers.

In [1]:
import pandas as pd
import numpy as np

In [95]:
path = 'C:/Users/Chris/Documents/Data-Science-Projects/Dataset/promoters.data'
data = pd.read_csv(path,header=None)
data.columns = ['Class','id','Sequence']
data

Unnamed: 0,Class,id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
5,+,MALEFG,\taggggcaaggaggatggaaagaggttgccgtataaagaaactag...
6,+,MALK,\t\tcagggggtggaggatttaagccatctcctgatgacgcatagt...
7,+,RECA,\t\ttttctacaaaacacttgatactgtatgagcatacagtataat...
8,+,RPOB,\t\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatc...
9,+,RRNAB_P1,\tttttaaatttcctcttgtcaggccggaataactccctataatgc...


Need to get rid of \t

In [96]:
#removing tab
def replace(x):
    return x.replace('\t','')
data['Sequence'] = data['Sequence'].apply(replace)
data

Unnamed: 0,Class,id,Sequence
0,+,S10,tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgc...
1,+,AMPC,tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaa...
2,+,AROH,gtactagagaactagtgcattagcttatttttttgttatcatgcta...
3,+,DEOP2,aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaata...
4,+,LEU1_TRNA,tcgataattaactattgacgaaaagctgaaaaccactagaatgcgc...
5,+,MALEFG,aggggcaaggaggatggaaagaggttgccgtataaagaaactagag...
6,+,MALK,cagggggtggaggatttaagccatctcctgatgacgcatagtcagc...
7,+,RECA,tttctacaaaacacttgatactgtatgagcatacagtataattgct...
8,+,RPOB,cgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaa...
9,+,RRNAB_P1,ttttaaatttcctcttgtcaggccggaataactccctataatgcgc...


In [65]:
from sklearn.preprocessing import LabelEncoder

#transform the Classes + and - into 0s and 1s
encode = LabelEncoder()
y = encode.fit_transform(data['Class'])

#split the string of sequence into individual nucleotides 
data_split = pd.DataFrame(list(data['Sequence'].apply(list)))

#transform the nucleotide feature matrix into dummy variables
X = pd.get_dummies(data_split)

# Training and testing various algorithms' performance on the dataset
 

In [66]:
#import models of interest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [69]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
# define a seed to produce reproducible results
seed = 1
# split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [73]:
# scoring method
scoring = 'accuracy'

# define the models for training
names = ['K-nearest neighbors','Gaussian Process','Decision Tree',
        'Random Forest','Neural Network','AdaBoost','Naive Bayes',
        'SVM Linear','SVM RBF','SVM Sigmoid']
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5,n_estimators=10,max_features=1),
    MLPClassifier(alpha=1,max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel = 'linear',gamma='auto'),
    SVC(kernel = 'rbf',gamma='auto'),
    SVC(kernel = 'sigmoid',gamma='auto')
]
# Evaluate the accuracy of the algorithm using cross validation
results = []
for name, classifier in zip(names,classifiers):
    kfold = KFold(n_splits=10, random_state=seed)
    # this function wraps around the classifier and the kfold, doing the training and scoring
    cv_results = cross_val_score(classifier, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(CV_results)
    print('%s: %f (%f)' % (name,cv_results.mean(),cv_results.std()))

K-nearest neighbors: 0.823214 (0.113908)
Gaussian Process: 0.873214 (0.056158)
Decision Tree: 0.700000 (0.225000)
Random Forest: 0.673214 (0.136709)
Neural Network: 0.875000 (0.096825)
AdaBoost: 0.912500 (0.112500)
Naive Bayes: 0.837500 (0.137500)
SVM Linear: 0.850000 (0.108972)
SVM RBF: 0.737500 (0.117925)
SVM Sigmoid: 0.569643 (0.159209)


In [92]:
# Train the model
for classifier in classifiers:
    classifier.fit(X_train,y_train)

In [94]:
# Now evaluating the model using the test set
for name, classifier in zip(names,classifiers):
    print('%s\n%s: %f\n%s: ' % (name,'Accuracy',accuracy_score(classifier.predict(X_test),y_test),
                                'Report'))
    print(classification_report(classifier.predict(X_test),y_test))

K-nearest neighbors
Accuracy: 0.777778
Report: 
              precision    recall  f1-score   support

           0       1.00      0.62      0.77        16
           1       0.65      1.00      0.79        11

   micro avg       0.78      0.78      0.78        27
   macro avg       0.82      0.81      0.78        27
weighted avg       0.86      0.78      0.78        27

Gaussian Process
Accuracy: 0.888889
Report: 
              precision    recall  f1-score   support

           0       1.00      0.77      0.87        13
           1       0.82      1.00      0.90        14

   micro avg       0.89      0.89      0.89        27
   macro avg       0.91      0.88      0.89        27
weighted avg       0.91      0.89      0.89        27

Decision Tree
Accuracy: 0.814815
Report: 
              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.71      1.00      0.83        12

   micro avg       0.81      0.81      0.81    

# Conclusion:
The best algorithm to use in this dataset is support vector machine using linear kernel