In [1]:
import sys
import numpy
import sklearn
import pandas

from sklearn import model_selection
from sklearn.metrics import classification_report, accuracy_score

In [2]:
import numpy as np
import pandas as pd

# Read the labelled training sequences and the unlabelled test sequences.
# (Described by the original author as the UCI Molecular Biology
# "Promoter Gene Sequences" data set.)
# Both files are parsed without a header row; column names are supplied here.
data = pd.read_csv('training.data', header=None, names=['group', 'seq'])
test_data = pd.read_csv('test.data', header=None, names=['seq'])

In [3]:
def process(df, train=True):
    """One-hot encode nucleotide sequences into a numeric feature frame.

    Each string in ``df['seq']`` is split into single characters (tab
    characters are discarded) and every position is one-hot encoded with
    ``pd.get_dummies``, producing columns named ``"<position>_<character>"``.
    Only the indicator columns for the characters A, C, G and T are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'seq'`` column and, when ``train`` is true, a
        ``'group'`` column holding the class label of each row.
    train : bool, default True
        When true, the labels from ``df['group']`` are appended unchanged as
        a trailing ``'Class'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input sequence (fresh 0..n-1 index), the A/C/G/T
        indicator columns, and ``'Class'`` last when ``train`` is true.

    Raises
    ------
    ValueError
        If the sequences do not all have the same length once tabs are
        removed.

    Notes
    -----
    Indicator columns are only created for characters actually observed at a
    position in ``df``, so two calls on different data can return different
    column sets. Align the columns before passing both results to one model.
    """
    # Split every sequence into its characters, discarding tab characters.
    rows = [[ch for ch in seq if ch != '\t'] for seq in df['seq']]

    # A ragged table cannot be encoded position by position.
    if len({len(row) for row in rows}) > 1:
        raise ValueError(
            'all sequences must have the same length after removing tabs'
        )

    # Encode the nucleotide positions only. The labels are deliberately kept
    # out of get_dummies so they never have to be reassembled from indicator
    # columns (which previously relied on chained-indexing assignment).
    encoded = pd.get_dummies(pd.DataFrame(rows))

    # Keep only indicator columns whose name ends in A, C, G or T; columns
    # for any other character are dropped.
    encoded = encoded.filter(regex='(?:A|C|G|T|Class)$', axis=1)

    if train:
        # Attach labels positionally so a non-default index on df is harmless.
        encoded = encoded.assign(Class=df['group'].tolist())

    return encoded

In [4]:
# Encode the labelled training data: one-hot nucleotide features plus a 'Class' column.
df = process(data)

In [5]:
# Encode the unlabelled test sequences; train=False skips all class-label handling.
test_df = process(test_data, train=False)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


# Split the encoded training frame into a feature matrix and a label vector.
# `columns=` is used because pandas 2.0+ no longer accepts the axis as a
# positional argument to DataFrame.drop.
X = np.array(df.drop(columns=['Class']))
y = np.array(df['Class'])

# Shared seed so the hold-out split is repeatable between runs.
seed = 1

# Hold out 25% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=seed
)

In [7]:
names = ["SVM RBF"]

scoring = 'accuracy'

model = SVC(kernel = 'rbf')

# 10-fold cross-validation on the training split.
# random_state is intentionally not passed: with the default shuffle=False it
# never influenced the folds, and scikit-learn 0.24+ raises ValueError when a
# random_state is given without shuffle=True. The folds are unchanged.
kfold = model_selection.KFold(n_splits=10)
cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
msg = "%s: %f (%f)" % ("SVM RBF", cv_results.mean(), cv_results.std())
print(msg)

# cross_val_score fits clones, so fit the model itself on the whole training
# split before scoring it on the held-out rows.
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print('Test-- SVM RBF: ',accuracy_score(y_test, predictions))
print()
print(classification_report(y_test, predictions))



SVM RBF: 0.958852 (0.012329)
Test-- SVM RBF:  0.9684361549497847

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       177
           1       0.96      0.95      0.96       168
           2       0.99      0.97      0.98       352

    accuracy                           0.97       697
   macro avg       0.96      0.97      0.96       697
weighted avg       0.97      0.97      0.97       697



In [8]:
#now for the actual predictions
# Line the test features up with the columns the model was trained on.
# process() only creates indicator columns for characters that occur in the
# frame it is given, so the test frame may lack (or add) columns relative to
# the training frame. Missing indicators are filled with 0 and extras are
# dropped; when the columns already match this is a no-op.
feature_columns = df.drop(columns=['Class']).columns
X = np.array(test_df.reindex(columns=feature_columns, fill_value=0))

predictions = model.predict(X)
display(predictions)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [9]:
# Write the test-set predictions to out.csv straight from the model output
# instead of a hand-pasted copy, so the file always matches the current model.
# `predictions` is also assigned by the evaluation cells (hold-out
# predictions), so make sure it is the test-set result before saving.
if len(predictions) != len(test_df):
    raise ValueError(
        "predictions has %d rows but test_df has %d; re-run the prediction "
        "cell before saving" % (len(predictions), len(test_df))
    )

lst = predictions.tolist()

dframe = pd.DataFrame(lst)

dframe.to_csv('out.csv', index=False)

In [10]:
# define scoring method
scoring = 'accuracy'

#put names of models here to be trained
names = ['K nearest-neighbors','MLP Classifier','Decision Tree','Random Forest','SVM Linear','SVM RBF']

#include their classifiers into this list
# Estimators with internal randomness are given the shared seed so that
# repeated runs of this cell report the same scores.
classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    MLPClassifier(alpha = 1, random_state = seed),
    DecisionTreeClassifier(max_depth = 5, random_state = seed),
    RandomForestClassifier(max_depth=5,n_estimators=10,max_features=1, random_state = seed),
    SVC(kernel = 'linear'),
    SVC(kernel = 'rbf')
]

# Materialise the pairs so `models` can still be inspected after the loop
# and is not affected by `names` being reset below.
models = list(zip(names, classifiers))

# evaluate each model in turn
results = []
names = []

# One splitter for every model. random_state is not passed: with the default
# shuffle=False it never influenced the folds, and scikit-learn 0.24+ raises
# ValueError when a random_state is given without shuffle=True.
kfold = model_selection.KFold(n_splits=10)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    # cross_val_score fits clones; fit the estimator itself before scoring
    # it on the held-out rows.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Test-- ',name,': ',accuracy_score(y_test, predictions))
    print()
    print(classification_report(y_test, predictions))



K nearest-neighbors: 0.727273 (0.019258)
Test--  K nearest-neighbors :  0.7288378766140603

              precision    recall  f1-score   support

           0       0.54      0.92      0.68       177
           1       0.77      0.88      0.82       168
           2       0.97      0.56      0.71       352

    accuracy                           0.73       697
   macro avg       0.76      0.79      0.74       697
weighted avg       0.81      0.73      0.73       697





MLP Classifier: 0.943541 (0.015400)
Test--  MLP Classifier :  0.9540889526542324

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       177
           1       0.93      0.93      0.93       168
           2       0.98      0.97      0.97       352

    accuracy                           0.95       697
   macro avg       0.95      0.95      0.95       697
weighted avg       0.95      0.95      0.95       697





Decision Tree: 0.940191 (0.014709)
Test--  Decision Tree :  0.9368723098995696

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       177
           1       0.88      0.89      0.88       168
           2       0.96      0.95      0.96       352

    accuracy                           0.94       697
   macro avg       0.93      0.93      0.93       697
weighted avg       0.94      0.94      0.94       697





Random Forest: 0.522010 (0.035578)
Test--  Random Forest :  0.5236728837876614

              precision    recall  f1-score   support

           0       0.91      0.06      0.11       177
           1       1.00      0.02      0.04       168
           2       0.52      1.00      0.68       352

    accuracy                           0.52       697
   macro avg       0.81      0.36      0.27       697
weighted avg       0.73      0.52      0.38       697





SVM Linear: 0.916268 (0.014234)
Test--  SVM Linear :  0.9182209469153515

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       177
           1       0.88      0.87      0.88       168
           2       0.96      0.93      0.94       352

    accuracy                           0.92       697
   macro avg       0.91      0.91      0.91       697
weighted avg       0.92      0.92      0.92       697





SVM RBF: 0.958852 (0.012329)
Test--  SVM RBF :  0.9684361549497847

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       177
           1       0.96      0.95      0.96       168
           2       0.99      0.97      0.98       352

    accuracy                           0.97       697
   macro avg       0.96      0.97      0.96       697
weighted avg       0.97      0.97      0.97       697

