In [1]:
import sys
import numpy
import sklearn
import pandas

# Uncomment the following to print the version of each import

# print("Python version: {} ".format(sys.version))
# print("Numpy version: {}".format(numpy.__version__))
# print("Sklearn version: {}".format(sklearn.__version__))
# print("Pandas version: {}".format(pandas.__version__))

In [2]:
import numpy as np
import pandas as pd

# Location of the UCI Molecular Biology (Promoter Gene Sequences) dataset.
# The URL is written as two adjacent literals purely for readability; Python
# joins them into a single string.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "molecular-biology/promoter-gene-sequences/promoters.data"
)

# Column labels handed to read_csv
names = ['Class', 'id', 'Sequence']

# Fetch the file and parse it as CSV into a DataFrame
data = pd.read_csv(url, names=names)


In [3]:
# Pull the 'Class' column out of the raw frame.
# A single DataFrame column is a pandas Series.
classes = data['Class']

# Preview the first five class labels
print(classes.head(5))

0    +
1    +
2    +
3    +
4    +
Name: Class, dtype: object


In [4]:
# Collect the raw sequence strings from the 'Sequence' column
sequences = list(data.loc[:, 'Sequence'])

# Map each row number to its sequence split into single characters, with tab
# characters ("\t") dropped and that row's class label appended as the last item.
dataset = {
    row: [base for base in seq if base != '\t'] + [classes[row]]
    for row, seq in enumerate(sequences)
}

print(dataset[0])



['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [5]:
# `dataset` maps row number -> list of values, so the DataFrame constructor
# lays each list out as a column; transposing puts one sequence per row.
dframe = pd.DataFrame(data=dataset)
df = dframe.T
print(df)

    0  1  2  3  4  5  6  7  8  9   ... 48 49 50 51 52 53 54 55 56 57
0    t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t  +
1    t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a  +
2    g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g  +
3    a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c  +
4    t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g  +
5    a  g  g  g  g  c  a  a  g  g  ...  c  g  t  t  t  a  g  g  t  +
6    c  a  g  g  g  g  g  t  g  g  ...  a  t  c  a  t  g  a  a  t  +
7    t  t  t  c  t  a  c  a  a  a  ...  a  a  c  a  g  a  a  c  a  +
8    c  g  a  c  t  t  a  a  t  a  ...  a  a  a  t  g  g  t  t  t  +
9    t  t  t  t  a  a  a  t  t  t  ...  c  c  a  c  t  g  a  c  a  +
10   g  c  a  a  a  a  a  t  a  a  ...  c  c  c  g  c  g  c  c  g  +
11   c  c  t  g  a  a  a  t  t  c  ...  c  c  t  c  g  c  g  a  c  +
12   g  a  t  c  a  a  a  a  a  a  ...  c  c  g  t  t  g  a  g  a  +
13   c  t  g  c  a  a  t  t  t  t 

In [6]:
# Column 57 is the class label that was appended to every row;
# give it a readable name.
df = df.rename(columns={57: 'Class'})

# Preview the first five rows
print(df.head(5))

   0  1  2  3  4  5  6  7  8  9  ... 48 49 50 51 52 53 54 55 56 Class
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t     +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a     +
2  g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g     +
3  a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c     +
4  t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g     +

[5 rows x 58 columns]


In [7]:
# Understand the data: per-column summary statistics of the frame
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,+
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [8]:
# Tally how often each distinct value occurs in every column (one Series per column)
series = [df[column].value_counts() for column in df.columns]

# Stack the per-column tallies as rows, then flip the table so its columns
# line up with the columns of `df` and its rows are the distinct values.
info = pd.DataFrame(series)
details = info.transpose()
print(details)

      0     1     2     3     4     5     6     7     8     9  ...    48  \
t  38.0  26.0  27.0  26.0  22.0  24.0  30.0  32.0  32.0  28.0  ...  21.0   
c  27.0  22.0  21.0  30.0  19.0  18.0  21.0  20.0  22.0  22.0  ...  36.0   
a  26.0  34.0  30.0  22.0  36.0  42.0  38.0  34.0  33.0  36.0  ...  23.0   
g  15.0  24.0  28.0  28.0  29.0  22.0  17.0  20.0  19.0  20.0  ...  26.0   
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   

     49    50    51    52    53    54    55    56  Class  
t  22.0  23.0  33.0  35.0  30.0  23.0  29.0  34.0    NaN  
c  42.0  31.0  32.0  21.0  32.0  29.0  29.0  17.0    NaN  
a  24.0  28.0  27.0  25.0  22.0  26.0  24.0  27.0    NaN  
g  18.0  24.0  14.0  25.0  22.0  28.0  24.0  28.0    NaN  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  

[6 rows x 58 columns]


In [9]:
# Switch to numerical data: one-hot encode every column with pd.get_dummies,
# producing one indicator column per (column, value) pair
# (e.g. '0_a', '0_c', ..., 'Class_+', 'Class_-').
#
# The dtype is pinned explicitly: pandas 2.0 changed the get_dummies default
# from uint8 to bool, which would turn the 0/1 indicators (and the class labels
# derived from them later) into False/True. uint8 keeps them as integers.
numerical_df = pd.get_dummies(df, dtype=np.uint8)

numerical_df.iloc[:5]


Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class_+,Class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [10]:
# Keep only one of the two class indicator columns: drop 'Class_-' and expose
# 'Class_+' under the plain name 'Class'.
df = (
    numerical_df
    .drop(columns=['Class_-'])
    .rename(columns={"Class_+": "Class"})
)

# Inspect a single row (position 60) of the encoded frame
print(df.iloc[60])

0_a      0
0_c      0
0_g      1
0_t      0
1_a      1
1_c      0
1_g      0
1_t      0
2_a      0
2_c      0
2_g      1
2_t      0
3_a      0
3_c      0
3_g      1
3_t      0
4_a      0
4_c      0
4_g      0
4_t      1
5_a      0
5_c      0
5_g      1
5_t      0
6_a      0
6_c      0
6_g      1
6_t      0
7_a      0
7_c      1
        ..
49_t     1
50_a     0
50_c     0
50_g     1
50_t     0
51_a     0
51_c     0
51_g     1
51_t     0
52_a     0
52_c     0
52_g     0
52_t     1
53_a     1
53_c     0
53_g     0
53_t     0
54_a     0
54_c     0
54_g     0
54_t     1
55_a     0
55_c     0
55_g     0
55_t     1
56_a     1
56_c     0
56_g     0
56_t     0
Class    0
Name: 60, Length: 229, dtype: uint8


In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [12]:
from sklearn import model_selection

# Create the X and y datasets for training:
#   X - every column except 'Class' (the feature matrix)
#   y - the 'Class' column (the label vector)
# `columns=` is used instead of a positional axis argument: pandas 2.0 no longer
# accepts `axis` positionally in DataFrame.drop, so `df.drop(['Class'], 1)`
# raises TypeError there.
X = np.array(df.drop(columns=['Class']))
y = np.array(df['Class'])

# Define seed for reproducibility of the split
seed = 1

# Split the dataset into training and testing sets (25% held out for testing)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=seed
)

In [32]:
# Define the scoring method used by cross-validation
scoring = 'accuracy'

# Display names of the models to train, in the same order as `classifiers` below
names = ['N Nearest Neighbours',
         'Gaussian Process',
         'Decision Tree',
         'Random Forest',
         'Neural Net',
         'Ada Boost',
         'Naive Bayes',
         'SVM Linear',
         'SVM RBF',
         'SVM Sigmoid'
        ]

# Silence RuntimeWarning messages emitted while the models are evaluated
import warnings
warnings.filterwarnings("ignore", category = RuntimeWarning)

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear', gamma='auto'),
    SVC(kernel='rbf', gamma='auto'),
    SVC(kernel='sigmoid', gamma='auto')
]

# Pair each display name with its classifier. The pairs are materialised in a
# list because a bare zip() is a one-shot iterator that is empty after one pass.
models = list(zip(names, classifiers))

# Per-model arrays of fold scores, in the same order as `names`
results = []

# 10-fold splitter shared by every model.
# `random_state` is intentionally not passed: without shuffle=True it has no
# effect on the folds, and scikit-learn >= 0.24 raises ValueError when it is
# set while shuffle is False. The folds are therefore the same consecutive
# blocks of the training split as before.
kfold = model_selection.KFold(n_splits=10)

# Evaluate each model with cross-validation on the training split
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    msg = "{0} -  Mean:{1} (Std Dev = {2}) ".format(name, cv_results.mean(), cv_results.std())
    print(msg)
    
    

N Nearest Neighbours -  Mean:0.8232142857142858 (Std Dev = 0.11390841738440759) 
Gaussian Process -  Mean:0.8732142857142857 (Std Dev = 0.05615780426255853) 
Decision Tree -  Mean:0.7125 (Std Dev = 0.17721808598447283) 
Random Forest -  Mean:0.5928571428571429 (Std Dev = 0.141466445145274) 
Neural Net -  Mean:0.8875 (Std Dev = 0.08750000000000001) 
Ada Boost -  Mean:0.9125 (Std Dev = 0.1125) 
Naive Bayes -  Mean:0.8375 (Std Dev = 0.1375) 
SVM Linear -  Mean:0.85 (Std Dev = 0.10897247358851683) 
SVM RBF -  Mean:0.7375 (Std Dev = 0.11792476415070755) 
SVM Sigmoid -  Mean:0.5696428571428571 (Std Dev = 0.1592092225048921) 


In [36]:
# Fit every classifier on the training split and evaluate it on the held-out
# test split (X_test / y_test).

models = zip(names, classifiers)

for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Model name, overall accuracy, then the per-class report, one per line
    print(
        name,
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        sep="\n",
    )

print("Complete!")

N Nearest Neighbours
0.7777777777777778
              precision    recall  f1-score   support

           0       1.00      0.65      0.79        17
           1       0.62      1.00      0.77        10

    accuracy                           0.78        27
   macro avg       0.81      0.82      0.78        27
weighted avg       0.86      0.78      0.78        27

Gaussian Process
0.8888888888888888
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        17
           1       0.77      1.00      0.87        10

    accuracy                           0.89        27
   macro avg       0.88      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27

Decision Tree
0.7777777777777778
              precision    recall  f1-score   support

           0       0.92      0.71      0.80        17
           1       0.64      0.90      0.75        10

    accuracy                           0.78        27
   macro avg       