In [2]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_fwf('https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/splice-junction-gene-sequences/splice.data', header=None)
df.columns = ['Class','Donor','Sequence'] # renaming the rows of the columns 
df.head()

Unnamed: 0,Class,Donor,Sequence
0,"EI,","ATRINS-DONOR-521,",CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCC...
1,"EI,","ATRINS-DONOR-905,",AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCC...
2,"EI,","BABAPOE-DONOR-30,",GAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCG...
3,"EI,","BABAPOE-DONOR-867,",GGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGCGGGGCTT...
4,"EI,","BABAPOE-DONOR-2817,",GCTCAGCCCCCAGGTCACCCAGGAACTGACGTGAGTGTCCCCATCC...


In [4]:
df['Class'].value_counts() # for the numbers of each classes in the dataset 

N,     1655
IE,     768
EI,     767
Name: Class, dtype: int64

In [5]:
df_new = df['Sequence'].apply(lambda x: pd.Series(list(x))) # converting the rows of sequence into columns 
df_2 = pd.get_dummies(df_new,drop_first=True) # to create dummy variables to get the occurance of each of Nucleobase
df = pd.concat([df,df_2],1)
df_2 = pd.get_dummies(df.Class,drop_first=True) # dummy variables gor IE, EI and N classes
df_2['EI,'] = 1 - df_2['IE,'] - df_2['N,']  # giving values to dummy variables 
df = pd.concat([df,df_2],1)
df.head()

Unnamed: 0,Class,Donor,Sequence,0_C,0_D,0_G,0_T,1_C,1_D,1_G,...,58_G,58_N,58_T,59_C,59_G,59_N,59_T,"IE,","N,","EI,"
0,"EI,","ATRINS-DONOR-521,",CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCC...,1,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1
1,"EI,","ATRINS-DONOR-905,",AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCC...,0,0,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,1
2,"EI,","BABAPOE-DONOR-30,",GAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCG...,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
3,"EI,","BABAPOE-DONOR-867,",GGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGCGGGGCTT...,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
4,"EI,","BABAPOE-DONOR-2817,",GCTCAGCCCCCAGGTCACCCAGGAACTGACGTGAGTGTCCCCATCC...,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [6]:
X = df[df.columns[3:]] # Extracting all the columns starting from third column into dataframe X
y = X[X.columns[-3:]]  # Extracting the last three columns from dataframe X
X = X[X.columns[:-3]]  # Extracting all the columns except the last three columns from dataframe X

In [7]:
# creating test and train data from the X,y with ratio 67:33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# creating crossing validation and fitting data from X,y with ratio 50:50 
X_cv,X_ft,y_cv,y_ft = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [8]:
model = Sequential()
model.add(Dense(36, input_dim=227, activation='relu')) # input layer with 36 input units 
model.add(Dense(32, activation='relu')) # hidden layer
model.add(Dense(3, activation='sigmoid')) # output layer with 3 units

In [9]:
# Configureing the model for training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# Training the model for a given number of epochs
model.fit(X_train, y_train, epochs=20, batch_size=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x16d9de46978>

In [11]:
# Training Set Accuracy
scores = model.evaluate(X_train, y_train)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


acc: 99.97%


In [12]:
# Cross-validation Set Accuracy
scores = model.evaluate(X_cv, y_cv)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


acc: 95.75%


In [14]:
# Test Set Accuracy
scores = model.evaluate(X_ft,y_ft)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


acc: 96.65%
