### An implementation of Time Series Classification on primate splice-junction data. The goal of this project is to classify intron-exon and exon-intron boundaries.
# Data Cleanup
First, we import all libraries used, and the data:

In [1]:
import os.path
from random import choice

import matplotlib as plt
import numpy as np
import pandas as pd
from dtw import dtw
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

InteractiveShell.ast_node_interactivity = "all"

# Read the raw splice-junction records. Column names are supplied explicitly,
# so every line of the file is treated as a data row.
columnNames = ['Class', 'Name', 'Sequence']
df = pd.read_csv('splice.data', names=columnNames)
df.head()

Unnamed: 0,Class,Name,Sequence
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


Since the original data may have inconsistent spacing, we will strip any leading or trailing whitespace from each entry.

In [2]:
# Trim surrounding whitespace from each of the three text columns.
for columnName in ('Class', 'Name', 'Sequence'):
    df[columnName] = df[columnName].str.strip()

For the purposes of our initial model, we will remove any instances in the "N" or Neither class. This will leave only the EI and IE class for analysis.

In [3]:
# Keep every row whose class is not 'N', then show the remaining class counts.
isBoundary = df['Class'] != 'N'
df = df[isBoundary]
df['Class'].value_counts()

IE    768
EI    767
Name: Class, dtype: int64

There is one entry that has incomplete data; it can be removed.

In [4]:
# Display the third column (the sequence) of the incomplete record, then
# filter that record out of the frame.
incompleteName = 'HUMALPI1-DONOR-42'
nameMatches = df['Name'] == incompleteName
df[nameMatches].iloc[0, 2]
df = df[~nameMatches]

'CACACAGGGCACCCCCTCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'

The "Name" feature is unnecessary for the procedure, and can be dropped.

In [5]:
# The record name plays no part in the classification, so remove that column.
df = df.drop(columns='Name')
df.head()

Unnamed: 0,Class,Sequence
0,EI,CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCC...
1,EI,AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCC...
2,EI,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCG...
3,EI,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGCGGGGCTT...
4,EI,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTGAGTGTCCCCATCC...


Mapping EI and IE to 0 and 1, respectively, makes the labels easier for our classifier to work with later on.

In [6]:
# Replace the textual class labels with integer codes: EI -> 0, IE -> 1.
classCodes = {'EI': 0, 'IE': 1}
df['Class'] = df['Class'].map(classCodes)
df.head()
df.tail()

Unnamed: 0,Class,Sequence
0,0,CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCC...
1,0,AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCC...
2,0,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCG...
3,0,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGCGGGGCTT...
4,0,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTGAGTGTCCCCATCC...


Unnamed: 0,Class,Sequence
1530,1,AGCCTGGGCTGACCCCACGTCTGGCCACAGGCCCGCGTGCTGCCCC...
1531,1,CTGTCCTGTGGGTTCCTCTCACCCCCTCAGGCTGCTGGTCGTCTAC...
1532,1,ATGTTTAAACCTCGCGTTTCCTCCCCGCAGCTCTTGGGCAATGTGC...
1533,1,CTGTCCTGTGGGTTCCTCTCACCCTCTCAGGTTGCTGGTCGTCTAC...
1534,1,CATATGTATCTTTTTACCTTTTCCCAACAGCTCCTGGGCAACGTGC...


It is necessary to encode our nucleotides as numbers so that their pattern and ordering can be analyzed. The bases A, C, G, and T will be encoded as 1, 2, 3, and 4 (alphabetical order). Each ambiguous nucleotide code (N, D, S, or R) will be replaced by a pseudorandomly chosen base from the set of nucleotides it can represent.

In [7]:
def translateSequence(sequence):
    """Encode a nucleotide string as a string of digits.

    The unambiguous bases map to fixed digits: A -> '1', C -> '2', G -> '3',
    T -> '4'. Each ambiguity code is replaced by a digit drawn with
    ``random.choice`` from the bases it can stand for:
    N -> any of 1/2/3/4, D -> 1/3/4, S -> 2/3, R -> 1/3.

    The draw is made independently for every occurrence of an ambiguity code,
    so a run such as 'NNNN' is not forced to collapse onto a single base, and
    no random numbers are consumed for sequences without ambiguity codes.

    Parameters
    ----------
    sequence : iterable of str
        The nucleotide characters to encode (typically a ``str``).

    Returns
    -------
    str
        One digit per input character, in the same order.

    Raises
    ------
    KeyError
        If a character is not one of A, C, G, T, N, D, S, R.
    """
    fixedBases = {'A': '1', 'C': '2', 'G': '3', 'T': '4'}
    ambiguousBases = {
        'N': ['1', '2', '3', '4'],
        'D': ['1', '3', '4'],
        'S': ['2', '3'],
        'R': ['1', '3'],
    }
    encoded = []
    for base in sequence:
        if base in fixedBases:
            encoded.append(fixedBases[base])
        else:
            # Unknown characters raise KeyError here, as the original lookup did.
            encoded.append(choice(ambiguousBases[base]))
    return ''.join(encoded)

# Encode every sequence in the frame, row by row, with translateSequence.
df['Sequence'] = df['Sequence'].apply(translateSequence)
df.head()

Unnamed: 0,Class,Sequence
0,0,2213243214212133133221323132133424344221133322...
1,0,1312223223331332331331224321333431322221223222...
2,0,3133431133123422442222133132233431311323213423...
3,0,3332432344324334212144224332133414333323333244...
4,0,3242132222213342122213311243123431343422221422...


Finally, we will separate each column into its own NumPy array. Note that each sequence of digits is split into individual numbers, forming a matrix in which each row contains one encoded sequence.

In [8]:
# Class labels as a 1-D array.
y = df['Class'].to_numpy()
y
y.shape
# One row per sequence, one integer per encoded base.
X = np.array([[int(digit) for digit in sequence] for sequence in df['Sequence']])
X
X.shape

array([0, 0, 0, ..., 1, 1, 1])

(1534,)

array([[2, 2, 1, ..., 2, 4, 3],
       [1, 3, 1, ..., 2, 3, 2],
       [3, 1, 3, ..., 1, 4, 3],
       ...,
       [1, 4, 3, ..., 2, 4, 3],
       [2, 4, 3, ..., 3, 1, 3],
       [2, 1, 4, ..., 2, 4, 3]])

(1534, 60)

# Dynamic Time Warping

Now we're ready for the dynamic time warping procedure. We iterate through all pairs of sequences, using the absolute difference between encoded bases as the local distance, and record each pair's DTW dissimilarity score:

NOTE: This procedure took approximately 10 hours to run on my machine. Using my pre-generated dissimilarity matrix is preferable.

In [9]:
# Build (or load) the pairwise DTW dissimilarity matrix for the rows of X.
# The result is cached in matrix.csv because computing it is very slow.
if os.path.exists("matrix.csv"):
    dissimMatrix = np.asarray(pd.read_csv("matrix.csv", index_col=0))
else:
    # Local cost between two encoded bases; defined once, outside the loops.
    euclideanDist = lambda seq1, seq2: np.abs(seq1 - seq2)
    sequenceCount = len(X)
    # Use an ndarray in this branch too, so both branches yield the same type.
    dissimMatrix = np.zeros((sequenceCount, sequenceCount))
    for i in range(sequenceCount):
        # The local cost is symmetric, so the DTW score of (a, b) equals that
        # of (b, a): compute only j >= i and mirror it across the diagonal.
        for j in range(i, sequenceCount):
            warpScore = dtw(X[i], X[j], dist=euclideanDist)
            dissimMatrix[i, j] = warpScore[0]
            dissimMatrix[j, i] = warpScore[0]
    pd.DataFrame(data=dissimMatrix).to_csv("matrix.csv")

# K Nearest Neighbors

Finally, our model is ready to be validated/tested. First, we need to split the dataset, using Stratified K-Folds.

In [10]:
def createMatrix(values, train_index, dissimMatrix):
    """Extract a sub-matrix of pairwise dissimilarities.

    Returns a list of lists with one row per index in ``values`` and one
    column per index in ``train_index``; entry ``[r][c]`` is
    ``dissimMatrix[values[r]][train_index[c]]``.
    """
    return [[dissimMatrix[row][col] for col in train_index] for row in values]

def splitData(X, y, dissimMatrix):
    """Split the data into 5 stratified folds of dissimilarity sub-matrices.

    For every fold produced by ``StratifiedKFold(n_splits=5)`` this collects:

    * the train-vs-train dissimilarities,
    * the test-vs-train dissimilarities (one row per test sample, one column
      per training sample),
    * the training labels, and
    * the test labels.

    Returns four lists ``(X_train, X_test, y_train, y_test)``, each holding
    one entry per fold.
    """
    folds = list(StratifiedKFold(n_splits=5).split(X, y))
    X_train = [createMatrix(trainIdx, trainIdx, dissimMatrix) for trainIdx, _ in folds]
    X_test = [createMatrix(testIdx, trainIdx, dissimMatrix) for trainIdx, testIdx in folds]
    y_train = [y[trainIdx] for trainIdx, _ in folds]
    y_test = [y[testIdx] for _, testIdx in folds]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = splitData(X=X, y=y, dissimMatrix=dissimMatrix)

Now, we make the predictions:

In [11]:
def predictClass(X_test, y_train, y_test, k):
    """Run k-nearest-neighbour classification on every fold and score it.

    Parameters
    ----------
    X_test : sequence of folds
        Each fold is a sequence of rows; a row holds the dissimilarities
        between one test sample and every training sample of that fold.
    y_train : sequence of array-like
        Training labels per fold, aligned with the columns of ``X_test``.
    y_test : sequence of array-like
        True labels per fold, aligned with the rows of ``X_test``.
    k : int
        Number of nearest neighbours that vote on each prediction.

    Returns
    -------
    tuple of five lists ``(acc, conf, prec, recall, f1)``
        One entry per fold. ``conf`` entries are ``[tn, fp, fn, tp]``.

    Notes
    -----
    The predicted label is the most common label among the ``k`` closest
    training samples. When several labels are equally common, the smallest
    label wins, so with even ``k`` a tie resolves to class 0.
    """
    acc, conf, prec, recall, f1 = [], [], [], [], []
    for foldIdx, fold in enumerate(X_test):
        trainLabels = np.asarray(y_train[foldIdx])
        truth = y_test[foldIdx]
        foldPred = []
        for distances in fold:
            # Indices of the k smallest dissimilarities.
            closest = np.argsort(distances)[:k]
            # Majority vote. np.unique returns sorted labels and argmax picks
            # the first maximum, so ties go to the smallest label. This avoids
            # scipy.stats.mode, whose return shape differs between versions.
            labels, counts = np.unique(trainLabels[closest], return_counts=True)
            foldPred.append(labels[np.argmax(counts)])
        foldPred = np.asarray(foldPred)
        acc.append(accuracy_score(truth, foldPred))
        # Pin the label set so the matrix is always 2x2, even if a fold's
        # truth and predictions happen to contain a single class.
        tn, fp, fn, tp = confusion_matrix(truth, foldPred, labels=[0, 1]).ravel()
        conf.append([tn, fp, fn, tp])
        prec.append(precision_score(truth, foldPred))
        recall.append(recall_score(truth, foldPred))
        f1.append(f1_score(truth, foldPred))
    return acc, conf, prec, recall, f1
# Evaluate k = 1, 2 and 3. Each result mixes scalar metrics with 4-element
# confusion-matrix lists, so the nested structure is ragged; dtype=object is
# required for NumPy to build the (k, metric, fold) array instead of raising.
results = np.asarray([predictClass(X_test, y_train, y_test, k) for k in range(1, 4)], dtype=object)

And output the results:

In [37]:
# Render one table per k: a row per fold plus an 'Average' row for the metrics.
metricColumns = ['Accuracy', 'Precision', 'Recall', 'F1']
for i, foldResults in enumerate(results):
    # foldResults has one row per metric; transpose to get one row per fold.
    table = pd.DataFrame(foldResults.T, index=range(1, len(X_test) + 1),
                         columns=['Accuracy', 'Confusion Matrix', 'Precision', 'Recall', 'F1'])
    # Expand each [tn, fp, fn, tp] list into four separate columns.
    table[['tn', 'fp', 'fn', 'tp']] = pd.DataFrame(table['Confusion Matrix'].values.tolist(), index=table.index)
    table = table.drop('Confusion Matrix', axis=1)
    # The values come from an object array; make the metrics numeric before averaging.
    table[metricColumns] = table[metricColumns].astype(float)
    means = table[metricColumns].mean().to_frame('Average').transpose()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    table = pd.concat([table, means])
    # Results were generated in order k = 1, 2, 3, so position i corresponds to k = i + 1.
    table = table.style.set_caption('{}-NN'.format(i+1))
    # Display explicitly rather than relying on the interactivity setting.
    display(table)

Unnamed: 0,Accuracy,F1,Precision,Recall,fn,fp,tn,tp
1,0.827922,0.833856,0.806061,0.863636,21.0,32.0,122.0,133.0
2,0.814332,0.815534,0.812903,0.818182,28.0,29.0,124.0,126.0
3,0.833876,0.833876,0.836601,0.831169,26.0,25.0,128.0,128.0
4,0.77451,0.775244,0.772727,0.777778,34.0,35.0,118.0,119.0
5,0.833333,0.838095,0.814815,0.862745,21.0,30.0,123.0,132.0
Average,0.816795,0.819321,0.808621,0.830702,,,,


Unnamed: 0,Accuracy,F1,Precision,Recall,fn,fp,tn,tp
1,0.818182,0.797101,0.901639,0.714286,44.0,12.0,142.0,110.0
2,0.791531,0.77305,0.851562,0.707792,45.0,19.0,134.0,109.0
3,0.794788,0.770909,0.876033,0.688312,48.0,15.0,138.0,106.0
4,0.77451,0.729412,0.911765,0.607843,60.0,9.0,144.0,93.0
5,0.852941,0.844291,0.897059,0.797386,31.0,14.0,139.0,122.0
Average,0.80639,0.782953,0.887612,0.703124,,,,


Unnamed: 0,Accuracy,F1,Precision,Recall,fn,fp,tn,tp
1,0.853896,0.854369,0.851613,0.857143,22.0,23.0,131.0,132.0
2,0.830619,0.835443,0.814815,0.857143,22.0,30.0,123.0,132.0
3,0.856678,0.861635,0.835366,0.88961,17.0,27.0,126.0,137.0
4,0.816993,0.816993,0.816993,0.816993,28.0,28.0,125.0,125.0
5,0.856209,0.861635,0.830303,0.895425,16.0,28.0,125.0,137.0
Average,0.842879,0.846015,0.829818,0.863263,,,,


In this context:
* A True Negative means that an EI was correctly identified as EI
* A False Positive means that an EI was incorrectly identified as IE
* A False Negative means that an IE was incorrectly identified as EI
* A True Positive means that an IE was correctly identified as IE