In [59]:
# API to train and test SVM Models for Nuclear Export Sequences
# Requires all sequence sets entered to have equivalent lengths
# Supports data entry by csv, sequences separated by new lines

#Author: Erin Conneilly
#Last Modified: 12/2/19

import sys
import csv
import numpy as np 
import pandas as pd 
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Define values associated with each amino acid
def getHydropathy(aa):
    """Look up the hydropathy value for a one-letter amino acid code.

    Args:
        aa: A lowercase one-letter residue code (e.g. 'a', 'r').

    Returns:
        The hydropathy value as a string, or '' when `aa` is not one of
        the 20 lowercase codes listed below (uppercase is not matched).
    """
    hydropathyByResidue = {
        'a': '1.8',
        'r': '-4.5',
        'n': '-3.5',
        'd': '-3.5',
        'c': '2.5',
        'e': '-3.5',
        'q': '-3.5',
        'g': '-0.4',
        'h': '-3.2',
        'i': '4.5',
        'l': '3.8',
        'k': '-3.9',
        'm': '1.9',
        'f': '2.8',
        'p': '-1.6',
        's': '-0.8',
        't': '-0.7',
        'w': '-0.9',
        'y': '-1.3',
        'v': '4.2',
    }
    # Unknown codes fall through to the empty string, as callers expect.
    return hydropathyByResidue.get(aa, '')

def getIP(aa):
    """Look up the 'IP' value for a one-letter amino acid code.

    NOTE(review): the scale behind these numbers is not identified in this
    file; confirm what 'IP' stands for and where the values come from.

    Args:
        aa: A lowercase one-letter residue code.

    Returns:
        The value as a string, or '' when `aa` is not one of the 20
        lowercase codes listed below.
    """
    ipByResidue = {
        'a': '1.0',
        'r': '-7.5',
        'n': '-2.7',
        'd': '-3.0',
        'c': '0.2',
        'e': '-2.6',
        'q': '-2.9',
        'g': '0.7',
        'h': '-1.7',
        'i': '3.1',
        'l': '2.2',
        'k': '-4.6',
        'm': '1.1',
        'f': '2.5',
        'p': '-0.3',
        's': '-1.1',
        't': '-0.8',
        'w': '1.5',
        'y': '0.1',
        'v': '2.3',
    }
    # Unknown codes fall through to the empty string.
    return ipByResidue.get(aa, '')

def getMW(aa):
    """Look up the molecular weight for a one-letter amino acid code.

    Args:
        aa: A lowercase one-letter residue code.

    Returns:
        The molecular weight as a string with one decimal place, or ''
        when `aa` is not one of the 20 lowercase codes listed below.
    """
    molecularWeightByResidue = {
        'a': '89.1',
        'r': '174.2',
        'n': '132.1',
        'd': '133.1',
        'c': '121.2',
        'e': '147.1',
        # Glutamine is 146.15; it previously duplicated glutamate's 147.1.
        'q': '146.1',
        'g': '75.1',
        'h': '155.2',
        'i': '131.2',
        'l': '131.2',
        'k': '146.2',
        'm': '149.2',
        'f': '165.2',
        'p': '115.1',
        's': '105.1',
        't': '119.1',
        'w': '204.2',
        'y': '181.2',
        'v': '117.1',
    }
    # Unknown codes fall through to the empty string.
    return molecularWeightByResidue.get(aa, '')

def getNumPKA(aa):
    """Return how many pKa values this module assigns to an amino acid.

    Args:
        aa: A lowercase one-letter residue code.

    Returns:
        '3' or '2' (as strings) for the 20 lowercase codes handled here,
        or '' for anything else.
    """
    # Residues this table assigns three pKa values.
    threePKA = frozenset(('r', 'd', 'c', 'e', 'h', 'k', 'y'))
    # Residues this table assigns two pKa values.
    twoPKA = frozenset(('a', 'n', 'q', 'g', 'i', 'l', 'm',
                        'f', 'p', 's', 't', 'w', 'v'))

    if aa in threePKA:
        return '3'
    if aa in twoPKA:
        return '2'
    return ''

def getLowPKA(aa):
    """Look up the low pKa value this module uses for an amino acid.

    Args:
        aa: A lowercase one-letter residue code.

    Returns:
        The pKa as a string, or '' when `aa` is not one of the 20
        lowercase codes listed below.
    """
    lowPKAByResidue = {
        'a': '2.35',
        'r': '2.18',
        'n': '2.02',
        'd': '1.88',
        'c': '1.71',
        'e': '2.19',
        'q': '2.17',
        'g': '2.34',
        'h': '1.78',
        'i': '2.32',
        'l': '2.36',
        'k': '2.2',
        'm': '2.28',
        'f': '2.58',
        'p': '1.99',
        's': '2.21',
        't': '2.15',
        'w': '2.38',
        'y': '2.2',
        'v': '2.29',
    }
    # Unknown codes fall through to the empty string.
    return lowPKAByResidue.get(aa, '')

def getHighPKA(aa):
    """Look up the high pKa value this module uses for an amino acid.

    Args:
        aa: A lowercase one-letter residue code.

    Returns:
        The pKa as a string, or '' when `aa` is not one of the 20
        lowercase codes listed below.
    """
    highPKAByResidue = {
        'a': '9.87',
        'r': '13.2',
        'n': '8.8',
        'd': '9.6',
        'c': '10.78',
        'e': '9.67',
        'q': '9.13',
        'g': '9.6',
        'h': '8.97',
        'i': '9.76',
        'l': '9.6',
        'k': '10.28',
        'm': '9.21',
        'f': '9.24',
        'p': '10.6',
        's': '9.15',
        't': '9.12',
        'w': '9.39',
        'y': '10.1',
        'v': '9.72',
    }
    # Unknown codes fall through to the empty string.
    return highPKAByResidue.get(aa, '')

In [53]:
def trainModel(dataFile, seqLength=10):
    """Read sequences from a CSV file and turn each into a feature vector.

    Despite its name, this function does not fit a model; it only prepares
    the numeric input that the SVM cells consume.

    Only the first column of each CSV row is read, and only its first
    `seqLength` residues are used. Each residue contributes six values, in
    this order: hydropathy, IP, molecular weight, number of pKa values,
    low pKa, high pKa. Residues are laid out position by position, so each
    vector has 6 * seqLength entries.

    Args:
        dataFile: Path to a CSV file with one sequence per row.
        seqLength: Number of leading residues to encode per sequence
            (defaults to 10, the window the function originally hard-coded).

    Returns:
        A list with one 1-D float numpy array per sequence, in file order.

    Raises:
        ValueError: If a sequence is shorter than `seqLength` or contains a
            character that the amino-acid lookup functions do not recognise.
    """
    # The order of this tuple fixes the per-residue feature layout.
    propertyGetters = (getHydropathy, getIP, getMW,
                       getNumPKA, getLowPKA, getHighPKA)

    allSeq = []
    # `with` closes the file even if a row fails validation; newline='' is
    # what the csv module expects for files it reads.
    with open(dataFile, 'r', newline='') as sequences:
        reader = csv.reader(sequences, delimiter=',')
        for row in reader:
            # Blank lines (e.g. a trailing newline) carry no sequence.
            if not row or not row[0].strip():
                continue

            sequence = row[0].strip().lower()
            if len(sequence) < seqLength:
                raise ValueError(
                    'Sequence %r on line %d of %s has %d residues; at least %d are required'
                    % (sequence, reader.line_num, dataFile, len(sequence), seqLength))

            features = []
            for residue in sequence[:seqLength]:
                for getter in propertyGetters:
                    value = getter(residue)
                    # The lookup functions signal an unknown residue with ''.
                    if value == '':
                        raise ValueError(
                            'Unrecognised residue %r in sequence %r on line %d of %s'
                            % (residue, sequence, reader.line_num, dataFile))
                    features.append(float(value))
            allSeq.append(np.asarray(features))

    return allSeq

In [116]:
# **want recall to be higher**
def trainAndTestSelf(preppedData, labelFile, kernel='rbf', gamma='scale', nu=0.5):
    """Fit a one-class SVM, score it on its own training data, and report.

    The model is fitted on `preppedData` and then asked to predict the same
    data. The predictions are compared with the labels in `labelFile` and a
    summary is written to 'selfSummary.csv' in the working directory.

    Labels are expected to use the same encoding as OneClassSVM.predict:
    +1 for the positive class and -1 otherwise. A label of 1 is treated as
    positive throughout, matching the default of recall_score and
    precision_score.

    Args:
        preppedData: Feature vectors, one per sequence (as returned by
            trainModel).
        labelFile: Path to a comma-delimited file of true labels, one per
            feature vector and in the same order.
        kernel: Kernel passed to OneClassSVM.
        gamma: Kernel coefficient passed to OneClassSVM.
        nu: nu parameter passed to OneClassSVM.

    Returns:
        The fitted OneClassSVM model.

    Raises:
        ValueError: If the number of labels differs from the number of
            feature vectors.
    """
    # ndmin=1 keeps a single-label file from collapsing to a 0-d array.
    labels = np.loadtxt(labelFile, delimiter=',', ndmin=1)
    if len(labels) != len(preppedData):
        raise ValueError(
            'Found %d labels in %s but %d feature vectors'
            % (len(labels), labelFile, len(preppedData)))

    model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
    model.fit(preppedData)
    selfPredict = model.predict(preppedData)

    # Label 1 is the positive class. The original tally swapped the true
    # positive and true negative counts.
    isPositive = labels == 1
    isCorrect = labels == selfPredict
    truePos = int(np.sum(isCorrect & isPositive))
    trueNeg = int(np.sum(isCorrect & ~isPositive))
    falseNeg = int(np.sum(~isCorrect & isPositive))
    falsePos = int(np.sum(~isCorrect & ~isPositive))

    total = truePos + trueNeg + falsePos + falseNeg
    accuracy = (truePos + trueNeg) / total if total else 0.0
    # Compute the scores before opening the file so that a scoring failure
    # does not leave a half-written summary behind.
    recall = recall_score(labels, selfPredict)
    precision = precision_score(labels, selfPredict)

    # The context manager flushes and closes the summary file; newline=''
    # is what the csv module expects for files it writes.
    with open('selfSummary.csv', 'w', newline='') as summary:
        writer = csv.writer(summary)
        writer.writerow(['True Labels'])
        writer.writerow([labels])
        writer.writerow(['Predicted Labels'])
        writer.writerow([selfPredict])
        writer.writerow(['True Positives', truePos, 'False Positives', falsePos])
        writer.writerow(['False Negatives', falseNeg, 'True Negatives', trueNeg])
        writer.writerow(['Recall', recall, 'Precision', precision, 'Accuracy', accuracy])

    return model


In [169]:
# Feature vectors and their true labels for the hyper-parameter search.
data = trainModel('test.csv')
labels = np.loadtxt('testLabels.csv', delimiter= ',')

# Candidate settings explored by the grid search.
paramGrid = {
    'kernel': ['rbf'],
    'nu': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'gamma': ['scale', 0.5, 0.3, 0.1],
}

# Each candidate is scored by F1 against the supplied labels.
scorer = make_scorer(f1_score)
svm = OneClassSVM(kernel='rbf', gamma='scale')
clf = GridSearchCV(svm, param_grid=paramGrid, scoring=scorer)

In [190]:
# Fit a one-class SVM on the prepared features with gamma=0.001 and nu=0.3,
# compare its predictions with testLabels.csv, and write selfSummary.csv.
trainAndTestSelf(data, 'testLabels.csv', gamma=0.001, nu=0.3)

In [82]:
# Run the grid search over the prepared features and their labels.
clf.fit(data, labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=OneClassSVM(cache_size=200, coef0=0.0, degree=3,
                                   gamma='scale', kernel='rbf', max_iter=-1,
                                   nu=0.5, random_state=None, shrinking=True,
                                   tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'gamma': ['scale', 0.5, 0.3, 0.1], 'kernel': ['rbf'],
                         'nu': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(f1_score), verbose=0)

In [83]:
# Collect the grid-search results into a DataFrame for inspection.
df = pd.DataFrame.from_dict(clf.cv_results_)

In [84]:
# Show the parameter combination the grid search selected as best.
print(clf.best_params_)

{'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.2}


In [85]:
# Preview the first rows of the grid-search results table.
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,param_nu,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000455,5.4e-05,0.00187,0.00063,scale,rbf,0.1,"{'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.1}",0.666667,0.0,0.0,0.266667,0.326599,7
1,0.000598,0.000342,0.001139,8.7e-05,scale,rbf,0.2,"{'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.2}",1.0,0.0,0.0,0.4,0.489898,1
2,0.000317,2.7e-05,0.001069,1.2e-05,scale,rbf,0.3,"{'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.3}",1.0,0.0,0.0,0.4,0.489898,1
3,0.000302,1.3e-05,0.001117,0.00017,scale,rbf,0.4,"{'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.4}",1.0,0.0,0.0,0.4,0.489898,1
4,0.000448,8.1e-05,0.001205,0.000351,scale,rbf,0.5,"{'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.5}",1.0,0.0,0.0,0.4,0.489898,1
