In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install sklearn
# %pip install imblearn

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.multiclass import OneVsRestClassifier
from collections import Counter

from sklearn.svm import SVC
from sklearn import metrics

import time

In [16]:
# split dataset into X y train test, based on gene_id
# input: df, split_size
# output: train df, test df
def split(df, split_size=0.2):
    splitter = GroupShuffleSplit(test_size=split_size, n_splits=1, random_state=42)
    split = splitter.split(df, groups=df['gene_id'])
    train_inds, test_inds = next(split)
    train = df.iloc[train_inds]
    test = df.iloc[test_inds]
    
    y_train = train['label']
    X_train = train.drop(['label', 'sevenmers'], axis = 1)
    y_test = test['label']
    X_test = test.drop(['label', 'sevenmers'], axis = 1)
    
    return X_train, y_train, X_test, y_test

# oversample and undersample such that ratio of minority to majority samples becomes 3:4
# input: df, df (X_train, y_train)
# output: df, df (resampled version)
def resample(X_train, y_train):
    # define oversampling strategy so that ratio of minority samples to majority samples is 1:2
    oversample = RandomOverSampler(sampling_strategy=0.5, random_state=42)
    X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)
    
    # define undersampling strategy so that the ratio of minority to majority samples becomes 3:4
    under = RandomUnderSampler(sampling_strategy=0.75)
    X_train_under, y_train_under = under.fit_resample(X_train_over, y_train_over)
    return X_train_under, y_train_under

df = pd.read_csv('../data/grouped_data.csv')
features_nominal = ['order_1', 'order_2', 'order_3', 'order_4', 'order_5', 'order_6', 'order_7']
df[features_nominal] = df[features_nominal].astype('category')
X_train, y_train, X_test, y_test = split(df)
X_train, y_train = resample(X_train, y_train)
X_train = X_train.drop(columns=['gene_id', 'transcript_id'])
X_test = X_test.drop(columns=['gene_id', 'transcript_id'])

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109333 entries, 0 to 109332
Data columns (total 66 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   transcript_position     109333 non-null  int64   
 1   dwelling_time_1_min     109333 non-null  float64 
 2   dwelling_time_1_max     109333 non-null  float64 
 3   dwelling_time_1_mean    109333 non-null  float64 
 4   dwelling_time_1_median  109333 non-null  float64 
 5   dwelling_time_1_std     109333 non-null  float64 
 6   dwelling_time_1_skew    109333 non-null  float64 
 7   sd_current_1_min        109333 non-null  float64 
 8   sd_current_1_max        109333 non-null  float64 
 9   sd_current_1_mean       109333 non-null  float64 
 10  sd_current_1_median     109333 non-null  float64 
 11  sd_current_1_std        109333 non-null  float64 
 12  sd_current_1_skew       109333 non-null  float64 
 13  mean_current_1_min      109333 non-null  float64 
 14  mean

# Train Model

In [18]:
# one hot encode the categories
X_train = pd.get_dummies(X_train, columns=features_nominal)
X_test = pd.get_dummies(X_test, columns=features_nominal)

In [19]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [20]:
start_time = time.time()
clf = OneVsRestClassifier(SVC(probability=True, cache_size=1000), n_jobs=-1).fit(X_train_std, y_train)
print("--- %s mins ---" % int((time.time() - start_time)/60))

--- 128 mins ---


In [21]:
y_predict = clf.predict(X_test_std)

In [22]:
def print_score(model):
    y_predict = model.predict(X_test_std)
    print(metrics.confusion_matrix(y_test, y_predict))
    # TN FP
    # FN TP

    print(f'accuracy:  {metrics.accuracy_score(y_test, y_predict)}')
    print(f'precision: {metrics.precision_score(y_test, y_predict)}')
    print(f'recall:    {metrics.recall_score(y_test, y_predict)}')
    print(f'roc auc:   {metrics.roc_auc_score(y_test, y_predict)}')
    print(f'pr auc:    {metrics.average_precision_score(y_test, y_predict)}')

print_score(clf)

[[21097  1551]
 [  308   876]]
accuracy:  0.9219956361195032
precision: 0.36093943139678614
recall:    0.7398648648648649
roc auc:   0.8356909983102141
pr auc:    0.27997020356764774


In [23]:
pd.DataFrame(clf.predict_proba(X_test_std)).describe()

Unnamed: 0,0,1
count,23832.0,23832.0
mean,0.8824344,0.117566
std,0.2459429,0.245943
min,1.005391e-08,3e-06
25%,0.9312698,0.003211
50%,0.9870995,0.012901
75%,0.9967889,0.06873
max,0.9999966,1.0


# Hyperparameter Tuning (on small df)

In [14]:
kernel = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

models = []
for name in kernel:
    start_time = time.time()
    print(f'kernel: {name}')
    model = OneVsRestClassifier(SVC(probability=True, cache_size=1000, kernel=name), n_jobs=-1)
    models.append(model)
    model.fit(X_train_std, y_train)
    print_score(model)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

kernel: linear
[[5212  704]
 [  40  164]]
accuracy:  0.8784313725490196
precision: 0.1889400921658986
recall:    0.803921568627451
roc auc:   0.8424611223799865
pr auc:    0.15842896298304268
--- 690.8784210681915 seconds ---

kernel: poly
[[5496  420]
 [  77  127]]
accuracy:  0.9187908496732026
precision: 0.23217550274223034
recall:    0.6225490196078431
roc auc:   0.7757775524002705
pr auc:    0.15712233095553885
--- 248.67693185806274 seconds ---

kernel: rbf
[[5617  299]
 [  70  134]]
accuracy:  0.9397058823529412
precision: 0.3094688221709007
recall:    0.6568627450980392
roc auc:   0.8031609195402298
pr auc:    0.2147164485501668
--- 261.742835521698 seconds ---

kernel: sigmoid
[[4252 1664]
 [  59  145]]
accuracy:  0.7184640522875817
precision: 0.08015478164731896
recall:    0.7107843137254902
roc auc:   0.7147565922920893
pr auc:    0.06661328434082311
--- 329.61398124694824 seconds ---

kernel: precomputed


ValueError: Precomputed matrix must be a square matrix. Input is a 27195x77 matrix.

# For Future Usage

In [24]:
import pickle

# save the model to disk
filename = '../data/svm_model.sav'
pickle.dump(clf, open(filename, 'wb'))
 
# load the model from disk
model = pickle.load(open(filename, 'rb'))

In [25]:
y_predict = model.predict(X_test_std)

In [26]:
print(metrics.confusion_matrix(y_test, y_predict))
# TN FP
# FN TP

print(f'accuracy:  {metrics.accuracy_score(y_test, y_predict)}')
print(f'precision: {metrics.precision_score(y_test, y_predict)}')
print(f'recall:    {metrics.recall_score(y_test, y_predict)}')
print(f'roc auc:   {metrics.roc_auc_score(y_test, y_predict)}')
print(f'pr auc:    {metrics.average_precision_score(y_test, y_predict)}')

[[21097  1551]
 [  308   876]]
accuracy:  0.9219956361195032
precision: 0.36093943139678614
recall:    0.7398648648648649
roc auc:   0.8356909983102141
pr auc:    0.27997020356764774
