In [None]:
# check that numeric features are scaled, cat features are encoded

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#from collections import Counte
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

import time

In [36]:
df = pd.read_csv('data/grouped_data.csv')

In [19]:
df.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,sevenmers,label,dwelling_time_1_min,dwelling_time_1_max,dwelling_time_1_mean,dwelling_time_1_median,dwelling_time_1_std,...,order_2,order_3,order_4,order_5,order_6,order_7,count_A,count_C,count_G,count_T
0,ENSG00000000003,ENST00000373020,512,ATAACTC,0,0.00266,0.0169,0.007247,0.00599,0.004404,...,T,A,A,C,T,C,3,2,0,2
1,ENSG00000000003,ENST00000373020,689,TAAACAA,0,0.00232,0.0279,0.009868,0.00764,0.006946,...,A,A,A,C,A,A,5,1,0,1
2,ENSG00000000003,ENST00000373020,823,ATAACAA,0,0.00299,0.0196,0.007456,0.00631,0.003799,...,T,A,A,C,A,A,5,1,0,1
3,ENSG00000000003,ENST00000373020,830,ATAACCA,0,0.00266,0.0226,0.007765,0.00641,0.004869,...,T,A,A,C,C,A,4,2,0,1
4,ENSG00000000003,ENST00000373020,849,GTAACCC,0,0.00332,0.0181,0.006785,0.00598,0.003115,...,T,A,A,C,C,C,2,3,1,1


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121838 entries, 0 to 121837
Data columns (total 70 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   gene_id                 121838 non-null  object 
 1   transcript_id           121838 non-null  object 
 2   transcript_position     121838 non-null  int64  
 3   sevenmers               121838 non-null  object 
 4   label                   121838 non-null  int64  
 5   dwelling_time_1_min     121838 non-null  float64
 6   dwelling_time_1_max     121838 non-null  float64
 7   dwelling_time_1_mean    121838 non-null  float64
 8   dwelling_time_1_median  121838 non-null  float64
 9   dwelling_time_1_std     121838 non-null  float64
 10  dwelling_time_1_skew    121838 non-null  float64
 11  sd_current_1_min        121838 non-null  float64
 12  sd_current_1_max        121838 non-null  float64
 13  sd_current_1_mean       121838 non-null  float64
 14  sd_current_1_median 

In [37]:
features_nominal = ['sevenmers', 'order_1', 'order_2', 'order_3', 'order_4', 'order_5', 'order_6', 'order_7']
for name in features_nominal:
    df[name] = df[name].astype('category')

# Train Test Split

In [38]:
splitter = GroupShuffleSplit(test_size=.20, n_splits=1, random_state=42)
split = splitter.split(df, groups=df['gene_id'])
train_inds, test_inds = next(split)

train = df.iloc[train_inds]
test = df.iloc[test_inds]

In [39]:
X_train = train.drop(columns = ['label', 'sevenmers'])
y_train = train['label']
X_test = test.drop(columns = ['label', 'sevenmers'])
y_test = test['label']

In [40]:
# one hot encode the categories
features_nominal = ['order_1', 'order_2', 'order_3', 'order_4', 'order_5', 'order_6', 'order_7']
X_train = pd.get_dummies(X_train, columns=features_nominal)
X_test = pd.get_dummies(X_test, columns=features_nominal)

# Resampling

In [41]:
# oversample and undersample such that ratio of minority to majority samples becomes 3:4
# input: df, df (X_train, y_train)
# output: df, df (resampled version)
def resample(X_train, y_train):
    # define oversampling strategy so that ratio of minority samples to majority samples is 1:2
    oversample = RandomOverSampler(sampling_strategy=0.5, random_state=42)
    X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)
    
    # define undersampling strategy so that the ratio of minority to majority samples becomes 3:4
    under = RandomUnderSampler(sampling_strategy=0.75)
    # fit and apply the transform
    X_train_under, y_train_under = under.fit_resample(X_train_over, y_train_over)
    return X_train_under, y_train_under

In [42]:
X_train_resampled, y_train_resampled = resample(X_train, y_train)

# Train Model

In [43]:
X_train_resampled = X_train_resampled.set_index(['gene_id', 'transcript_id'])
X_test = X_test.set_index(['gene_id', 'transcript_id'])

In [44]:
X_train_resampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,transcript_position,dwelling_time_1_min,dwelling_time_1_max,dwelling_time_1_mean,dwelling_time_1_median,dwelling_time_1_std,dwelling_time_1_skew,sd_current_1_min,sd_current_1_max,sd_current_1_mean,...,order_3_G,order_4_A,order_5_C,order_6_A,order_6_C,order_6_T,order_7_A,order_7_C,order_7_G,order_7_T
gene_id,transcript_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ENSG00000089597,ENST00000532402,2546,0.00232,0.0435,0.011854,0.0106,0.008186,2.306846,2.27,8.92,4.401111,...,1,1,1,1,0,0,0,1,0,0
ENSG00000079785,ENST00000233084,1212,0.00325,0.0241,0.009836,0.00876,0.004683,0.694483,3.78,13.8,9.298621,...,0,1,1,1,0,0,1,0,0,0
ENSG00000109084,ENST00000226230,1538,0.00263,0.0239,0.008106,0.007315,0.004693,1.568762,1.59,4.84,2.697632,...,0,1,1,1,0,0,0,0,0,1
ENSG00000065911,ENST00000409804,1050,0.00232,0.0169,0.006777,0.00548,0.003755,1.108239,1.41,5.36,2.333077,...,0,1,1,1,0,0,0,0,1,0
ENSG00000175063,ENST00000356455,659,0.00199,0.0252,0.006294,0.00531,0.003647,1.550596,1.37,16.6,4.689845,...,1,1,1,0,1,0,0,1,0,0


In [45]:
sc = StandardScaler()
sc.fit(X_train_resampled)
X_train_std = sc.transform(X_train_resampled)
X_test_std = sc.transform(X_test)

In [49]:
# fit model to training data
model = XGBClassifier()
model.fit(X_train_std, y_train_resampled)



In [50]:
# print model parameters used in trained model
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [52]:
# fit model to training data
model1 = XGBClassifier()
model1.fit(X_train_resampled, y_train_resampled)



In [53]:
#make predictions
y_pred = model.predict(X_test_std)

In [61]:
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

0.9315626049009735

In [58]:
print(metrics.confusion_matrix(y_test, y_pred))
# TN FP
# FN TP

print(f'precision: {metrics.precision_score(y_test, y_pred)}')
print(f'recall:    {metrics.recall_score(y_test, y_pred)}')
print(f'roc auc:   {metrics.roc_auc_score(y_test, y_pred)}')
print(f'pr auc:    {metrics.average_precision_score(y_test, y_pred)}')

[[21396  1252]
 [  379   805]]
precision: 0.39134662129314535
recall:    0.6798986486486487
roc auc:   0.812308914575119
pr auc:    0.2819790265501487


# hyperparameter tuning of xgboost with gridsearchcv

In [62]:
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.4, 0.3, 0.2],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [63]:
grid_cv = GridSearchCV(model, param_grid, n_jobs = -1, cv = 3, scoring = "roc_auc")
grid_cv.fit(X_train_std, y_train_resampled)





In [64]:
grid_cv.best_score_

0.995411091903534

In [65]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.4,
 'max_depth': 7,
 'reg_lambda': 1,
 'scale_pos_weight': 3,
 'subsample': 0.8}

In [None]:
https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390

In [66]:
param_grid2 = {
    "max_depth": [7, 8, 9],
    "learning_rate": [0.2, 0.1, 0.05],
    "gamma": [0],
    "reg_lambda": [1],
    "scale_pos_weight": [3],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [67]:
grid_cv2 = GridSearchCV(model, param_grid2, n_jobs = -1, cv = 3, scoring = "roc_auc")
grid_cv2.fit(X_train_std, y_train_resampled)





In [69]:
grid_cv2.best_score_

0.997903745036249

In [70]:
grid_cv2.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.2,
 'max_depth': 9,
 'reg_lambda': 1,
 'scale_pos_weight': 3,
 'subsample': 0.8}

In [71]:
param_grid3 = {
    "max_depth": [9, 10],
    "learning_rate": [0.2],
    "gamma": [0],
    "reg_lambda": [1],
    "scale_pos_weight": [3],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [72]:
grid_cv3 = GridSearchCV(model, param_grid3, n_jobs = -1, cv = 3, scoring = "roc_auc")
grid_cv3.fit(X_train_std, y_train_resampled)





In [73]:
grid_cv3.best_score_

0.9988054144196519

In [75]:
grid_cv3.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.2,
 'max_depth': 10,
 'reg_lambda': 1,
 'scale_pos_weight': 3,
 'subsample': 0.8}

In [81]:
final_xgb = XGBClassifier(
    **grid_cv3.best_params_,
    objective = "binary:logistic"
)

In [84]:
final_xgb.fit(X_train_std, y_train_resampled)

final_ypred = final_xgb.predict(X_test_std)



In [85]:
accuracy = metrics.accuracy_score(y_test, final_ypred)
print(accuracy)

print(metrics.confusion_matrix(y_test, final_ypred))
# TN FP
# FN TP

print(f'precision: {metrics.precision_score(y_test, final_ypred)}')
print(f'recall:    {metrics.recall_score(y_test, final_ypred)}')
print(f'roc auc:   {metrics.roc_auc_score(y_test, final_ypred)}')
print(f'pr auc:    {metrics.average_precision_score(y_test, final_ypred)}')

0.9502349781805975
[[21978   670]
 [  516   668]]
precision: 0.4992526158445441
recall:    0.5641891891891891
roc auc:   0.7673030015179432
pr auc:    0.30332448946040047
