# Base stacking prediction

This code uses a combination of different classifiers to predict base stacking.

## Imports / function definitions

In [0]:
# All import statements
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import randint as sp_randint
from scipy.stats import expon as sp_expon
from matplotlib import pyplot as plt

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import re
import numpy as np
import pandas as pd
import io
import requests
import warnings

Define all necessary functions for formatting the dataset.

In [0]:
# Global constant: number of chemical-shift values stored per residue.
# NOTE(review): presumably this equals the number of chemical-shift columns in
# the input table - confirm against the data before changing it.
NUMBER_CHEMICAL_SHIFT_TYPE = 19

def one_hot(cs):
  '''
  Append one indicator (dummy) column per distinct value found in cs['resname'].

  Returns a new data frame made of the original columns of cs followed by the
  indicator columns produced by pd.get_dummies. cs itself is not modified.
  '''
  resname_indicators = pd.get_dummies(cs['resname'])
  return cs.join(resname_indicators)

def get_cs_all(cs_all, id = "2KOC"):
  '''
    Select the rows of cs_all that belong to a single RNA.

    cs_all : data frame with an 'id' column holding the RNA identifier.
    id     : identifier of the RNA to extract.

    Returns the matching rows with their original index labels.
  '''
  belongs_to_rna = cs_all['id'] == id
  return cs_all[belongs_to_rna]

def get_cs_residues(cs_i, resid, dummy = 0):
  '''
    Return the chemical shifts of one residue as an array of shape (1, n_shifts).

    cs_i  : data frame for a single RNA. It must contain the columns 'id',
            'resid', 'resname', 'stacking', 'ADE', 'CYT', 'GUA' and 'URA';
            every remaining column is treated as a chemical-shift column.
    resid : residue number to look up in the 'resid' column.
    dummy : fill value used when the residue cannot be resolved.

    When resid matches exactly one row, that row's chemical-shift values are
    returned. Otherwise (no match, or more than one match) a padding array
    filled with dummy is returned.
  '''
  non_shift_columns = ['id', 'resid', 'resname', 'stacking', 'ADE', 'CYT', 'GUA', 'URA']
  cs_tmp = cs_i[(cs_i.resid == resid)].drop(non_shift_columns, axis=1)
  if cs_tmp.shape[0] != 1:
    # Take the padding width from the frame itself so padded rows always have
    # the same length as real rows, whatever the number of shift columns is.
    return dummy * np.ones(shape=(1, cs_tmp.shape[1]))
  return cs_tmp.values
    
def get_resnames(cs_i, resid, dummy = "UNK"):
  '''
    Look up the residue name stored for residue number resid.

    Returns the 'resname' value when exactly one row of cs_i has that resid;
    in every other case (no row, or several rows) dummy is returned.
  '''
  matches = cs_i[cs_i.resid == resid]
  if matches.shape[0] == 1:
    return matches['resname'].values[0]
  return dummy

def get_cs_features(cs_i, resid, neighbors=1):
  '''
  Collect residue names and chemical shifts for residue resid and the
  `neighbors` residues on each side of it.

  Returns a tuple (resnames, shifts): resnames is a list with one entry per
  residue in the window resid-neighbors .. resid+neighbors, and shifts is the
  np.array built from the per-residue results of get_cs_residues, in the
  same order.
  '''
  window = range(resid - neighbors, resid + neighbors + 1)
  resnames = [get_resnames(cs_i, position) for position in window]
  shifts = [get_cs_residues(cs_i, position) for position in window]
  return resnames, np.array(shifts)

def get_columns_names(neighbors = 3, chemical_shift_types = 19):
  '''
    Helper that builds the list of column names: the eight fixed
    descriptive columns followed by the integers
    0 .. neighbors*chemical_shift_types - 1 for the feature columns.
  '''
  fixed_columns = ['id', 'resname', 'resid', 'stacking', 'ADE', 'CYT', 'GUA', 'URA']
  feature_columns = list(range(neighbors * chemical_shift_types))
  return fixed_columns + feature_columns

def write_out_resname(neighbors=1):
  '''
    Helper that builds the column names for the residue names of a residue
    and its neighbors: 'R' for the residue itself, 'R-k' for the k-th
    residue before it and 'R+k' for the k-th residue after it.
  '''
  def label(offset):
    if offset == 0:
      return 'R'
    if offset > 0:
      return 'R+%s' % offset
    # A negative offset already carries its own minus sign
    return 'R%s' % offset

  return [label(offset) for offset in range(-neighbors, neighbors + 1)]


def get_cs_features_rna(cs, neighbors=1, retain = ['id', 'stacking', 'resid', 'ADE', 'CYT', 'GUA', 'URA']):
  '''
    Build the complete feature data frame for one RNA.

    cs        : data frame holding the rows of a single RNA.
    neighbors : number of residues on each side included in every window.
    retain    : columns of cs copied unchanged into the result.

    The result places three groups of columns side by side: the retained
    columns, the residue names of each window (named by write_out_resname)
    and the flattened chemical shifts of each window.
    NOTE(review): the retained columns have one row per row of cs while the
    other two groups have one row per unique resid; they only line up when
    resid is unique within cs - confirm.
  '''
  resname_rows = []
  feature_rows = []
  for resid in cs['resid'].unique():
    window_names, window_shifts = get_cs_features(cs, resid, neighbors)
    feature_rows.append(window_shifts.flatten())
    resname_rows.append(window_names)

  info = pd.DataFrame(cs[retain].values, dtype='object', columns=retain)
  resname_frame = pd.DataFrame(resname_rows, dtype='object', columns=write_out_resname(neighbors))
  feature_frame = pd.DataFrame(feature_rows, dtype='object')
  return pd.concat([info, resname_frame, feature_frame], axis=1)

def create_training_testing(cs, leave_out = "2KOC", target_name = 'stacking', neighbors = 2, drop_names = ['id', 'stacking', 'resid']):
  '''
    Split cs into a training and a testing set by leaving one RNA out.

    cs          : feature data frame with an 'id' column.
    leave_out   : id of the RNA that forms the testing set; every other RNA
                  goes into the training set.
    target_name : column used as the prediction target.
    neighbors   : window size, used to work out which residue-name columns
                  (see write_out_resname) must be dropped from the inputs.
    drop_names  : further columns that are not model inputs.

    Returns (trainX, trainy, testX, testy) as arrays.
  '''
  # Columns that must not be fed to the model; a new list is built so the
  # default argument is never modified.
  non_feature_columns = drop_names + list(write_out_resname(neighbors))

  def split_inputs_and_target(subset):
    inputs = subset.drop(non_feature_columns, axis=1)
    return inputs.values, subset[target_name].values

  trainX, trainy = split_inputs_and_target(cs[cs.id != leave_out])
  testX, testy = split_inputs_and_target(cs[cs.id == leave_out])
  return trainX, trainy, testX, testy

def get_cs_features_rna_all(cs, neighbors = 2):
  '''
    Build the feature data frame for every RNA found in cs.

    cs        : data frame with an 'id' column identifying the RNA of each row.
    neighbors : number of residues on each side included in every window
                (passed through to get_cs_features_rna).

    The rows of each RNA are extracted with get_cs_all, converted with
    get_cs_features_rna and the per-RNA frames are stacked vertically.
    Index labels of the per-RNA frames are kept as they are. An empty data
    frame is returned when cs contains no RNA.
  '''
  # Iterate over the ids of the frame that was passed in, not over a
  # notebook-level global, so the function works for any input frame.
  per_rna_frames = [
      get_cs_features_rna(get_cs_all(cs, rna_id), neighbors)
      for rna_id in cs.id.unique()
  ]
  if not per_rna_frames:
    # pd.concat raises on an empty list
    return pd.DataFrame()
  # Concatenate once instead of growing a frame inside the loop
  return pd.concat(per_rna_frames, axis=0)

## Load in and prepare the data

Load the .csv file and create a pandas DataFrame from it.

In [21]:
# NOTE(review): this silences every warning for the rest of the session,
# including convergence warnings from the classifiers.
warnings.filterwarnings("ignore")

# load initial data
url="https://drive.google.com/uc?id=1e-SHtWDtg4mD_th3_4Jmq9r1iiQC32wT"
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
s = response.content
c = pd.read_csv(io.StringIO(s.decode('utf-8')), sep=' ')
print("[INFO]: loaded data")
# Drop extraneous columns that are unneeded for prediction
c = c.drop(['Unnamed: 0','base_pairing', 'orientation', 'sugar_puckering', 'pseudoknot'], axis = 1)
# Convert stacking column to numerical
c['stacking'] = c['stacking'].map({'stack': 1, 'non-stack': 0})
# One-hot encode the resname data
c_new = one_hot(c)

[INFO]: loaded data


## Set up hyperparameters

Below, we initialize the hyperparameter space distribution

In [0]:
# Hyperparameter space from which the randomized search draws its candidates.
# Layer widths are drawn from [min_size, max_size) once, when this cell runs,
# giving one fixed candidate architecture each with 4, 3, 2 and 1 hidden layers.
min_size, max_size = 5, 100
layer_depths = (4, 3, 2, 1)
hidden_layer_candidates = [
    tuple(sp_randint.rvs(min_size, max_size) for _ in range(depth))
    for depth in layer_depths
]
parameter_space_distribution = dict(
    hidden_layer_sizes=hidden_layer_candidates,
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam', 'lbfgs'],
    alpha=sp_expon(scale=.01),
    learning_rate=['constant','adaptive'],
    learning_rate_init=sp_expon(scale=.001),
)

We will go through the training process once with the data of the first ID left out. This run is used to search for good hyperparameters for the MLP classifier.

In [17]:
# Seed for the stochastic steps of this cell (candidate sampling and MLP weight
# initialisation) so that re-running the cell reproduces the same search result.
RANDOM_SEED = 0

# Initialize empty classification array to use for calculating mean F1-score later
classification_array = []

# The ideal number of neighbors found in a prior assignment was 4
NEIGHBORS = 4

# Initialize the ID array which will be used later to cycle through IDs.
id_array = c.id.unique()

# The code below is used to randomly select a subset of IDs to cycle through
# in case the program needs some speeding up
#np.random.shuffle(id_array)
#id_array = id_array[0:50]
# Named first_id rather than id so the builtin id() is not shadowed
first_id = id_array[0]

# Get all features and ready the data for model fitting by adding neighbor CS columns
cs_all = get_cs_features_rna_all(c_new, neighbors = NEIGHBORS)

# Separate data into training and testing set while leaving out one ID for testing
trainX, trainy, testX, testy = create_training_testing(cs_all, leave_out = first_id, neighbors = NEIGHBORS)
print("[INFO]: created training and testing data structures")

# Set up the scaler on the training inputs only, then apply it to both sets
scaler = StandardScaler()
trainX_scaled = scaler.fit_transform(trainX)
testX_scaled = scaler.transform(testX)
print("[INFO]: scaled the features")

# build a classifier - in this case a simple Multi-layer perceptron classifier
clf = MLPClassifier(max_iter=100, random_state=RANDOM_SEED)

# Random search for best hyperparameters
n_iter_search = 6
random_search = RandomizedSearchCV(clf, param_distributions=parameter_space_distribution, n_iter=n_iter_search, cv=6, verbose = 1, random_state=RANDOM_SEED)
random_search.fit(trainX_scaled, np.int_(trainy))
print("[INFO]: hyperparameter search complete")

[INFO]: created training and testing data structures
[INFO]: scaled the features
Fitting 6 folds for each of 6 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.6min finished


[INFO]: hyperparameter search complete


In [18]:
# Display the best hyperparameter combination found by the randomized search
random_search.best_params_

{'activation': 'tanh',
 'alpha': 0.02300468536963886,
 'hidden_layer_sizes': (77, 80, 18),
 'learning_rate': 'constant',
 'learning_rate_init': 0.00043915907775182283,
 'solver': 'sgd'}

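Parameters chosen from randomized searching were (note that these differ from the cell output shown above, so they presumably come from an earlier run of the search):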
```
activation  = 'tanh',
alpha = 0.014025693410201838,
hidden_layer_sizes = (186, 153),
learning_rate = 'constant',
learning_rate_init = 0.006439317102679333,
solver = 'lbfgs'
```

## Loop through IDs to get average f1-score

Now that we have set up the hyperparameters to use in our model, the next step is to cycle through leaving out different IDs and then average over the resulting weighted-average f1-scores. For the combined classifier, I round the averaged predictions so that anything 0.5 or below becomes 0 and anything above 0.5 becomes 1.

The basic process will work like this
1. Set up the models
2. Fit the models
3. Use the test data to predict y values
4. Combine models by averaging the y values
5. Average weighted avg f1 scores for individual and combined models

In [11]:
# Hyperparameters used for the MLP in every fold (the values quoted in the
# text above this cell).
MLP_PARAMS = dict(activation  = 'tanh',
                  alpha = 0.014025693410201838,
                  hidden_layer_sizes = (186, 153),
                  learning_rate = 'constant',
                  learning_rate_init = 0.006439317102679333,
                  solver = 'lbfgs')

def make_classifiers():
  '''
  Return a fresh, unfitted instance of every base classifier, keyed by the
  short name used for its score list. All classifiers except the MLP use
  their default settings.
  NOTE(review): the MLP and the decision tree are not seeded, so their scores
  change from run to run.
  '''
  return {
      'mlp': MLPClassifier(**MLP_PARAMS),
      'lda': LinearDiscriminantAnalysis(),
      'knc': KNeighborsClassifier(),
      'gnb': GaussianNB(),
      'dtc': DecisionTreeClassifier(),
      'svc': SVC(),
  }

# Set up empty classification arrays (one weighted-average F1 score per fold)
mlp_array = []
lda_array = []
knc_array = []
gnb_array = []
dtc_array = []
svc_array = []
combined_array = []
score_lists = {'mlp': mlp_array, 'lda': lda_array, 'knc': knc_array,
               'gnb': gnb_array, 'dtc': dtc_array, 'svc': svc_array}

for held_out_id in id_array:
  # Generate training and testing data from the dataset, leaving out one ID at a time
  trainX, trainy, testX, testy = create_training_testing(cs_all, leave_out = held_out_id, neighbors = NEIGHBORS)
  print("[INFO]: created training and testing data structures")

  # Fit a new scaler on this fold's training inputs, then scale both sets.
  # Creating it here means the cell does not rely on a scaler left over from
  # an earlier cell.
  scaler = StandardScaler()
  trainX_scaled = scaler.fit_transform(trainX)
  testX_scaled = scaler.transform(testX)
  print("[INFO]: scaled the features")

  y_train = np.int_(trainy)
  y_true = np.int_(testy)

  # Fit every classifier, predict the held-out RNA and record the
  # weighted-average F1 score of each individual model.
  predictions = {}
  for name, model in make_classifiers().items():
    model.fit(trainX_scaled, y_train)
    predictions[name] = model.predict(testX_scaled)
    score_lists[name].append(f1_score(y_true, np.int_(predictions[name]), average='weighted'))

  # Combine the predictions by taking the mean of their values. np.rint rounds
  # an exact tie (mean of 0.5) to 0.
  ypred_combined = np.rint(np.mean(list(predictions.values()), axis=0))
  combined_array.append(f1_score(y_true, np.int_(ypred_combined), average='weighted'))

[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and testing data structures
[INFO]: scaled the features
[INFO]: created training and

In [13]:
# Print the mean weighted-average f1-score of each model over all folds,
# as a label line followed by the value.
score_report = (
    ("mlp", mlp_array),
    ("lda", lda_array),
    ("knc", knc_array),
    ("gnb", gnb_array),
    ("dtc", dtc_array),
    ("svc", svc_array),
    ("combined", combined_array),
)
for label, scores in score_report:
  print(label + ":")
  print(np.mean(scores))

mlp:
0.8269027059419968
lda:
0.839978796851995
knc:
0.8381739921249131
gnb:
0.7854625872269962
dtc:
0.8029526223118394
svc:
0.8116905631508463
combined:
0.839401346255776


## Conclusion:

This resulted in the combined model having the highest weighted avg f1-score for only one of the trials. So it does perform well compared to the baseline model, but unfortunately it is not consistent and none of the models made it to 0.9. One option could be to optimize the hyperparameters for the other classifiers besides the `MLPClassifier` since currently these use only the default values. But we should be careful not to overfit them either.

Trial 1:

mlp:
0.8383482049711244

lda:
0.839978796851995

knc:
0.8381739921249131

gnb:
0.7854625872269962

dtc:
0.803306607289426

svc:
0.8116905631508463

combined:
0.8402191854224372

Trial 2:

mlp:
0.8269027059419968

lda:
0.839978796851995

knc:
0.8381739921249131

gnb:
0.7854625872269962

dtc:
0.8029526223118394

svc:
0.8116905631508463

combined:
0.839401346255776

Trial 3:

mlp:
0.8186848724954964

lda:
0.839978796851995

knc:
0.8381739921249131

gnb:
0.7854625872269962

dtc:
0.7973518253628668

svc:
0.8116905631508463

combined:
0.8374392223574743