# Select model to predict TP53 variant effect

William Colgan May 6 2023

### Setup

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline


### Load data

In [2]:
df = pd.read_csv("./data/Gaicomelli_Table_3.csv")
df = df[df['AA_wt'] != 'Z']
df = df[df['AA_variant'] != 'Z']
df['A549_Etoposide_Zscore'] = - df['A549_Etoposide_Zscore']
y = df[['A549_Nutlin3_Zscore', 'A549_Etoposide_Zscore']].mean(axis=1).values
X = np.load("./data/t33_650M_TP53_Embeddings.npy")

### Train test split

In [3]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

### Set up GridSearch

In [22]:
knn_grid = [
    {
        'n_neighbors': [5, 10],
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size' : [15, 30],
    }
    ]

svm_grid = [
    {
        'C' : [0.1, 1.0, 10.0],
        'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree' : [3],
        'gamma': ['scale'],
    }
]

rfr_grid = [
    {
        'n_estimators' : [20],
        'criterion' : ['squared_error'],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split' : [5, 10],
        'min_samples_leaf': [1, 4]
    }
]

In [23]:
cls_list = [KNeighborsRegressor, SVR, RandomForestRegressor]
param_grid_list = [knn_grid, svm_grid, rfr_grid]

### Run GridSearch

In [24]:
result_list = []
grid_list = []
for cls_name, param_grid in zip(cls_list, param_grid_list):
    print(cls_name)
    grid = GridSearchCV(
        estimator = cls_name(),
        param_grid = param_grid,
        scoring = 'r2',
        verbose = 1,
        n_jobs = -1 # use all available cores
    )
    grid.fit(X_train, y_train)
    result_list.append(pd.DataFrame.from_dict(grid.cv_results_))
    grid_list.append(grid)

<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
Fitting 5 folds for each of 16 candidates, totalling 80 fits
<class 'sklearn.svm._classes.SVR'>
Fitting 5 folds for each of 12 candidates, totalling 60 fits
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
Fitting 5 folds for each of 8 candidates, totalling 40 fits
<class 'sklearn.linear_model._coordinate_descent.ElasticNet'>
Fitting 5 folds for each of 9 candidates, totalling 45 fits


### KNN

In [25]:
result_list[0].sort_values('rank_test_score')[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.370058,0.056272,14.79479,0.098753,ball_tree,15,5,distance,"{'algorithm': 'ball_tree', 'leaf_size': 15, 'n...",0.557021,0.510342,0.549003,0.523597,0.506247,0.529242,0.020396,1
5,0.289055,0.012721,13.775185,0.155913,ball_tree,30,5,distance,"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n...",0.557021,0.510342,0.549003,0.523597,0.506247,0.529242,0.020396,1
9,0.219712,0.032317,16.886346,0.222026,kd_tree,15,5,distance,"{'algorithm': 'kd_tree', 'leaf_size': 15, 'n_n...",0.557021,0.510342,0.549003,0.523597,0.506247,0.529242,0.020396,1
13,0.194242,0.016445,15.574923,0.305918,kd_tree,30,5,distance,"{'algorithm': 'kd_tree', 'leaf_size': 30, 'n_n...",0.557021,0.510342,0.549003,0.523597,0.506247,0.529242,0.020396,1
0,0.339796,0.021166,14.788714,0.395986,ball_tree,15,5,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 15, 'n...",0.550046,0.503608,0.542253,0.516866,0.499036,0.522362,0.020436,5


### SVR

In [30]:
result_list[1].sort_values('rank_test_score')[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,34.381377,0.271702,8.391446,0.111184,10.0,3,scale,linear,"{'C': 10.0, 'degree': 3, 'gamma': 'scale', 'ke...",0.382941,0.380355,0.370509,0.396077,0.382035,0.382383,0.008162,1
4,33.115042,1.768064,8.79329,0.152528,1.0,3,scale,linear,"{'C': 1.0, 'degree': 3, 'gamma': 'scale', 'ker...",0.20679,0.205678,0.205224,0.203449,0.215294,0.207287,0.004146,2
9,35.607034,0.475647,8.611341,0.266434,10.0,3,scale,poly,"{'C': 10.0, 'degree': 3, 'gamma': 'scale', 'ke...",0.140028,0.13865,0.140414,0.13143,0.147947,0.139694,0.005256,3
10,36.660157,0.2121,11.931242,0.494272,10.0,3,scale,rbf,"{'C': 10.0, 'degree': 3, 'gamma': 'scale', 'ke...",0.081341,0.073282,0.083422,0.072259,0.085981,0.079257,0.005506,4
0,31.026634,0.392821,7.537672,0.17181,0.1,3,scale,linear,"{'C': 0.1, 'degree': 3, 'gamma': 'scale', 'ker...",0.018534,0.006542,0.021756,0.007371,0.019957,0.014832,0.006516,5


### Random Forest

In [31]:
result_list[2].sort_values('rank_test_score')[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,3.155895,0.130659,0.011057,0.004502,squared_error,sqrt,1,10,20,"{'criterion': 'squared_error', 'max_features':...",0.454851,0.456701,0.452418,0.477127,0.431409,0.454501,0.014522,1
0,3.238744,0.122148,0.010756,0.004219,squared_error,sqrt,1,5,20,"{'criterion': 'squared_error', 'max_features':...",0.450337,0.453828,0.44256,0.458666,0.433749,0.447828,0.008783,2
2,2.910471,0.082784,0.011836,0.004754,squared_error,sqrt,4,5,20,"{'criterion': 'squared_error', 'max_features':...",0.450179,0.455898,0.442652,0.45541,0.434509,0.44773,0.008152,3
3,2.883954,0.069049,0.009884,0.002938,squared_error,sqrt,4,10,20,"{'criterion': 'squared_error', 'max_features':...",0.448974,0.440782,0.43127,0.463549,0.44823,0.446561,0.010625,4
5,1.018022,0.070129,0.016372,0.009904,squared_error,log2,1,10,20,"{'criterion': 'squared_error', 'max_features':...",0.424159,0.429375,0.430137,0.441454,0.426047,0.430234,0.006018,5


In [37]:
r2 = []
for grid in grid_list[:3]:
    print(grid.best_estimator_.get_params()) # get the model details from the estimator
    print()
    # get R2 score
    r2.append(grid.score(X_test, y_test))
    print(f'R2 score: {r2[-1]:.3f}')
    print('\n', '-' * 80, '\n')

{'algorithm': 'ball_tree', 'leaf_size': 15, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}

R2 score: 0.543

 -------------------------------------------------------------------------------- 

{'C': 10.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}

R2 score: 0.385

 -------------------------------------------------------------------------------- 

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 20, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

R2 score: 0.462

 ---------------------------------------------------------------

In [36]:
# write r2
pd.DataFrame(r2, index=['KNN', 'SVM', 'RFR'], columns=['R2']).to_csv('./data/r2.csv')