## Second Approach: Machine Learning
Several classical machine-learning models are trained and compared.

#### Import of libraries and dataset

In [None]:
# import all libraries
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [None]:
from helpers import sample_data, load_data

# Rows whose first value equals this marker are discarded after loading.
MISSING_MARKER = -999


def _load_spectra(path):
    """Load one raw-data CSV and return a 2-D array with flagged rows removed.

    The array returned by ``load_data`` has its first column dropped and is then
    transposed; rows whose first entry equals ``MISSING_MARKER`` are filtered out.
    """
    spectra = np.delete(load_data(path), 0, 1).T
    return spectra[spectra[:, 0] != MISSING_MARKER, :]


def _constant_labels(samples, value):
    """Return a float column vector of shape ``(len(samples), 1)`` filled with ``value``."""
    return np.full((len(samples), 1), float(value))


# load data.
# NOTE(review): the label looks like the PFF fraction of the sample
# (0 = oligo only, 1 = PFF only), judging by the file names — confirm.
data_oligo_1 = _load_spectra("data-oligo/011021_SFL_SYN211_Oligo_1uM_Rawdata_270spectralcolumns.csv")
data_oligo_2 = _load_spectra("data-oligo/051021_SFL_SYN211_Oligo_5uM_rawdata_270spectralcolumns.csv")
data_oligo = np.concatenate((data_oligo_1, data_oligo_2), axis=0)
y_oligo = _constant_labels(data_oligo, 0)

data_PFF1 = _load_spectra("data-pff/191121_G80_AInII_SYn211_AsynPFF_5microM_rawdata_290spectracolumns.csv")
data_PFF2 = _load_spectra("data-pff/220421_G80_AInII_SYn211_AsynPFF_20microM_880_spectralcolumns.csv")
data_PFF = np.concatenate((data_PFF1, data_PFF2), axis=0)
y_PFF = _constant_labels(data_PFF, 1)

mix_50_50 = _load_spectra("data-mix/1221_G80_AI_SYn211_2uMPFF50__2uMOligo50__rawdata_840spectralcolumns.csv")
# Labels must be sized from the mixture itself (previously sized from data_PFF).
y_50_50 = _constant_labels(mix_50_50, .5)

mix_75_25 = _load_spectra("data-mix/1221_G80_AI_SYn211_4.5uMPFF75__1.5uMOligo25__Rawdata_710spectralcolumns.csv")
y_75_25 = _constant_labels(mix_75_25, .75)

mix_25_75 = _load_spectra("data-mix/1221_G80_AI_SYn211_4.5uMOligo75%_1.5uMPFF25%_Rawdata_730spectralcolumns.csv")
y_25_75 = _constant_labels(mix_25_75, .25)

# Every (features, labels) pair must have matching row counts.
for _features, _labels in ((data_oligo, y_oligo), (data_PFF, y_PFF), (mix_50_50, y_50_50),
                           (mix_75_25, y_75_25), (mix_25_75, y_25_75)):
    assert len(_features) == len(_labels), "feature/label row count mismatch"

print(data_oligo.shape, data_PFF.shape, mix_50_50.shape, mix_75_25.shape, mix_25_75.shape)


(540, 133) (1170, 133) (840, 133) (710, 133) (730, 133)


In [None]:
# Build X and y by concatenating the different datasets, after subsampling every
# dataset down to the size of the smallest training class (the oligo set).
SEED = 42  # fixed so that Restart & Run All reproduces the same training set
rng = np.random.default_rng(SEED)
n_per_class = len(data_oligo)


def _subsample(features, labels, n_rows, rng):
    """Draw ``n_rows`` rows from ``features`` and the matching rows from ``labels``.

    The same indices are used for both arrays so rows and labels stay paired.
    Rows are drawn without replacement (no duplicated samples) unless the
    dataset has fewer than ``n_rows`` rows, in which case replacement is needed.
    """
    need_replacement = len(features) < n_rows
    picked = rng.choice(len(features), size=n_rows, replace=need_replacement)
    return features[picked, :], labels[picked, :]


X_PFF_sub, y_PFF_sub = _subsample(data_PFF, y_PFF, n_per_class, rng)
X_50_50_sub, y_50_50_sub = _subsample(mix_50_50, y_50_50, n_per_class, rng)
X_75_25_sub, y_75_25_sub = _subsample(mix_75_25, y_75_25, n_per_class, rng)

X = np.concatenate((data_oligo, X_PFF_sub, X_50_50_sub, X_75_25_sub), axis=0)
y = np.concatenate((y_oligo, y_PFF_sub, y_50_50_sub, y_75_25_sub), axis=0)
assert X.shape[0] == y.shape[0] == 4 * n_per_class

# The 25-75 mixture is never seen during training; it is the test set.
X_train_, y_train_ = X, y
x_test_, y_test_ = mix_25_75, y_25_75

### Linear regression
The model is trained on the 0-100, 50-50, 75-25 and 100-0 datasets and tested on the 25-75 dataset. The negative mean absolute error is reported for each fold of a 5-fold cross-validation on the training data.

In [None]:
# linear regression
lm = LinearRegression()
# X_train_ is stacked dataset by dataset, so contiguous (unshuffled) folds would
# each hold out rows from mostly one mixture ratio. Shuffle the folds, with the
# same settings as the Ridge grid search, to get representative splits.
scores = cross_val_score(
    lm,
    X_train_,
    y_train_,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=100),
)
for i, fold_score in enumerate(scores):
    print(f"Negative mean absolute error: {fold_score:.4f}. iteration: {i}")

Negative mean absolute error: -0.6506. iteration: 0
Negative mean absolute error: -0.1755. iteration: 1
Negative mean absolute error: -0.1257. iteration: 2
Negative mean absolute error: -0.0910. iteration: 3
Negative mean absolute error: -0.1012. iteration: 4


In [None]:
# Refit on the whole training set, then score the predictions for x_test_.
preds = lm.fit(X_train_, y_train_).predict(x_test_)
test_mae = sklearn.metrics.mean_absolute_error(y_test_, preds)
print(f"Mean absolute error: {test_mae:.4f}")

Mean absolute error: 0.1300


### Ridge Regression
The model is trained on the 0-100, 50-50, 75-25 and 100-0 datasets and tested on the 25-75 dataset. The negative mean absolute error is reported for each fold of a 5-fold cross-validation on the training data.


In [None]:
# ridge regression with a fixed regularisation strength
clf = Ridge(alpha=0.01)
# X_train_ is stacked dataset by dataset, so contiguous (unshuffled) folds would
# each hold out rows from mostly one mixture ratio. Shuffle the folds, with the
# same settings as the Ridge grid search, to get representative splits.
scores = cross_val_score(
    clf,
    X_train_,
    y_train_,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5, shuffle=True, random_state=100),
)
for i, fold_score in enumerate(scores):
    print(f"Negative mean absolute error: {fold_score:.4f}. iteration: {i}")

Negative mean absolute error: -0.6582. iteration: 0
Negative mean absolute error: -0.1794. iteration: 1
Negative mean absolute error: -0.1254. iteration: 2
Negative mean absolute error: -0.0892. iteration: 3
Negative mean absolute error: -0.0951. iteration: 4


In [None]:
# Refit on the whole training set, then score the predictions for x_test_.
preds = clf.fit(X_train_, y_train_).predict(x_test_)
test_mae = sklearn.metrics.mean_absolute_error(y_test_, preds)
print(f"Mean absolute error: {test_mae:.4f}")

Mean absolute error: 0.1173


Cross validation for Ridge Regression to optimize alpha

In [None]:
# step-1: create a cross-validation scheme
# (shuffled, because X_train_ is stacked dataset by dataset)
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

# step-2: specify range of hyperparameters to tune
# alpha takes the powers of ten from 1e-4 to 1e3
hyper_params = [{'alpha': [10**k for k in range(-4, 4)]}]


# step-3: perform grid search
# 3.1 specify model
# No fit here: GridSearchCV clones the estimator and fits the clones itself,
# so fitting it beforehand was wasted work.
lm = Ridge()

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = lm, 
                        param_grid = hyper_params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      

# fit the model
model_cv.fit(X_train_, y_train_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=Ridge(),
             param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                    1000]}],
             return_train_score=True, scoring='neg_mean_absolute_error',
             verbose=1)

In [None]:
# Tabulate the grid-search results: one row per candidate alpha.
cv_results = pd.DataFrame.from_dict(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.002246,0.000231,0.000275,0.000134,0.0001,{'alpha': 0.0001},-0.111003,-0.122452,-0.121395,-0.122896,...,-0.11983,0.004452,3,-0.113699,-0.111056,-0.110518,-0.111548,-0.111806,-0.111725,0.001081
1,0.002595,0.000605,0.000384,0.000171,0.001,{'alpha': 0.001},-0.110767,-0.122563,-0.121154,-0.122526,...,-0.119541,0.004449,1,-0.113555,-0.111003,-0.11034,-0.111355,-0.111789,-0.111608,0.001083
2,0.002313,0.000243,0.000303,0.000166,0.01,{'alpha': 0.01},-0.111064,-0.124571,-0.121153,-0.121783,...,-0.119573,0.004579,2,-0.114446,-0.111984,-0.111152,-0.111833,-0.112735,-0.11243,0.001126
3,0.002043,0.000159,0.000163,3e-06,0.1,{'alpha': 0.1},-0.114745,-0.129796,-0.125247,-0.124417,...,-0.123379,0.004915,4,-0.118952,-0.116679,-0.116218,-0.116785,-0.117801,-0.117287,0.00098
4,0.002139,0.000191,0.00038,0.000165,1.0,{'alpha': 1},-0.119184,-0.134629,-0.130404,-0.129212,...,-0.127875,0.005158,5,-0.124835,-0.121536,-0.121571,-0.12262,-0.123082,-0.122728,0.001211
5,0.002173,0.000398,0.000235,0.000144,10.0,{'alpha': 10},-0.124598,-0.1409,-0.132033,-0.131484,...,-0.131453,0.005419,6,-0.130372,-0.126498,-0.127337,-0.1287,-0.128249,-0.128231,0.001312
6,0.002018,0.000153,0.000229,0.000137,100.0,{'alpha': 100},-0.132355,-0.149426,-0.138092,-0.137388,...,-0.138266,0.005965,7,-0.139024,-0.134681,-0.13603,-0.138204,-0.137494,-0.137087,0.001554
7,0.001962,0.000118,0.000167,9e-06,1000.0,{'alpha': 1000},-0.154758,-0.16948,-0.159477,-0.158845,...,-0.159768,0.005149,8,-0.160802,-0.157151,-0.157421,-0.160326,-0.160843,-0.159309,0.001664


### Extremely randomized trees
Model trained on 0-100, 50-50, 75-25, 100-0 and tested on 25-75.


In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Baseline extremely randomized trees model: 100 trees, fixed seed.
reg = ExtraTreesRegressor(n_estimators=100, random_state=0)
# The regressor is given a 1-D target, hence the ravel() on the column vector.
reg.fit(X_train_, y_train_.ravel())
preds = reg.predict(x_test_)
test_mae = sklearn.metrics.mean_absolute_error(y_test_, preds)
print(f"Mean absolute error: {test_mae:.4f}")

Mean absolute error: 0.2847


In [None]:
import optuna

def objective(trial):
    """Optuna objective: mean 5-fold negative MAE of an ExtraTreesRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold negative mean absolute errors on the training
        data (higher is better, hence the study maximizes it).
    """
    # parameters to optimize
    n_estimators = trial.suggest_categorical("model_params/etr/n_estimators", [100, 150, 200, 300, 400, 500])
    # max_depth must be an integer: sample it as an int on a log scale instead
    # of the float produced by the deprecated suggest_loguniform.
    max_depth = trial.suggest_int("model_params/etr/max_depth", 2, 32, log=True)
    min_samples_split = trial.suggest_categorical("model_params/etr/min_samples_split", [4, 8, 16])
    # 1.0 (use all features) replaces 'auto', which meant the same for
    # regressors and is no longer accepted by recent scikit-learn versions.
    max_features = trial.suggest_categorical("model_params/etr/max_features", [1.0, 0.2, 0.4, 0.6, 0.8])
    min_samples_leaf = trial.suggest_categorical("model_params/etr/min_samples_leaf", [1, 2, 3, 4])

    reg = ExtraTreesRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        random_state=430,
    )
    # X_train_ is stacked dataset by dataset, so shuffle the folds (same scheme
    # as the Ridge grid search) instead of using contiguous splits.
    shuffled_folds = KFold(n_splits=5, shuffle=True, random_state=100)
    fold_scores = cross_val_score(reg, X_train_, y_train_.ravel(), cv=shuffled_folds, scoring='neg_mean_absolute_error')
    return fold_scores.mean()

# Seed the sampler so that the sequence of trials is reproducible.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=430))
# run param optimization with optuna
study.optimize(objective, n_trials=20)

[32m[I 2021-12-22 18:57:46,422][0m A new study created in memory with name: no-name-ed47b21a-9e80-404a-8996-2d5271fdd6b6[0m
[32m[I 2021-12-22 18:57:46,952][0m Trial 0 finished with value: -0.34508797150964343 and parameters: {'model_params/etr/n_estimators': 100, 'model_params/etr/max_depth': 2.078048183830405, 'model_params/etr/min_samples_split': 16, 'model_params/etr/max_features': 0.4, 'model_params/etr/min_samples_leaf': 4}. Best is trial 0 with value: -0.34508797150964343.[0m
[32m[I 2021-12-22 18:57:52,748][0m Trial 1 finished with value: -0.19179055484803767 and parameters: {'model_params/etr/n_estimators': 500, 'model_params/etr/max_depth': 5.11563846401465, 'model_params/etr/min_samples_split': 4, 'model_params/etr/max_features': 0.6, 'model_params/etr/min_samples_leaf': 1}. Best is trial 1 with value: -0.19179055484803767.[0m
[32m[I 2021-12-22 18:57:57,502][0m Trial 2 finished with value: -0.18773357085809833 and parameters: {'model_params/etr/n_estimators': 500, '

In [None]:
# Extremely randomized trees with hand-set hyperparameters.
# NOTE(review): presumably taken from the optuna study above — confirm.
etr_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=4,
    max_features=0.8,
    min_samples_leaf=2,
    random_state=0,
)
reg = ExtraTreesRegressor(**etr_params)
reg.fit(X_train_, y_train_.ravel())
preds = reg.predict(x_test_)
test_mae = sklearn.metrics.mean_absolute_error(y_test_, preds)
print(f"Mean absolute error: {test_mae:.4f}")


Mean absolute error: 0.2747


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d66177de-7ea1-46c8-aea6-26d701dd9bc9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>