### Jupyter notebook to run Random Forest regression. This code normalizes the data to remove analytical drift.

In [1]:
from sklearn.model_selection import ShuffleSplit

In [2]:
%run "../../common/matrix.py"
%run "../../common/grid_search_rf.py"
%run "../../common/file_handler.py"
%run "../../common/random_forest_reg.py"

### Import matrix

In [3]:
# Prompt for the matrix file. The handle stays open because later cells reuse
# `imported_matrix` for file-name lookups, and the final cell closes it.
imported_matrix = open_file_from_fullpath(
    ask_file_path(),
    'r',
)
# NOTE(review): `read_perseus` is not a stock pandas function; presumably it is attached
# to `pd` by one of the %run helper scripts - confirm.
matrix_to_normalize = pd.read_perseus(imported_matrix)

In [4]:
# Prompt for the JSON configuration file; the following cells query it through
# `get_value_from_json`, and the final cell closes it.
json_file = open_file_from_fullpath(
    ask_file_path(),
    'r',
)

In [5]:
# Base names for the grid-search checkpoint (`grid_name`) and for the exported file
# (`export_suffix`); both are extended below when the optional filter is applied.
grid_name = 'random_forests_regression_'
export_suffix = ''
try:
    # Optional filtering step driven by the 'relevant' and 'value' entries of the JSON.
    matrix_to_normalize = filter_matrix(
        matrix_to_normalize,
        get_value_from_json(json_file, 'relevant'),
        get_value_from_json(json_file, 'value'),
    )
    grid_name += 'filtered_'
    export_suffix += '_filtered'
    # The JSON value is compared against the string 'True' (not the boolean).
    if get_value_from_json(json_file, 'value') == 'True':
        grid_name += 'true_'
        export_suffix += '_true'
    else:
        grid_name += 'false_'
        export_suffix += '_false'
except KeyError as missing_key:
    # Filtering stays best-effort, but the reason is reported instead of being silently
    # swallowed, so an unexpected KeyError (e.g. raised inside `filter_matrix`) is visible.
    print(f'Filtering skipped (KeyError: {missing_key}); using the unfiltered matrix.')

In [6]:
# --- QC columns: slice, rename, sort by injection order, then split into order / metabolites.
qc_matrix = slice_submatrix(
    matrix_to_normalize,
    get_value_from_json(json_file, 'QC_slice'),
)
qc_dictionary, qc_matrix = rename_matrixheader(
    qc_matrix,
    get_value_from_json(json_file, 'QC_rename'),
)
qc_matrix = sort_by_injection_order(qc_matrix)
qc_injection_order = get_injection_order_plugin(qc_matrix)
qc_metabolites = cut_metabolites(qc_matrix)

# --- Sample columns: slice and rename (sorting happens after the optional EXT merge).
sample_matrix = slice_submatrix(
    matrix_to_normalize,
    get_value_from_json(json_file, 'sample_slice'),
)
sample_dictionary, sample_matrix = rename_matrixheader(
    sample_matrix,
    get_value_from_json(json_file, 'sample_rename'),
)

# --- Optional EXT columns: appended to the samples column-wise when available.
# A KeyError (presumably a missing 'EXT_*' entry in the JSON - confirm) means there is
# no EXT group, so only the QC and sample rename dictionaries are merged.
try:
    ext_matrix = slice_submatrix(
        matrix_to_normalize,
        get_value_from_json(json_file, 'EXT_slice'),
    )
    ext_dictionary, ext_matrix = rename_matrixheader(
        ext_matrix,
        get_value_from_json(json_file, 'EXT_rename'),
    )
    sample_matrix = pd.concat((sample_matrix, ext_matrix), axis=1)
    merge_dictionary = qc_dictionary | sample_dictionary | ext_dictionary
except KeyError:
    merge_dictionary = qc_dictionary | sample_dictionary

sample_matrix = sort_by_injection_order(sample_matrix)
samples_injection_order = get_injection_order_plugin(sample_matrix)
samples_metabolites = cut_metabolites(sample_matrix)

### Global parameters

In [7]:
# Hyper-parameter grid handed to the grid search: every listed value is a candidate.
global_params = {
    'max_depth': [2, 3, 4],
    'max_features': ['log2', 'sqrt'],
    'random_state': [0],
}

In [8]:
# Build the grid-search features/labels from the QC injection order and QC metabolites.
X_grids_met_qc, y_grids_met_qc = generate_grid_search_features_labels_rf(
    qc_injection_order,
    qc_metabolites,
)
# Disabled alternative cross-validation splitter, kept for reference (not executed):
#shuf_split = ShuffleSplit(n_splits=5, random_state=0)

In [9]:
# Run the Random Forest grid search on the QC data. The run is named after the
# (optionally filtered) grid name plus the imported matrix file name.
random_forest_gridsearch = grid_search_rf_regressor(
    grid_name + get_file_name(imported_matrix),
    X_grids_met_qc,
    y_grids_met_qc,
    global_params,
    cv=4,
    checkpoint_path=r'../../../pickle_models',
    checkpoint=True,
    single_estimator=False,
    checkpoint_step=100,
    verbose=True,
    threads=-1,
)


Grid search completed!


### Prediction on QC

In [10]:
# Apply the grid-search result to the QC metabolites.
random_forest_qc = get_prediction(
    random_forest_gridsearch,
    qc_metabolites,
)

### Prediction on samples

In [11]:
# Apply the same grid-search result (fitted on QC data) to the sample metabolites.
random_forest_samples = get_prediction(
    random_forest_gridsearch,
    samples_metabolites,
)

### Computing final values (Y') 

In [12]:
# Samples: median value, then normalized values from the raw metabolites and predictions.
median_value_sample = compute_median_value(samples_metabolites)
final_values_normalized_sample = normalized_value(
    samples_metabolites,
    random_forest_samples,
    median_value_sample,
)

# QC: same computation on the QC metabolites and their predictions.
median_value_qc = compute_median_value(qc_metabolites)
final_values_normalized_qc = normalized_value(
    qc_metabolites,
    random_forest_qc,
    median_value_qc,
)

##### The normalized values are pasted back to rebuild the original matrix with normalized intensities

In [15]:
# Display the current contents of `qc_matrix` for inspection.
qc_matrix

Unnamed: 0,28,35,43,50,58,65,73,80,88,95,103,110,118,125,133,143
0,29.204128,29.284740,29.188604,29.266163,29.250244,29.231503,29.245602,29.238317,29.221752,29.211271,29.167866,29.176882,29.269592,29.216793,29.268242,29.304344
1,26.019770,26.266098,26.016182,26.114361,26.087366,26.143335,26.158699,26.222691,26.156399,26.121532,26.169897,26.060667,26.094969,26.185389,26.278240,26.101849
2,25.327745,24.241356,24.189598,24.753065,24.378485,23.906855,24.062853,24.859720,23.649757,24.472296,24.694445,24.488388,24.132587,23.121363,24.338430,24.845671
3,22.078388,22.163280,22.118677,21.970919,22.187435,22.069963,22.057837,22.015381,22.259312,22.112350,22.236069,13.499348,22.210970,22.110973,13.282654,22.175493
4,27.091827,27.063560,27.145166,27.070803,27.279364,27.113132,27.117514,26.963840,26.916506,26.871754,27.028103,26.962248,27.122484,27.016420,27.090973,27.153467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2130,22.521118,22.411676,22.379129,21.549986,22.011538,22.248377,22.704926,22.459249,22.484499,22.572371,21.682755,22.255987,22.867199,22.070738,22.452581,22.402931
2131,18.448765,18.680761,18.419210,19.060463,17.355713,18.908173,19.437647,18.497469,19.040613,18.116360,18.031900,18.446239,18.647226,18.049881,18.930239,18.870829
2132,24.034586,24.248116,24.043493,24.258978,23.784533,24.023855,23.764238,24.032970,24.134535,24.106054,23.851006,24.032616,23.893776,24.114845,24.162750,24.169891
2133,22.779549,22.901974,22.837358,22.957987,22.786655,22.763355,22.875513,22.890148,22.863913,22.853632,22.852465,10.601770,22.816599,22.848936,8.596189,23.012243


In [16]:
# Paste the normalized values back into the sample and QC sub-matrices.
matrix_sample_metabolites_normalized = paste_metabolites(
    sample_matrix,
    final_values_normalized_sample,
)
matrix_qc_metabolites_normalized = paste_metabolites(
    qc_matrix,
    final_values_normalized_qc,
)

# Merge both normalized sub-matrices into the full matrix, passing the combined
# rename dictionary built in the slicing cell.
merge_all_rf_normalized_inmatrix = merge_submatrix_plugin(
    matrix_to_normalize,
    matrix_qc_metabolites_normalized,
    matrix_sample_metabolites_normalized,
    merge_dictionary,
)
merge_all_rf_normalized_inmatrix

Column Name,X99_control_NB_Plasma_Quant_C18_Pos_140,X98_control_NB_Plasma_Quant_C18_Pos_060,X97_control_NB_Plasma_Quant_C18_Pos_057,X96_control_NB_Plasma_Quant_C18_Pos_042,X95_control_NB_Plasma_Quant_C18_Pos_072,X94_control_NB_Plasma_Quant_C18_Pos_089,X93_control_NB_Plasma_Quant_C18_Pos_138,X92_control_NB_Plasma_Quant_C18_Pos_067,X91_control_NB_Plasma_Quant_C18_Pos_025,X90_control_NB_Plasma_Quant_C18_Pos_037,...,Coefficient.of.variation,Coefficient.of.variation.EXT,Coefficient.of.variation.control,Coefficient.of.variation.local,Coefficient.of.variation.post,Coefficient.of.variation.onset,Coefficient.of.variation.QC,Alignment.ID,Metabolite.name,Adduct.type
Group1,control,control,control,control,control,control,control,control,control,control,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,23.165680,23.599106,23.658315,23.597242,23.319755,22.898130,22.852871,23.397496,23.367794,23.883632,...,0.520519,0.049456,0.298998,0.223421,0.187137,0.259624,0.027117,17,Unknown,[M+H]2+
1,25.162592,25.209470,25.521792,25.523806,25.666740,24.606828,25.751688,25.710784,24.979687,25.693530,...,0.283189,0.055227,0.351286,0.329737,0.285502,0.285121,0.053630,23,Unknown,[M+H]+
2,25.166628,24.767982,23.377769,25.108084,25.170260,25.351918,23.753827,25.105312,24.884678,23.852957,...,0.345500,0.428866,0.375141,0.301086,0.318333,0.347381,0.350448,53,Unknown,[M+H-H2O]+
3,25.552342,23.304113,25.160830,24.587544,25.720767,24.742879,18.426825,26.152627,25.298059,24.832708,...,0.445962,0.082314,0.375617,0.418718,0.396555,0.625337,0.394158,70,Unknown,[M+H]+
4,27.644570,27.552181,26.564555,26.523026,26.811721,27.224451,27.864783,27.078410,27.204205,27.099047,...,0.378095,0.080542,0.388227,0.247362,0.251423,0.345149,0.070232,110,w/o MS2:Pyrrolidine;CE30;RWRDLPDLKQPQOW-UHFFFA...,[M+H-2H2O]+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2130,20.772840,23.941382,18.693500,22.552898,21.627231,21.158691,23.706724,24.359624,22.930638,22.627831,...,1.018046,0.346006,0.677578,0.684531,0.810280,0.962365,0.220587,29711,Unknown,[M+2H]2+
2131,18.579039,19.624168,14.277417,19.422835,17.871074,19.047695,20.243545,17.663969,19.164732,19.588557,...,0.783026,0.348056,0.615917,0.686206,0.733034,0.915263,0.328759,29816,Unknown,[M+H]+
2132,24.011000,24.377262,22.710058,24.419932,23.761554,23.646480,25.214938,23.463070,24.542367,25.099388,...,0.576736,0.071524,0.384397,0.451340,0.537500,0.498825,0.102033,29845,Unknown,[M+H]+
2133,28.688252,28.287633,26.727654,29.282733,27.814003,28.774880,17.841781,29.174349,28.718952,28.906601,...,0.692087,0.030957,0.425633,0.621490,0.702032,1.273788,0.393581,29937,Unknown,[2M+H]+


In [14]:
# Display the QC matrix returned by `paste_metabolites` (normalized QC values).
matrix_qc_metabolites_normalized

Unnamed: 0,28,35,43,50,58,65,73,80,88,95,103,110,118,125,133,143
0,29.220119,29.256340,29.209419,29.252392,29.245372,29.239031,29.244903,29.237572,29.233614,29.226940,29.204078,29.217055,29.257333,29.228287,29.251467,29.271223
1,26.083258,26.181853,26.076682,26.118612,26.117408,26.138410,26.133851,26.167921,26.135253,26.124447,26.147801,26.104743,26.119420,26.148508,26.199201,26.103151
2,24.733408,24.297407,24.244805,24.538152,24.316256,24.176028,24.271943,24.538647,24.109986,24.447597,24.492854,24.399622,24.332661,23.936791,24.326695,24.643434
3,22.269273,22.223611,22.473656,22.490720,22.158604,22.531932,22.520635,22.424700,22.610093,22.439066,22.596815,19.389895,22.858389,22.270415,17.915425,23.325780
4,27.066379,27.058521,27.118849,27.058860,27.158769,27.071298,27.107190,27.047431,27.047546,27.002218,27.088842,27.062609,27.127241,27.074684,27.098284,27.154651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2130,22.731727,22.626063,22.592072,21.801419,22.229938,22.466309,22.930999,22.695565,22.714304,22.783981,21.929404,22.528410,23.078592,22.317358,22.719162,22.636903
2131,18.747504,19.038256,18.753750,19.370172,17.672535,19.222178,19.843494,18.824588,19.399093,18.478068,18.367268,18.814399,18.996857,18.416918,19.274883,19.279319
2132,24.105562,24.312495,24.115538,24.316778,23.853391,24.088499,23.831008,24.106069,24.202798,24.184052,23.922595,24.105247,23.967881,24.193199,24.249046,24.252315
2133,28.515732,29.118413,29.715918,29.464503,29.081200,28.562429,28.043050,29.410404,29.079306,29.097284,29.774599,14.381480,29.760509,29.188345,11.662305,28.933690


#### Path to save matrix

In [None]:
# Export the merged, normalized matrix. The output path is derived from the imported
# matrix file plus the '_intensity_rf' tag and the filter suffix (if any).
merge_all_rf_normalized_inmatrix.to_perseus(
    generate_new_file_path(
        imported_matrix,
        '_intensity_rf' + export_suffix,
    )
)

In [None]:
# Release the two file handles opened at the top of the notebook (matrix first, then JSON).
for _open_handle in (imported_matrix, json_file):
    _open_handle.close()