In [125]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
import os, sys

In [127]:
import pandas as pd
from loguru import logger

In [128]:
sys.path.append('..')

from pyMultiOmics.constants import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info
from pyMultiOmics.analysis import *
from pyMultiOmics.query import *
from pyMultiOmics.pipelines import *
from pyMultiOmics.functions import *



# Demonstration of pyMultiOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [129]:
DATA_FOLDER = os.path.abspath(os.path.join('test_data', 'zebrafish_data'))
DATA_FOLDER

'/Users/Karen/pyMultiOmics/notebooks/test_data/zebrafish_data'

Read metabolomics data

In [130]:
compound_data = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_data_chebi.csv'), index_col='Identifier')
compound_design = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_design.csv'), index_col='sample')

In [31]:
compound_data

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15724,5787534,4351239,4401036,8187282,8431125,5082056,5138937,7341351,7837293,9256269,9934066,10243285,7344406,5524811,4809250,9279874,9047339,9211255
17148,3430897,1877785,1225710,2326620,2421267,2595529,2003627,2120053,2269318,3220850,4596854,3155377,3760854,2658833,2488025,2506550,4000703,3292566
15611,112845,129977,122292,63219,50113,100343,156651,176682,379322,160906,56802,107161,235982,181200,142994,116132,94589,167280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17202,23590107,17432727,11821183,18396545,10737880,18778788,19264992,8119525,25719881,26895322,20489811,23362861,30532905,7049672,9853902,19511285,30291647,31272774
17659,176648,128962,90159,138395,91016,137467,200091,151758,418224,265649,142543,177308,475141,202469,133316,206919,311047,269830
456216,68045,42897,39287,54154,35470,50695,68939,76078,127354,95864,69298,78328,136188,66532,71612,73803,111415,112341
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472


In [32]:
compound_data.loc[18139]

distal_M1       75170
distal_M2       57052
distal_M3       39170
distal_F1       84057
distal_F2       38608
distal_F3       64126
middle_M1       50214
middle_M2       75680
middle_M3      165178
middle_F1      121856
middle_F2       77061
middle_F3       98015
proximal_M1    113765
proximal_M2     96098
proximal_M3     84198
proximal_F1    117644
proximal_F2    169459
proximal_F3    169669
Name: 18139, dtype: int64

In [131]:
fly_new_data = pd.read_csv(os.path.join(DATA_FOLDER, '../fly_data/fly_metabolomics_no_dupes.csv'), index_col='Identifier')

In [42]:
fly_data = prepare_input(fly_new_data)

In [43]:
fly_data 

Unnamed: 0,Identifier,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
0,17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
1,30322,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
2,16313,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
3,26271,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
4,340824,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7132,90031,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.644200e+04,2.975845e+04,1.682612e+04,9.900591e+03
7133,65114,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,6.110493e+03,...,1.340211e+04,1.022194e+04,1.403081e+04,1.048442e+04,1.191305e+04,1.353725e+04,9.332865e+03,1.020300e+04,8.965938e+03,1.151025e+04
7134,22990,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,2.379862e+04,1.879203e+04,2.216306e+04,2.543246e+04,2.844122e+04,2.477530e+04,2.306076e+04,2.287044e+04,2.864523e+04,2.137121e+04
7135,39949,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [45]:
r_chebi = get_related_chebi(fly_data)

2021-04-07 16:10:11.687 | INFO     | pyMultiOmics.functions:get_chebi_relation_dict:395 - Getting chebi_bfs_relation_dict.pkl 
2021-04-07 16:10:13.563 | INFO     | pyMultiOmics.functions:get_related_chebi:347 - Inserted 2088 related compounds


In [124]:
remove_dupes(r_chebi)

2021-04-07 21:17:35.831 | INFO     | pyMultiOmics.functions:remove_dupes:382 - Removing 29 rows with duplicate identifiers


Unnamed: 0,Identifier,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
0,17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
0,32864,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
0,32862,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
0,60039,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
1,30322,1.75112e+06,2.14452e+06,3.10643e+06,1.74946e+06,4.43596e+06,5.33424e+06,4.96729e+06,3.83886e+06,865574,...,3.43841e+06,1.94548e+06,2.8826e+06,3.53427e+06,3.54482e+06,2.42029e+06,4.42795e+06,4.84221e+06,2.48241e+06,2.30356e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7135,48316,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,...,0,0,0,0,0,0,0,0,0,0
7135,18036,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,...,0,0,0,0,0,0,0,0,0,0
7136,25371,0,0,0,0,0,0,0,0,10007.4,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3
7136,36263,0,0,0,0,0,0,0,0,10007.4,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3


In [53]:
fly_data

Unnamed: 0,Identifier,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
0,17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
1,30322,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
2,16313,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
3,26271,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
4,340824,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7132,90031,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.644200e+04,2.975845e+04,1.682612e+04,9.900591e+03
7133,65114,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,6.110493e+03,...,1.340211e+04,1.022194e+04,1.403081e+04,1.048442e+04,1.191305e+04,1.353725e+04,9.332865e+03,1.020300e+04,8.965938e+03,1.151025e+04
7134,22990,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,2.379862e+04,1.879203e+04,2.216306e+04,2.543246e+04,2.844122e+04,2.477530e+04,2.306076e+04,2.287044e+04,2.864523e+04,2.137121e+04
7135,39949,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [123]:
remove_dupes(fly_data)

2021-04-07 21:17:01.126 | INFO     | pyMultiOmics.functions:remove_dupes:382 - Removing 0 rows with duplicate identifiers


Unnamed: 0,Identifier,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
0,17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
1,30322,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
2,16313,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
3,26271,1.751118e+06,2.144522e+06,3106428.8,1749463.8,4435960.0,5334241.0,4967291.0,3.838863e+06,8.655744e+05,...,3.438412e+06,1.945485e+06,2.882596e+06,3.534272e+06,3.544821e+06,2.420287e+06,4.427952e+06,4.842214e+06,2.482414e+06,2.303564e+06
4,340824,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7132,90031,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.644200e+04,2.975845e+04,1.682612e+04,9.900591e+03
7133,65114,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,6.110493e+03,...,1.340211e+04,1.022194e+04,1.403081e+04,1.048442e+04,1.191305e+04,1.353725e+04,9.332865e+03,1.020300e+04,8.965938e+03,1.151025e+04
7134,22990,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,...,2.379862e+04,1.879203e+04,2.216306e+04,2.543246e+04,2.844122e+04,2.477530e+04,2.306076e+04,2.287044e+04,2.864523e+04,2.137121e+04
7135,39949,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [59]:
no_dupes

Unnamed: 0,Identifier,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
0,17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
1,32864,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
2,32862,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
3,60039,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
4,30322,1.75112e+06,2.14452e+06,3.10643e+06,1.74946e+06,4.43596e+06,5.33424e+06,4.96729e+06,3.83886e+06,865574,...,3.43841e+06,1.94548e+06,2.8826e+06,3.53427e+06,3.54482e+06,2.42029e+06,4.42795e+06,4.84221e+06,2.48241e+06,2.30356e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9173,48316,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,...,0,0,0,0,0,0,0,0,0,0
9174,18036,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,...,0,0,0,0,0,0,0,0,0,0
9175,25371,0,0,0,0,0,0,0,0,10007.4,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3
9176,36263,0,0,0,0,0,0,0,0,10007.4,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3


In [1134]:
os.getcwd()

'/Users/Karen/pyMultiOmics/notebooks'

In [1135]:
set_log_level_info()

12

In [1136]:
type(compound_design)

pandas.core.frame.DataFrame

In [1137]:
print(compound_design.head())

            group
sample           
distal_M1  Distal
distal_M2  Distal
distal_M3  Distal
distal_F1  Distal
distal_F2  Distal


In [1138]:
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
distal_M1,Distal
distal_M2,Distal
distal_M3,Distal
distal_F1,Distal
distal_F2,Distal
distal_F3,Distal
middle_M1,Middle
middle_M2,Middle
middle_M3,Middle
middle_F1,Middle


In [1139]:
compound_data.head()

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15724,5787534,4351239,4401036,8187282,8431125,5082056,5138937,7341351,7837293,9256269,9934066,10243285,7344406,5524811,4809250,9279874,9047339,9211255
17148,3430897,1877785,1225710,2326620,2421267,2595529,2003627,2120053,2269318,3220850,4596854,3155377,3760854,2658833,2488025,2506550,4000703,3292566
15611,112845,129977,122292,63219,50113,100343,156651,176682,379322,160906,56802,107161,235982,181200,142994,116132,94589,167280


## Methods for adding related chebi IDs

In [1104]:
# This method is pretty inefficient with the use of iterrows but I'm not sure of another way to run this
# All attempts at vectorisation failed - help Joe?
def get_related_chebi_data(cmpd_data):
    # dont want to modify the original df
    cmpd_data = cmpd_data.copy()
    
    # ensure index type is set to string, since get_chebi_relation_dict also returns string as the keys
    cmpd_data.index = cmpd_data.index.map(str)
    chebi_rel_dict = get_chebi_relation_dict()
    with_related = list(chebi_rel_dict.keys())
    cmpd_data.loc[cmpd_data.index.isin(with_related), 'related']= 'Yes'
    cmpd_data = cmpd_data.reset_index()
    
    # We use this related_df so that we are not looking at all rows, only those with related chebi_ids
    related_df = cmpd_data[cmpd_data.related=='Yes']
#     print(related_df)
    
    for ix, row in related_df.iterrows():
        print (ix)
        chebi_list = chebi_rel_dict[str(row.Identifier)]
        for c in chebi_list:
            #Check if the duplicate row with that chebi exists in the DF
            current_row = row
            current_row.Identifier = int(c)
            matches  = cmpd_data[(cmpd_data==current_row).all(axis=1)]

            if len(matches) == 0:
#                 print ("no matching rows, appending")
                cmpd_data = cmpd_data.append(current_row)
#            else:
#                 print ("row found in DF therefore skipping")
    c_data = cmpd_data.drop(['related'], axis=1)
    c_data = c_data.set_index(['Identifier'])
    
    return c_data


In [1140]:
def get_related_chebi_data_v2(cmpd_data):
    cmpd_data = cmpd_data.copy()
    
    # ensure index type is set to string, since get_chebi_relation_dict also returns string as the keys
    cmpd_data.index = cmpd_data.index.map(str)
    cmpd_data = cmpd_data.reset_index()
    original_cmpds = set(cmpd_data['Identifier']) # used for checking later

    # construct the related chebi dict
    chebi_rel_dict = get_chebi_relation_dict()    

    # loop through each row in cmpd_data
    with_related_data = []
    for ix, row in cmpd_data.iterrows():   
        
        # add the current row we're looping
        current_identifier = row['Identifier']
        with_related_data.append(row)

        # check if there are related compounds to add
        if current_identifier in chebi_rel_dict:

            # if yes, get the related compounds
            chebi_list = chebi_rel_dict[current_identifier]        
            for c in chebi_list:

                # add the related chebi, but only if it's not already present in the original compound
                if c not in original_cmpds:
                    current_row = row.copy()
                    current_row['Identifier'] = c
                    with_related_data.append(current_row)

    # combine all the rows into a single dataframe
    df = pd.concat(with_related_data, axis=1).T
    df = df.set_index('Identifier')
    logger.info('Inserted %d related compounds' % (len(df) - len(cmpd_data)))    
    return df

In [1141]:
def remove_dupes(df):    
    df = df.reset_index()

    # group df by the 'Identifier' column
    to_delete = []
    grouped = df.groupby(df['Identifier'])
    for identifier, group_df in grouped:
        
        # if there are multiple rows sharing the same identifier
        if len(group_df) > 1: 

            # remove 'Identifier' column from the grouped df since it can't be summed
            group_df = group_df.drop('Identifier', axis=1)

            # find the row with the largest sum across the row in the group
            idxmax = group_df.sum(axis=1).idxmax()

            # mark all the rows in the group for deletion, except the one with the largest sum
            temp = group_df.index.tolist()
            temp.remove(idxmax)
            to_delete.extend(temp)

    # actually do the deletion here
    logger.info('Removing %d rows with duplicate identifiers' % (len(to_delete)))
    df = df.drop(to_delete)
    df = df.set_index('Identifier')
    return df

In [1142]:
def get_chebi_relation_dict():
    """
    A method to parse the chebi relation tsv and store the relationship we want in a dictionary
    :return: Dict with structure Chebi_id: [related_chebi_ids]
    """
    CHEBI_BFS_RELATION_DICT = 'chebi_bfs_relation_dict.pkl'
    try:
        chebi_bfs_relation_dict = load_object("../pyMultiOmics/data/" + CHEBI_BFS_RELATION_DICT)
    except Exception as e:
        logger.info("Constructing %s " % CHEBI_BFS_RELATION_DICT)
        try:
            chebi_relation_df = pd.read_csv("data/relation.tsv", delimiter="\t")

        except FileNotFoundError as e:

            logger.error("data/relation.tsv must be present")
            raise e

            # List of relationship we want in the dictionary
        select_list = ["is_conjugate_base_of", "is_conjugate_acid_of", "is_tautomer_of"]
        chebi_select_df = chebi_relation_df[chebi_relation_df.TYPE.isin(select_list)]

        chebi_relation_dict = {}
        # Gather all the INIT_IDs into a dictionary so that each INIT_ID is unique
        for ix, row in chebi_select_df.iterrows():
            init_id = str(row.INIT_ID)
            final_id = str(row.FINAL_ID)
            if init_id in chebi_relation_dict.keys():
                # Append the final_id onto the existing values
                id_1 = chebi_relation_dict[init_id]
                joined_string = ", ".join([id_1, final_id])
                chebi_relation_dict[init_id] = joined_string
            else:  # make a new key entry for the dict
                chebi_relation_dict[init_id] = final_id

        # Change string values to a list.
        graph = {k: v.replace(" ", "").split(",") for k, v in chebi_relation_dict.items()}

        chebi_bfs_relation_dict = {}
        for k, v in graph.items():
            r_chebis = bfs_get_related(graph, k)
            r_chebis.remove(k) #remove original key from list

            chebi_bfs_relation_dict[k] = r_chebis
        try:
            logger.info("saving chebi_relation_dict")
            save_object(chebi_bfs_relation_dict, "./data/" + CHEBI_BFS_RELATION_DICT + ".pkl")


        except Exception as e:
            logger.error("Pickle didn't work because of %s " % e)
            traceback.print_exc()
            pass


    return chebi_bfs_relation_dict


In [1143]:
import gzip
import pickle

def load_object(filename):
    """
    Load saved object from file
    :param filename: The file to load
    :return: the loaded object
    """
    with gzip.GzipFile(filename, 'rb') as f:
        return pickle.load(f)

In [1144]:
def bfs_get_related(graph_dict, node):
    """
    :param graph: Dictionary of key: ['value'] pairs
    :param node: the key for which all related values should be returned
    :return: All related keys as a list
    """
    visited = [] # List to keep track of visited nodes.
    queue = []     #Initialize a queue
    related_keys = []

    visited.append(node)
    queue.append(node)

    while queue:
        k = queue.pop(0)
        related_keys.append(k)

        for neighbour in graph_dict[k]:
          if neighbour not in visited:
            visited.append(neighbour)
            queue.append(neighbour)

    return related_keys


In [1145]:
def get_related_chebi_ids(chebi_ids):
    """
    :param chebi_ids: A list of chebi IDS
    :return: A set of related chebi_IDs that are not already in the list
    """
    chebi_relation_dict = get_chebi_relation_dict()
    related_chebis = set()

    for c_id in chebi_ids:
        if c_id in chebi_relation_dict:
            related_chebis.update(chebi_relation_dict[c_id])

    return related_chebis

### For each chebi_id in the DF that has other relaed Chebi_ids, add on a duplicate row.

##### For the Zebrafish DF we expect the input and output to be the same as all the related Chebi_ids are already present in the DF


In [1111]:
zebra_f_related_chebi = get_related_chebi_data_v2(compound_data)

2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds
2021-03-29 13:21:28.430 | IN

In [1112]:
compound_data

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15724,5787534,4351239,4401036,8187282,8431125,5082056,5138937,7341351,7837293,9256269,9934066,10243285,7344406,5524811,4809250,9279874,9047339,9211255
17148,3430897,1877785,1225710,2326620,2421267,2595529,2003627,2120053,2269318,3220850,4596854,3155377,3760854,2658833,2488025,2506550,4000703,3292566
15611,112845,129977,122292,63219,50113,100343,156651,176682,379322,160906,56802,107161,235982,181200,142994,116132,94589,167280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17202,23590107,17432727,11821183,18396545,10737880,18778788,19264992,8119525,25719881,26895322,20489811,23362861,30532905,7049672,9853902,19511285,30291647,31272774
17659,176648,128962,90159,138395,91016,137467,200091,151758,418224,265649,142543,177308,475141,202469,133316,206919,311047,269830
456216,68045,42897,39287,54154,35470,50695,68939,76078,127354,95864,69298,78328,136188,66532,71612,73803,111415,112341
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472


In [1113]:
zebra_f_related_chebi

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
58389,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15428,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
32507,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
65180,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
58189,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
15919,33991,21340,16393,22992,12814,19732,27512,26427,45803,37193,30502,22312,67244,32223,39682,46545,64364,66252


In [1114]:
zebra_f_related_chebi_no_dupes = remove_dupes(zebra_f_related_chebi)
zebra_f_related_chebi_no_dupes

2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers
2021-03-29 13:21:43.472 | INFO     | 

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
58389,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15428,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
32507,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
65180,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
58189,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
15919,33991,21340,16393,22992,12814,19732,27512,26427,45803,37193,30502,22312,67244,32223,39682,46545,64364,66252


##### For the Fly DF we expect the input and output to be the same as all the related Chebi_ids are already present in the DF

In [1146]:
fly_related_chebi = get_related_chebi_data_v2(fly_compound_data)
fly_related_chebi

2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds
2021-03-30 10:17:54.222 | INFO     | __main__:

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
30768,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
30322,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
27957,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
52342,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48314,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
48315,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
25371,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3
36264,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3


In [1117]:
fly_compound_data

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
30768,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
30322,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
27957,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
52342,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48314,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,3.272052e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
48315,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,3.272052e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
25371,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.000735e+04,1.311626e+04,...,1.689397e+04,1.812973e+04,1.512663e+04,1.870111e+04,1.397613e+04,1.975729e+04,1.498166e+04,1.991403e+04,1.785374e+04,2.120331e+04
36264,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.000735e+04,1.311626e+04,...,1.689397e+04,1.812973e+04,1.512663e+04,1.870111e+04,1.397613e+04,1.975729e+04,1.498166e+04,1.991403e+04,1.785374e+04,2.120331e+04


In [1150]:
fly_r_chebi_new = get_related_chebi_data_v2(fly_new_data)
fly_r_chebi_new

2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11.896 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 7048 related compounds
2021-03-30 10:21:11

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
32864,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
32862,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
60039,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
30768,541035,363208,1.28377e+06,433412,512450,533919,561412,632462,236013,175521,...,417031,164268,247820,322602,228631,224659,384300,361866,171162,171142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48316,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
18036,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
25371,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3
36263,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3


In [1151]:
fly_new_data

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,2.705256e+08,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
30768,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
30322,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
27957,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
52342,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90031,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.644200e+04,2.975845e+04,1.682612e+04,9.900591e+03
65114,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00,6.110493e+03,1.083654e+04,...,1.340211e+04,1.022194e+04,1.403081e+04,1.048442e+04,1.191305e+04,1.353725e+04,9.332865e+03,1.020300e+04,8.965938e+03,1.151025e+04
22990,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,...,2.379862e+04,1.879203e+04,2.216306e+04,2.543246e+04,2.844122e+04,2.477530e+04,2.306076e+04,2.287044e+04,2.864523e+04,2.137121e+04
39949,1.597361e+04,1.108535e+04,0.0,0.0,0.000000e+00,0.0,0.0,1.789832e+04,2.300808e+04,3.272052e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [1159]:
fly_r_chebi_new[fly_r_chebi_new['CAR_F1.mzXML']==78386424.0]

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,78386400.0,97200500.0,156960000.0,77795800.0,270526000.0,294458000.0,292876000.0,214637000.0,60363800.0,47209200.0,...,276179000.0,159044000.0,275950000.0,296183000.0,339110000.0,227686000.0,403786000.0,414714000.0,237490000.0,209447000.0
32864,78386400.0,97200500.0,156960000.0,77795800.0,270526000.0,294458000.0,292876000.0,214637000.0,60363800.0,47209200.0,...,276179000.0,159044000.0,275950000.0,296183000.0,339110000.0,227686000.0,403786000.0,414714000.0,237490000.0,209447000.0
32862,78386400.0,97200500.0,156960000.0,77795800.0,270526000.0,294458000.0,292876000.0,214637000.0,60363800.0,47209200.0,...,276179000.0,159044000.0,275950000.0,296183000.0,339110000.0,227686000.0,403786000.0,414714000.0,237490000.0,209447000.0
60039,78386400.0,97200500.0,156960000.0,77795800.0,270526000.0,294458000.0,292876000.0,214637000.0,60363800.0,47209200.0,...,276179000.0,159044000.0,275950000.0,296183000.0,339110000.0,227686000.0,403786000.0,414714000.0,237490000.0,209447000.0


In [1164]:
new_fly_no_dupes = remove_dupes(fly_r_chebi_new)
new_fly_no_dupes

2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2021-03-30 10:40:43.050 | INFO     | __main__:remove_dupes:24 - Removing 19003 rows with duplicate identifiers
2

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
32864,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
32862,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
60039,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
30322,1.75112e+06,2.14452e+06,3.10643e+06,1.74946e+06,4.43596e+06,5.33424e+06,4.96729e+06,3.83886e+06,865574,629671,...,3.43841e+06,1.94548e+06,2.8826e+06,3.53427e+06,3.54482e+06,2.42029e+06,4.42795e+06,4.84221e+06,2.48241e+06,2.30356e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48316,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
18036,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
25371,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3
36263,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3


In [1173]:
fly_new_data

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,2.705256e+08,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
30768,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
30322,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
27957,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
52342,5.410354e+05,3.632083e+05,1283765.1,433412.4,5.124503e+05,533918.9,561412.0,6.324620e+05,2.360131e+05,1.755211e+05,...,4.170311e+05,1.642684e+05,2.478198e+05,3.226023e+05,2.286313e+05,2.246591e+05,3.842999e+05,3.618665e+05,1.711618e+05,1.711421e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90031,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.644200e+04,2.975845e+04,1.682612e+04,9.900591e+03
65114,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00,6.110493e+03,1.083654e+04,...,1.340211e+04,1.022194e+04,1.403081e+04,1.048442e+04,1.191305e+04,1.353725e+04,9.332865e+03,1.020300e+04,8.965938e+03,1.151025e+04
22990,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,...,2.379862e+04,1.879203e+04,2.216306e+04,2.543246e+04,2.844122e+04,2.477530e+04,2.306076e+04,2.287044e+04,2.864523e+04,2.137121e+04
39949,1.597361e+04,1.108535e+04,0.0,0.0,0.000000e+00,0.0,0.0,1.789832e+04,2.300808e+04,3.272052e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [1174]:
hc_chebi_int = list(map(int, hc_chebi))

fly_new_data.loc[hc_chebi_int]

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.838642e+07,9.720054e+07,1.569595e+08,77795784.00,2.705256e+08,2.944584e+08,2.928758e+08,2.146365e+08,60363772.00,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
17750,1.362246e+07,1.780536e+07,1.460376e+07,7677551.00,2.263544e+07,2.361069e+07,2.945433e+07,1.700735e+07,10878433.00,9.633883e+06,...,6.680130e+08,2.751695e+08,6.562522e+07,3.312658e+07,2.363081e+07,1.572302e+07,2.464594e+07,2.710239e+07,2.081450e+07,1.545813e+07
16414,3.153438e+05,1.924355e+05,3.453031e+05,120788.79,4.625572e+05,5.696252e+05,5.510569e+05,3.438654e+05,347019.34,2.275672e+05,...,2.770457e+05,1.527601e+05,1.603951e+05,1.501062e+05,1.913238e+05,1.128262e+05,2.752722e+05,2.691724e+05,1.384505e+05,1.161181e+05
16704,1.660930e+05,2.808621e+05,3.637339e+05,116451.73,6.415765e+05,7.204768e+05,7.264848e+05,4.160858e+05,121494.95,4.448218e+04,...,1.944350e+05,1.119636e+05,7.426326e+04,9.824895e+04,1.148161e+05,7.362861e+04,1.062186e+05,1.069206e+05,5.887425e+04,4.854882e+04
18132,5.804114e+06,9.359634e+06,4.036648e+06,4252969.00,2.784564e+07,3.244543e+07,3.497637e+07,2.677357e+07,4791101.00,7.403717e+06,...,4.281074e+05,1.078968e+06,1.159019e+06,8.767829e+06,1.943778e+06,5.812625e+05,2.858707e+05,3.279061e+05,2.278933e+05,5.828348e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30836,8.048993e+04,1.035259e+05,9.867276e+04,76495.30,2.379700e+05,1.960418e+05,2.337166e+05,1.336135e+05,102677.60,8.065252e+04,...,5.108084e+04,5.729564e+04,2.209581e+04,7.047417e+04,2.696200e+04,2.170558e+04,7.302790e+03,3.023916e+04,5.679957e+03,1.072465e+04
17148,8.264434e+05,9.039104e+05,7.938238e+05,832969.90,8.113076e+05,8.004005e+05,8.589425e+05,8.500993e+05,785332.94,8.300417e+05,...,3.735940e+05,4.886179e+04,6.092673e+04,7.254051e+04,4.897638e+04,6.906940e+04,8.245180e+04,5.833096e+04,5.076517e+04,4.340685e+04
17361,0.000000e+00,0.000000e+00,0.000000e+00,0.00,4.472018e+04,2.404662e+04,4.364692e+04,4.962652e+04,0.00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
16695,2.341337e+04,4.605546e+04,6.119332e+04,0.00,1.906519e+05,2.058674e+05,2.425068e+05,2.009113e+05,0.00,0.000000e+00,...,0.000000e+00,1.708008e+04,1.892531e+04,7.913866e+03,2.074022e+04,0.000000e+00,0.000000e+00,1.381059e+04,0.000000e+00,1.819591e+04


In [1167]:
new_fly_no_dupes.loc[hc_chebi]

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
17750,1.36225e+07,1.78054e+07,1.46038e+07,7.67755e+06,2.26354e+07,2.36107e+07,2.94543e+07,1.70073e+07,1.08784e+07,9.63388e+06,...,6.68013e+08,2.75169e+08,6.56252e+07,3.31266e+07,2.36308e+07,1.5723e+07,2.46459e+07,2.71024e+07,2.08145e+07,1.54581e+07
16414,315344,192435,345303,120789,462557,569625,551057,343865,347019,227567,...,277046,152760,160395,150106,191324,112826,275272,269172,138450,116118
16704,166093,280862,363734,116452,641576,720477,726485,416086,121495,44482.2,...,194435,111964,74263.3,98248.9,114816,73628.6,106219,106921,58874.2,48548.8
18132,5.80411e+06,9.35963e+06,4.03665e+06,4.25297e+06,2.78456e+07,3.24454e+07,3.49764e+07,2.67736e+07,4.7911e+06,7.40372e+06,...,428107,1.07897e+06,1.15902e+06,8.76783e+06,1.94378e+06,581262,285871,327906,227893,582835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30836,80489.9,103526,98672.8,76495.3,237970,196042,233717,133614,102678,80652.5,...,51080.8,57295.6,22095.8,70474.2,26962,21705.6,7302.79,30239.2,5679.96,10724.6
17148,826443,903910,793824,832970,811308,800400,858942,850099,785333,830042,...,373594,48861.8,60926.7,72540.5,48976.4,69069.4,82451.8,58331,50765.2,43406.8
17361,0,0,0,0,44720.2,24046.6,43646.9,49626.5,0,0,...,0,0,0,0,0,0,0,0,0,0
16695,23413.4,46055.5,61193.3,0,190652,205867,242507,200911,0,0,...,0,17080.1,18925.3,7913.87,20740.2,0,0,13810.6,0,18195.9


In [1190]:
## Difference between two databases.
from pandas._testing import assert_frame_equal

test1 = new_fly_no_dupes.loc[hc_chebi]
fly_new_data.index = fly_new_data.index.map(str)
test2 = fly_new_data.loc[hc_chebi]

t1 = test1.astype(object)
t2 = test2.astype(object)
assert_frame_equal(t1, t2)


In [1163]:
hc_chebi = ['17203',
 '17750',
 '16414',
 '16704',
 '18132',
 '27596',
 '17533',
 '32796',
 '18049',
 '28483',
 '17256',
 '6032',
 '17015',
 '46905',
 '16168',
 '15603',
 '37023',
 '28587',
 '15978',
 '35704',
 '17215',
 '45133',
 '47977',
 '18050',
 '27570',
 '16958',
 '18183',
 '32816',
 '506227',
 '16347',
 '1547',
 '17380',
 '29069',
 '18123',
 '18344',
 '10072',
 '16283',
 '17895',
 '16785',
 '16828',
 '16349',
 '17154',
 '17747',
 '16015',
 '73685',
 '17981',
 '18019',
 '28123',
 '38571',
 '70744',
 '17549',
 '18095',
 '42111',
 '33198',
 '4167',
 '16865',
 '17587',
 '16946',
 '17310',
 '16856',
 '17992',
 '17521',
 '17515',
 '16467',
 '17562',
 '16020',
 '16708',
 '4170',
 '15354',
 '16899',
 '18300',
 '19062',
 '16643',
 '17295',
 '17368',
 '15746',
 '17489',
 '17858',
 '17196',
 '15676',
 '17482',
 '17351',
 '30769',
 '16335',
 '16870',
 '17596',
 '16027',
 '18295',
 '15891',
 '21547',
 '30797',
 '27781',
 '16040',
 '84543',
 '17345',
 '73124',
 '45658',
 '17713',
 '16742',
 '16610',
 '73238',
 '73882',
 '30836',
 '17148',
 '17361',
 '16695',
 '52742']

In [1119]:
fly_related_chebi_no_dupes = remove_dupes(fly_related_chebi)
fly_related_chebi_no_dupes

2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2021-03-29 13:22:18.023 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers
2

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
30768,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
30322,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
27957,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
52342,7.83864e+07,9.72005e+07,1.5696e+08,7.77958e+07,2.70526e+08,2.94458e+08,2.92876e+08,2.14637e+08,6.03638e+07,4.72092e+07,...,2.76179e+08,1.59044e+08,2.7595e+08,2.96183e+08,3.3911e+08,2.27686e+08,4.03786e+08,4.14714e+08,2.3749e+08,2.09447e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48314,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
48315,15973.6,11085.3,0,0,0,0,0,17898.3,23008.1,32720.5,...,0,0,0,0,0,0,0,0,0,0
25371,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3
36264,0,0,0,0,0,0,0,0,10007.4,13116.3,...,16894,18129.7,15126.6,18701.1,13976.1,19757.3,14981.7,19914,17853.7,21203.3


In [1130]:
all_rows = len(fly_related_chebi_no_dupes)
unique_rows = len(fly_related_chebi_no_dupes['sak_h_3.mzXML'].unique())

In [1131]:
unique_rows

892

In [1132]:
print ("There are", unique_rows, "unique rows out of", all_rows, "total rows in the FlyMet data")

There are 892 unique rows out of 9244 total rows in the FlyMet data
