In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os, sys

In [3]:
import pandas as pd
from loguru import logger

In [4]:
sys.path.append('..')

from pyMultiOmics.constants import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info
from pyMultiOmics.analysis import *
from pyMultiOmics.query import *
from pyMultiOmics.pipelines import *


2021-03-26 15:40:31.229 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)
  from pandas.core.index import Index as PandasIndex


# Demonstration of pyMultiOmics

## Load the processed Zebrafish data from [1]

[1] [Rabinowitz, Jeremy S., et al. "Transcriptomic, proteomic, and metabolomic landscape of positional memory in the caudal fin of zebrafish." Proceedings of the National Academy of Sciences 114.5 (2017): E717-E726.](https://www.pnas.org/content/114/5/E717.short)

In [5]:
# Absolute path of the zebrafish test data folder; the relative path
# 'test_data/zebrafish_data' is resolved against the current working directory.
DATA_FOLDER = os.path.abspath(os.path.join('test_data', 'zebrafish_data'))
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks\\test_data\\zebrafish_data'

Read metabolomics data

In [6]:
# Compound table, indexed by its 'Identifier' column.
compound_data = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_data_chebi.csv'), index_col='Identifier')
# Sample design table, indexed by its 'sample' column.
compound_design = pd.read_csv(os.path.join(DATA_FOLDER, 'compound_design.csv'), index_col='sample')

In [7]:
# Fly metabolomics table from the sibling 'fly_data' folder (one level above DATA_FOLDER),
# indexed by its 'Identifier' column.
fly_compound_data = pd.read_csv(os.path.join(DATA_FOLDER, '../fly_data/fly_metabolomics.csv'), index_col='Identifier')

In [8]:
os.getcwd()

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks'

In [9]:
set_log_level_info()

1

In [10]:
type(compound_design)

pandas.core.frame.DataFrame

In [11]:
print(compound_design.head())

            group
sample           
distal_M1  Distal
distal_M2  Distal
distal_M3  Distal
distal_F1  Distal
distal_F2  Distal


In [12]:
compound_design

Unnamed: 0_level_0,group
sample,Unnamed: 1_level_1
distal_M1,Distal
distal_M2,Distal
distal_M3,Distal
distal_F1,Distal
distal_F2,Distal
distal_F3,Distal
middle_M1,Middle
middle_M2,Middle
middle_M3,Middle
middle_F1,Middle


In [13]:
compound_data.head()

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15724,5787534,4351239,4401036,8187282,8431125,5082056,5138937,7341351,7837293,9256269,9934066,10243285,7344406,5524811,4809250,9279874,9047339,9211255
17148,3430897,1877785,1225710,2326620,2421267,2595529,2003627,2120053,2269318,3220850,4596854,3155377,3760854,2658833,2488025,2506550,4000703,3292566
15611,112845,129977,122292,63219,50113,100343,156651,176682,379322,160906,56802,107161,235982,181200,142994,116132,94589,167280


## Methods for adding related chebi IDs

In [14]:
# Row-matching variant of the related-ChEBI expansion: a related row is only added when an
# identical row (same identifier AND same values) is not already in the dataframe.
# get_related_chebi_data_v2 offers the simpler "identifier not already present" rule.
def get_related_chebi_data(cmpd_data):
    """
    Add a duplicate row for each related ChEBI ID of the compounds in a dataframe.

    For every row whose identifier has an entry in the ChEBI relation dictionary, the
    row's values are copied under each related identifier, unless a row with that
    identifier and exactly the same values already exists (either in the input or
    among the rows added so far). Rows containing NaN never compare equal, so they
    are always treated as new.

    :param cmpd_data: dataframe whose index is named 'Identifier' and holds ChEBI IDs;
                      it is not modified
    :return: a new dataframe indexed by 'Identifier' (all identifiers as strings) with
             the original rows first, followed by the added related rows; column
             dtypes of the input are preserved
    """
    # dont want to modify the original df
    cmpd_data = cmpd_data.copy()

    # ensure index type is set to string, since get_chebi_relation_dict also returns string as the keys
    cmpd_data.index = cmpd_data.index.map(str)
    chebi_rel_dict = get_chebi_relation_dict()
    cmpd_data = cmpd_data.reset_index()
    id_pos = cmpd_data.columns.get_loc('Identifier')

    # Each row as a plain tuple, so the "is this exact row already there?" check is a
    # set lookup instead of a scan over the whole dataframe.
    rows = list(cmpd_data.itertuples(index=False, name=None))
    seen_rows = set(rows)

    new_positions = []    # position of the source row to copy
    new_identifiers = []  # identifier the copy is stored under
    for pos, values in enumerate(rows):
        for c in chebi_rel_dict.get(values[id_pos], ()):
            # keep related identifiers as strings so they compare equal to the
            # (stringified) identifiers already in the dataframe
            related_id = str(c)
            candidate = values[:id_pos] + (related_id,) + values[id_pos + 1:]
            if candidate not in seen_rows:
                seen_rows.add(candidate)
                new_positions.append(pos)
                new_identifiers.append(related_id)

    if new_positions:
        # copy the source rows in one go; this keeps the numeric dtypes intact
        related_rows = cmpd_data.iloc[new_positions].copy()
        related_rows['Identifier'] = new_identifiers
        cmpd_data = pd.concat([cmpd_data, related_rows], ignore_index=True)

    return cmpd_data.set_index('Identifier')


In [15]:
def get_related_chebi_data_v2(cmpd_data):
    """
    Add a duplicate row for each related ChEBI ID that is missing from a dataframe.

    Every input row is kept. Directly after a row, one copy of it is inserted for each
    related ChEBI ID (looked up in the ChEBI relation dictionary) that does not occur
    among the identifiers of the input. Two different input rows that share a missing
    related ID each contribute a copy, so the result can contain repeated identifiers.

    :param cmpd_data: dataframe whose index is named 'Identifier' and holds ChEBI IDs;
                      it is not modified
    :return: a new dataframe indexed by 'Identifier' (identifiers as strings); column
             dtypes of the input are preserved
    """
    cmpd_data = cmpd_data.copy()

    # ensure index type is set to string, since get_chebi_relation_dict also returns string as the keys
    cmpd_data.index = cmpd_data.index.map(str)
    cmpd_data = cmpd_data.reset_index()
    original_cmpds = set(cmpd_data['Identifier'])  # used for checking later

    # construct the related chebi dict
    chebi_rel_dict = get_chebi_relation_dict()

    # Record which source row each output row copies and under which identifier,
    # instead of collecting row Series: rebuilding the frame from Series would
    # turn every column into object dtype and fails on an empty input.
    source_positions = []
    identifiers = []
    for pos, current_identifier in enumerate(cmpd_data['Identifier']):

        # the current row itself always goes in
        source_positions.append(pos)
        identifiers.append(current_identifier)

        # add the related chebis, but only those not already present in the original compounds
        for c in chebi_rel_dict.get(current_identifier, ()):
            if c not in original_cmpds:
                source_positions.append(pos)
                identifiers.append(c)

    # materialise all rows in one positional selection
    df = cmpd_data.iloc[source_positions].copy()
    df['Identifier'] = identifiers
    df = df.set_index('Identifier')
    logger.info('Inserted %d related compounds' % (len(df) - len(cmpd_data)))
    return df

In [16]:
def remove_dupes(df):
    """
    Keep a single row per identifier, choosing the row with the largest row sum.

    :param df: dataframe indexed by 'Identifier'; rows may share an identifier
    :return: a new dataframe indexed by 'Identifier' in which, for every identifier that
             occurred more than once, only the row with the largest sum over its
             columns is kept (the first such row when sums tie)
    """
    flat = df.reset_index()

    doomed = []  # row labels scheduled for removal
    for _, members in flat.groupby(flat['Identifier']):

        # identifiers that occur once need no work
        if len(members) <= 1:
            continue

        # the identifier itself can't be summed, so leave it out of the totals
        totals = members.drop('Identifier', axis=1).sum(axis=1)
        keeper = totals.idxmax()

        # everything in the group except the keeper goes
        labels = members.index.tolist()
        labels.remove(keeper)
        doomed += labels

    # actually do the deletion here
    logger.info('Removing %d rows with duplicate identifiers' % (len(doomed)))
    return flat.drop(doomed).set_index('Identifier')

In [17]:
def get_chebi_relation_dict():
    """
    Return the dictionary of related ChEBI IDs, loading it from cache when possible.

    The cached dictionary is read with load_object. If that fails for any reason, the
    dictionary is rebuilt from "data/relation.tsv": rows whose TYPE is a conjugate
    base / conjugate acid / tautomer relation form a graph INIT_ID -> [FINAL_ID, ...],
    and each key is mapped to everything bfs_get_related reaches from it (the key
    itself excluded). The rebuilt dictionary is then saved, on a best-effort basis,
    to the same path the cache is loaded from.

    :return: Dict with structure Chebi_id: [related_chebi_ids], all IDs as strings
    :raises FileNotFoundError: if the cache cannot be loaded and data/relation.tsv
                               does not exist
    """
    import traceback  # local import: nothing visible in this notebook imports it

    CHEBI_BFS_RELATION_DICT = 'chebi_bfs_relation_dict.pkl'
    # one path for both loading and saving, so a rebuilt cache is found next time
    cache_path = "../pyMultiOmics/data/" + CHEBI_BFS_RELATION_DICT

    try:
        return load_object(cache_path)
    except Exception:
        # any load failure (missing or unreadable cache) falls through to a rebuild
        logger.info("Constructing %s " % CHEBI_BFS_RELATION_DICT)

    try:
        chebi_relation_df = pd.read_csv("data/relation.tsv", delimiter="\t")
    except FileNotFoundError:
        logger.error("data/relation.tsv must be present")
        raise

    # List of relationship we want in the dictionary
    select_list = ["is_conjugate_base_of", "is_conjugate_acid_of", "is_tautomer_of"]
    chebi_select_df = chebi_relation_df[chebi_relation_df.TYPE.isin(select_list)]

    # Gather the FINAL_IDs of each INIT_ID, in file order, so that each INIT_ID is unique
    graph = {}
    for init_id, final_id in zip(chebi_select_df.INIT_ID, chebi_select_df.FINAL_ID):
        graph.setdefault(str(init_id), []).append(str(final_id))

    chebi_bfs_relation_dict = {}
    for k in graph:
        r_chebis = bfs_get_related(graph, k)
        r_chebis.remove(k)  # remove original key from list
        chebi_bfs_relation_dict[k] = r_chebis

    try:
        logger.info("saving chebi_relation_dict")
        # NOTE(review): save_object is not defined in the visible notebook; presumably
        # it arrives through one of the pyMultiOmics star imports - confirm.
        save_object(chebi_bfs_relation_dict, cache_path)
    except Exception as e:
        # saving is best effort: report the problem but still return the result
        logger.error("Pickle didn't work because of %s " % e)
        traceback.print_exc()

    return chebi_bfs_relation_dict


In [18]:
import gzip
import pickle

def load_object(filename):
    """
    Read back a Python object stored as a gzip-compressed pickle.

    Unpickling can execute arbitrary code, so only use this on files you trust.

    :param filename: path of the gzip-compressed pickle file
    :return: the object stored in the file
    """
    archive = gzip.GzipFile(filename, 'rb')
    try:
        return pickle.load(archive)
    finally:
        # always release the file handle, also when unpickling fails
        archive.close()

In [19]:
def bfs_get_related(graph_dict, node):
    """
    Collect every node reachable from a start node, in breadth-first order.

    :param graph_dict: Dictionary of key: [neighbouring keys] pairs; a node that has no
                       entry of its own is treated as having no neighbours
    :param node: the key for which all related values should be returned
    :return: All reachable keys as a list, starting with ``node`` itself, each key once
    """
    from collections import deque  # local import keeps this cell self-contained

    visited = {node}       # set: constant-time "already seen?" checks
    queue = deque([node])  # deque: constant-time pops from the front
    related_keys = []

    while queue:
        k = queue.popleft()
        related_keys.append(k)

        # .get: a node that only ever appears as a neighbour has no entry of its own
        for neighbour in graph_dict.get(k, ()):
            if neighbour not in visited:
                visited.add(neighbour)
                queue.append(neighbour)

    return related_keys


In [20]:
def get_related_chebi_ids(chebi_ids):
    """
    Find the ChEBI IDs related to a collection of ChEBI IDs.

    :param chebi_ids: An iterable of chebi IDS; each one is converted to a string before
                      the lookup, because the relation dictionary is keyed by strings
    :return: A set of related chebi_IDs (strings) that are not already in ``chebi_ids``
    """
    chebi_relation_dict = get_chebi_relation_dict()

    # materialise once: allows any iterable and gives fast membership checks
    known_ids = {str(c_id) for c_id in chebi_ids}

    related_chebis = set()
    for c_id in known_ids:
        related_chebis.update(chebi_relation_dict.get(c_id, ()))

    # only report IDs the caller does not already have
    return related_chebis - known_ids

### For each chebi_id in the DF that has other related Chebi_ids, add a duplicate row for each related Chebi_id.

##### For the Zebrafish DF we expect the output to be larger than the input, because some related Chebi_ids are not yet present in the DF (the run below inserts 178 related compounds)


In [21]:
# Expand the zebrafish compounds with rows for related ChEBI IDs;
# get_related_chebi_data_v2 works on a copy, so compound_data itself is unchanged.
zebra_f_related_chebi = get_related_chebi_data_v2(compound_data)

2021-03-26 15:40:33.931 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 178 related compounds


In [22]:
compound_data

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15724,5787534,4351239,4401036,8187282,8431125,5082056,5138937,7341351,7837293,9256269,9934066,10243285,7344406,5524811,4809250,9279874,9047339,9211255
17148,3430897,1877785,1225710,2326620,2421267,2595529,2003627,2120053,2269318,3220850,4596854,3155377,3760854,2658833,2488025,2506550,4000703,3292566
15611,112845,129977,122292,63219,50113,100343,156651,176682,379322,160906,56802,107161,235982,181200,142994,116132,94589,167280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17202,23590107,17432727,11821183,18396545,10737880,18778788,19264992,8119525,25719881,26895322,20489811,23362861,30532905,7049672,9853902,19511285,30291647,31272774
17659,176648,128962,90159,138395,91016,137467,200091,151758,418224,265649,142543,177308,475141,202469,133316,206919,311047,269830
456216,68045,42897,39287,54154,35470,50695,68939,76078,127354,95864,69298,78328,136188,66532,71612,73803,111415,112341
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472


In [23]:
zebra_f_related_chebi

Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
58389,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15428,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
32507,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
65180,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
58189,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
15919,33991,21340,16393,22992,12814,19732,27512,26427,45803,37193,30502,22312,67244,32223,39682,46545,64364,66252


In [24]:
# Where several rows share an identifier, keep only the one with the largest row sum.
zebra_f_related_chebi_no_dupes = remove_dupes(zebra_f_related_chebi)
zebra_f_related_chebi_no_dupes

2021-03-26 15:40:34.247 | INFO     | __main__:remove_dupes:24 - Removing 0 rows with duplicate identifiers


Unnamed: 0_level_0,distal_M1,distal_M2,distal_M3,distal_F1,distal_F2,distal_F3,middle_M1,middle_M2,middle_M3,middle_F1,middle_F2,middle_F3,proximal_M1,proximal_M2,proximal_M3,proximal_F1,proximal_F2,proximal_F3
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18139,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
58389,75170,57052,39170,84057,38608,64126,50214,75680,165178,121856,77061,98015,113765,96098,84198,117644,169459,169669
57305,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
15428,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
32507,64511,33658,23565,52102,49508,37498,30417,55728,88519,103871,45974,73101,72725,66008,54220,95341,110192,291598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17552,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
65180,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
58189,75053,37911,27336,38910,24157,51875,90259,74022,168126,100414,30025,62006,183069,60881,35713,55937,95041,126472
15919,33991,21340,16393,22992,12814,19732,27512,26427,45803,37193,30502,22312,67244,32223,39682,46545,64364,66252


##### For the Fly DF we expect no rows to be inserted, as all the related Chebi_ids are already present in the DF

In [25]:
# Same expansion for the fly compounds; fly_compound_data itself is unchanged.
fly_related_chebi = get_related_chebi_data_v2(fly_compound_data)

2021-03-26 15:40:38.453 | INFO     | __main__:get_related_chebi_data_v2:36 - Inserted 0 related compounds


In [26]:
fly_compound_data

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
30768,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
30322,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
27957,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
52342,7.838642e+07,9.720054e+07,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,2.146365e+08,6.036377e+07,4.720918e+07,...,2.761787e+08,1.590437e+08,2.759497e+08,2.961829e+08,3.391098e+08,2.276857e+08,4.037859e+08,4.147139e+08,2.374896e+08,2.094466e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48314,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,3.272052e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
48315,1.597361e+04,1.108535e+04,0.0,0.0,0.0,0.0,0.0,1.789832e+04,2.300808e+04,3.272052e+04,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
25371,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.000735e+04,1.311626e+04,...,1.689397e+04,1.812973e+04,1.512663e+04,1.870111e+04,1.397613e+04,1.975729e+04,1.498166e+04,1.991403e+04,1.785374e+04,2.120331e+04
36264,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.000735e+04,1.311626e+04,...,1.689397e+04,1.812973e+04,1.512663e+04,1.870111e+04,1.397613e+04,1.975729e+04,1.498166e+04,1.991403e+04,1.785374e+04,2.120331e+04


In [27]:
fly_related_chebi

Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
30768,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
30322,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
27957,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
52342,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48314,15973.613,11085.349,0.0,0.0,0.0,0.0,0.0,17898.322,23008.082,32720.518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48315,15973.613,11085.349,0.0,0.0,0.0,0.0,0.0,17898.322,23008.082,32720.518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10007.353,13116.258,...,16893.97,18129.729,15126.627,18701.113,13976.128,19757.29,14981.658,19914.031,17853.74,21203.312
36264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10007.353,13116.258,...,16893.97,18129.729,15126.627,18701.113,13976.128,19757.29,14981.658,19914.031,17853.74,21203.312


In [28]:
# Where several rows share an identifier, keep only the one with the largest row sum.
fly_related_chebi_no_dupes = remove_dupes(fly_related_chebi)
fly_related_chebi_no_dupes

2021-03-26 15:40:44.847 | INFO     | __main__:remove_dupes:24 - Removing 21663 rows with duplicate identifiers


Unnamed: 0_level_0,CAR_F1.mzXML,CAR_F2.mzXML,CAR_F3.mzXML,CAR_F4.mzXML,CAR_M1.mzXML,CAR_M2.mzXML,CAR_M3.mzXML,CAR_M4.mzXML,FM_BRN_F1.mzXML,FM_BRN_F2.mzXML,...,sak_f_3.mzXML,sak_f_4.mzXML,sak_g_1.mzXML,sak_g_2.mzXML,sak_g_3.mzXML,sak_g_4.mzXML,sak_h_1.mzXML,sak_h_2.mzXML,sak_h_3.mzXML,sak_h_4.mzXML
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17203,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
30768,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
30322,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
27957,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
52342,78386424.0,97200536.0,156959540.0,77795784.0,270525570.0,294458370.0,292875800.0,214636540.0,60363772.0,47209180.0,...,276178660.0,159043710.0,275949660.0,296182900.0,339109820.0,227685700.0,403785920.0,414713920.0,237489580.0,209446620.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48314,15973.613,11085.349,0.0,0.0,0.0,0.0,0.0,17898.322,23008.082,32720.518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48315,15973.613,11085.349,0.0,0.0,0.0,0.0,0.0,17898.322,23008.082,32720.518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10007.353,13116.258,...,16893.97,18129.729,15126.627,18701.113,13976.128,19757.29,14981.658,19914.031,17853.74,21203.312
36264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10007.353,13116.258,...,16893.97,18129.729,15126.627,18701.113,13976.128,19757.29,14981.658,19914.031,17853.74,21203.312
