# Imports

In [3]:
import pandas as pd
import numpy as np
from io import StringIO
import os
import requests
import pip
import matplotlib.pyplot as plt


# Reading tools.tsv to dataframe and Preparing the table for lowering down the EDAM operations

In [5]:
tools_db = pd.read_table('tools.tsv')
tools2_db = tools_db[tools_db['To keep']==True]
tools3_db =tools2_db[tools2_db['EDAM operation'].notna()]
tmp = pd.concat( [tools3_db['Galaxy tool ids'], tools3_db['EDAM operation'].str.split(',', expand=True).add_prefix('SubColumn')], axis=1 )
#tmp.set_index('Galaxy tool ids', inplace=True)
#tools2_db.head()
#print(tools3_db)

tmp.head()

Unnamed: 0,Galaxy tool ids,SubColumn0,SubColumn1,SubColumn2,SubColumn3,SubColumn4,SubColumn5,SubColumn6,SubColumn7,SubColumn8,...,SubColumn10,SubColumn11,SubColumn12,SubColumn13,SubColumn14,SubColumn15,SubColumn16,SubColumn17,SubColumn18,SubColumn19
12,lotus2,Sequence feature detection,,,,,,,,,...,,,,,,,,,,
119,antismash,Sequence clustering,Gene prediction,Differential gene expression analysis,,,,,,,...,,,,,,,,,,
129,combine_metaphlan_humann,Aggregation,,,,,,,,,...,,,,,,,,,,
130,compare_humann2_output,Comparison,,,,,,,,,...,,,,,,,,,,
133,format_metaphlan2_output,Formatting,,,,,,,,,...,,,,,,,,,,


## Using OWL aginst the tmp dataframe to keep only the leaf node of the same branch for EDAM operations of one tool that belong to the same branch

In [6]:
import pandas as pd
from owlready2 import get_ontology, Thing
import copy

def process_row(row):
# Remove extra spaces from each column value in the row
    cleaned_row = [str(value).strip() for value in row[1:]]  # Exclude the first column ('Galaxy tool ids')
    
    # Convert the cleaned row to a list of EDAM terms using the provided ontology
    edam_ontology = get_ontology('https://edamontology.org/EDAM_1.25.owl').load()
    
    terms = cleaned_row
    classes = [edam_ontology.search_one(label=term) for term in terms]
    check_classes = [cla for cla in classes if cla is not None]  # Remove None values
    
    new_classes = []
    for cla in check_classes:
        try:
            # get all subclasses
            subclasses = list(cla.subclasses())
    
            # check if any of the other classes is a subclass
            include_class = True
            for subcla in subclasses:
                for cla2 in check_classes:
                    if subcla == cla2:
                        include_class = False
    
            # only keep the class if it is not a parent class
            if include_class:
                new_classes.append(cla)
        except Exception as e:
            print(f"Error processing class {cla}: {e}")

    # convert back to terms, skipping None values
    new_terms = [cla.label[0] for cla in new_classes if cla is not None]
    
    # Include the first column ('Galaxy tool ids') in the returned series
    return pd.Series([row[0], ', '.join(new_terms)])  # Combine the new terms with commas

def process_dataframe(input_df):
    # Apply the process_row function to each row in the dataframe
    output_df = input_df.apply(process_row, axis=1)
    
    # Set the header of the output dataframe
    output_df.columns = ['Galaxy tool ids', 'EDAM operation']
    
    return output_df

example_df = pd.DataFrame({
    'Galaxy tool ids': ['tool1', 'tool2', 'tool3'],
    'SubColumn0': ['Differential protein expression profiling', 'Sequence analysis', 'Data retrieval'],
    'SubColumn1': [' Gene expression profiling ', '  Phylogenetic tree analysis  ', ' Service invocation ']
})


processed_df = process_dataframe(tmp)
processed_df.to_csv('filteredtoolsEDAMoperations.csv', index=False)







## Adding an updated EDAM operation column (EDAM operation low classes) in the original tool.tsv file (after removing to Kepp = false) and save it to updatedtools.tsv

In [14]:
if 'EDAM operation' in tools2_db.columns and 'EDAM operation' in processed_df.columns:
    # Add the column from df2 to df1 with a new name
    tools2_db['EDAM operation low classes'] = tools2_db['EDAM operation']
else:
    print("Column 'EDAM operation' not found in one or both dataframes.")

tools2_db.to_csv('Updatedtools.tsv', index=False)

tools2_db.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tools2_db['EDAM operation low classes'] = tools2_db['EDAM operation']


Unnamed: 0,Galaxy wrapper id,Total tool usage (usegalaxy.eu),No. of tool users (2022-2023) (usegalaxy.eu),Galaxy tool ids,Description,bio.tool id,bio.tool name,bio.tool description,EDAM operation,EDAM topic,...,Galaxy wrapper source,Galaxy wrapper version,Conda id,Conda version,https://usegalaxy.org,https://usegalaxy.org.au,https://usegalaxy.eu,Reviewed,To keep,EDAM operation low classes
6,TreeBest,,,treebest_best,TreeBeST best,,,,,,...,https://github.com/TGAC/earlham-galaxytools/tr...,1.9.2.post0,treebest,1.9.2.post1,(0/1),(0/1),(1/1),True,True,
8,ete,1255.0,67.0,"ete_gene_csv_finder, ete_genetree_splitter, et...",Analyse phylogenetic trees using the ETE Toolkit,,,,,,...,https://github.com/TGAC/earlham-galaxytools/tr...,3.1.2,ete3,3.1.1,(0/7),(0/7),(7/7),True,True,
12,lotus2,936.0,114.0,lotus2,LotuS2 OTU processing pipeline,lotus2,lotus2,LotuS2 is a lightweight and user-friendly pipe...,Sequence feature detection,Metagenomics,...,https://github.com/TGAC/earlham-galaxytools/tr...,2.32,lotus2,2.32,(0/1),(0/1),(1/1),True,True,Sequence feature detection
16,abacas,,,abacas,Order and Orientate Contigs,,,,,,...,https://github.com/phac-nml/abacas,1.1,mummer,3.23,(0/1),(0/1),(0/1),True,True,
17,assemblystats,,,assemblystats,Summarise an assembly (e.g. N50 metrics),,,,,,...,https://github.com/phac-nml/galaxy_tools,1.1.0,perl-bioperl,1.7.8,(0/1),(0/1),(0/1),True,True,
