In [1]:
import os
import pandas as pd

In [3]:
# Load the merged UNITE / FungalTraits sheet from the metadata workbook.
taxonomy_file = "/Users/cheesemania/PycharmProjects/mscthesis_wrkdir/metadata/fungal-traits/UNITE-FungalTraits-merging.xlsx"
taxonomy_df = pd.read_excel(
    io=taxonomy_file,
    sheet_name="UNITE-FungalTraits-merged_all",
)

In [7]:
# Keep only the join key ('Feature ID') and the annotation columns that are
# attached to the feature-importance tables further down.
taxonomy_df_short = taxonomy_df.loc[
    :,
    ["Feature ID", "Taxon", "Fruitbody_type_template", "primary_lifestyle"],
]

In [5]:
# Root directory whose immediate sub-directories are treated as project folders.
base_dir = "/Users/cheesemania/PycharmProjects/mscthesis_wrkdir/src-analysis/ML/Rarefied"

In [6]:
# Every immediate sub-directory of base_dir counts as a project folder;
# plain files are ignored. Order is whatever os.listdir returns.
project_folders = list(
    filter(
        lambda name: os.path.isdir(os.path.join(base_dir, name)),
        os.listdir(base_dir),
    )
)
print(project_folders)

['PRJNA813503', 'PRJEB38930', 'PRJNA607176', 'PRJNA590898', 'PRJNA756382', 'PRJNA735021', 'PRJEB42375', 'PRJNA662173', 'PRJNA647266']


In [9]:
# For every project folder and every result variant, take the highest-ranked
# rows of feature_importance.tsv, attach the taxonomy/trait columns from
# taxonomy_df_short, and write the annotated table next to the input file.
top_n = 20  # number of most important features kept per table

for project in project_folders:
    project_path = os.path.join(base_dir, project)

    # Two result variants are processed per project:
    # "ML-results-non-filtered" and "ML-results-filt-nonresident".
    for result_type in ['ML-results-non-filtered', 'ML-results-filt-nonresident']:
        result_path = os.path.join(project_path, result_type, 'RF-ncv-classifier-ten-fold')

        feature_importance_file = os.path.join(result_path, 'feature_importance.tsv')

        # A project may lack one of the variants; skip it with a message
        # instead of aborting the whole batch on the first missing file.
        if not os.path.isfile(feature_importance_file):
            print(f"Skipped {project}/{result_type}: {feature_importance_file} not found")
            continue

        feature_importance_df = pd.read_csv(feature_importance_file, sep='\t')

        # Rank by 'importance' (largest first) and keep the top rows.
        top_20_important_features = (
            feature_importance_df
            .sort_values(by='importance', ascending=False)
            .head(top_n)
        )

        # Left join keeps every top feature even when it has no taxonomy match;
        # unmatched features get empty annotation columns.
        # NOTE(review): if 'Feature ID' is not unique in taxonomy_df_short the
        # join will emit more than top_n rows - confirm the sheet has one row
        # per feature.
        merged_df = pd.merge(
            top_20_important_features,
            taxonomy_df_short,
            left_on='id',
            right_on='Feature ID',
            how='left',
        )

        # Save to a new TSV file alongside the input.
        output_file = os.path.join(result_path, 'important_features_top_20_with_taxonomy.tsv')
        merged_df.to_csv(output_file, sep='\t', index=False)

        print(f"Processed {project}/{result_type} and saved the result to {output_file}")

Processed PRJNA813503/ML-results-non-filtered and saved the result to /Users/cheesemania/PycharmProjects/mscthesis_wrkdir/src-analysis/ML/Rarefied/PRJNA813503/ML-results-non-filtered/RF-ncv-classifier-ten-fold/important_features_top_20_with_taxonomy.tsv
Processed PRJNA813503/ML-results-filt-nonresident and saved the result to /Users/cheesemania/PycharmProjects/mscthesis_wrkdir/src-analysis/ML/Rarefied/PRJNA813503/ML-results-filt-nonresident/RF-ncv-classifier-ten-fold/important_features_top_20_with_taxonomy.tsv
Processed PRJEB38930/ML-results-non-filtered and saved the result to /Users/cheesemania/PycharmProjects/mscthesis_wrkdir/src-analysis/ML/Rarefied/PRJEB38930/ML-results-non-filtered/RF-ncv-classifier-ten-fold/important_features_top_20_with_taxonomy.tsv
Processed PRJEB38930/ML-results-filt-nonresident and saved the result to /Users/cheesemania/PycharmProjects/mscthesis_wrkdir/src-analysis/ML/Rarefied/PRJEB38930/ML-results-filt-nonresident/RF-ncv-classifier-ten-fold/important_featur