# Exploratory Data Analysis


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

seed_value = 42

## Loading the dataset

In [2]:
# Read the .txt file, set header to be the first row
orig_df = pd.read_csv("./dataset/Bacteria Genus Relative Abundance.txt", delimiter='\t').T
# transpose and set the col "name" to be the index

# # Set the first row as the column headers
orig_df.columns = orig_df.iloc[0]

# # Drop the first row, as it's now the header
orig_df = orig_df[1:]

# # Reset the index to default numeric index
orig_df.reset_index(inplace=True)
# df
orig_df.rename(columns={"index": "name"}, inplace=True)
orig_df = orig_df.rename_axis(None, axis=1)

In [3]:
orig_df

Unnamed: 0,name,Simonsiella,Treponema,Campylobacter,Helicobacter,Paracoccus,Comamonas,Pseudomonas,Xanthomonas,Agrobacterium,...,Merdibacter,Massilioclostridium,Criibacterium,Fournierella,Lagierella,Urmitella,Colibacter,Alterileibacterium,Negativibacillus,Duodenibacillus
0,TCGA-CG-5720-01A,0.0,0.0,0.0,0.89505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-BR-4292-11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-CN-4741-01A,0.0,0.0,0.01047,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-BR-6801-01A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-AA-A01P-11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615,TCGA-CG-5719-01A,0.0,0.0,0.0,0.106557,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
616,TCGA-CQ-5329-01A,0.0,0.175564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
617,TCGA-CQ-7068-01A,0.0,0.33506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
618,TCGA-CG-4455-01A,0.0,0.0,0.0,0.0,0.0,0.0,0.014781,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
meta_df = pd.read_csv("./dataset/Sample Metadata.txt", delimiter="\t")
meta_df.rename(columns={"Unnamed: 0": "name"}, inplace=True)
meta_df


Unnamed: 0,name,case_id,Sample,biospecimen_sequence,composition,current_weight,days_to_collection,days_to_sample_procurement,freezing_method,initial_weight,...,percent_monocyte_infiltration,percent_necrosis,percent_neutrophil_infiltration,percent_normal_cells,percent_stromal_cells,percent_tumor_cells,percent_tumor_nuclei,project,ffpe_tumor_slide_submitted,HistologicalType
0,TCGA-CG-5720-01A,7c9ea4fa-4cbc-4941-945a-e531e1d48304,PT,,,,,,,,...,,3.5,,0.0,12.5,84.0,75.0,STAD,NO,
1,TCGA-BR-4292-11A,703d3e86-32f4-44ae-bd88-c02378fc2269,STN,,,,,,,,...,,,,,,,,STAD,,
2,TCGA-CN-4741-01A,277b02e9-ded5-4980-845d-af53690000ac,PT,,,,,,,,...,,,,,,,,HNSC,,SCC
3,TCGA-BR-6801-01A,24fed326-bdcf-4c20-a06e-7c1c3d6c9cc5,PT,,,,,,,,...,,5.0,,0.0,25.0,70.0,75.0,STAD,NO,
4,TCGA-AA-A01P-11A,13ae9d83-a22f-451f-88bb-686051725cf3,STN,,,,2403.0,,,140.0,...,9.0,,0.0,100.0,,0.0,,COAD,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615,TCGA-CG-5719-01A,263f67e0-0a28-42e6-b3f3-1ddd0d397220,PT,,,,,,,,...,,0.0,,0.0,20.0,80.0,70.0,STAD,NO,
616,TCGA-CQ-5329-01A,02dcc11f-4f0e-4c9e-8d96-d22d47beef5d,PT,,,,,,,,...,,,,,,,,HNSC,,SCC
617,TCGA-CQ-7068-01A,8ebe8c25-5ef9-42d4-9414-8313227b673f,PT,,,,,,,,...,,,,,,,,HNSC,,SCC
618,TCGA-CG-4455-01A,8a173d98-20a1-4c84-86c1-97818e1c665a,PT,,,,,,,,...,,0.0,,0.0,5.0,95.0,82.5,STAD,NO,


In [5]:
# generas that were kept (provided in the paper)
genera_list = [
    'Simonsiella', 'Treponema', 'Campylobacter', 'Helicobacter',
    'Paracoccus', 'Comamonas', 'Pseudomonas', 'Xanthomonas',
    'Agrobacterium', 'Bradyrhizobium', 'Acinetobacter', 'Neisseria',
    'Eikenella', 'Citrobacter', 'Enterobacter', 'Escherichia',
    'Klebsiella', 'Shigella', 'Haemophilus', 'Bacteroides',
    'Butyrivibrio', 'Porphyromonas', 'Prevotella', 'Roseburia',
    'Fusobacterium', 'Desulfovibrio', 'Megasphaera', 'Selenomonas',
    'Capnocytophaga', 'Peptostreptococcus', 'Ruminococcus', 'Staphylococcus',
    'Streptococcus', 'Enterococcus', 'Gemella', 'Atopobium',
    'Clostridium', 'Lactobacillus', 'Actinomyces', 'Bifidobacterium',
    'Corynebacterium', 'Eubacterium', 'Propionibacterium', 'Mycobacterium',
    'Gordonia', 'Mycoplasma', 'Thermosipho', 'Gardnerella',
    'Lachnospira', 'Veillonella', 'Leptotrichia', 'Rothia',
    'Kingella', 'Phascolarctobacterium', 'Coprococcus', 'Bilophila',
    'Dialister', 'Sutterella', 'Tissierella', 'Johnsonella',
    'Catonella', 'Filifactor', 'Abiotrophia', 'Lautropia',
    'Mitsuokella', 'Chryseobacterium', 'Centipeda', 'Eggerthella',
    'Cryptobacterium', 'Pedobacter', 'Mogibacterium', 'Coprobacillus',
    'Collinsella', 'Ensifer', 'Pseudoramibacter', 'Granulicatella',
    'Bulleidia', 'Solobacterium', 'Olsenella', 'Catenibacterium',
    'Anaeroglobus', 'Peptoniphilus', 'Anaerococcus', 'Sneathia',
    'Shuttleworthia', 'Varibaculum', 'Dorea', 'Tannerella',
    'Scardovia', 'Faecalibacterium', 'Ottowia', 'Alistipes',
    'Akkermansia', 'Marvinbryantia', 'Oribacterium', 'Odoribacter',
    'Subdoligranulum', 'Parabacteroides', 'Gulbenkiania', 'Barnesiella',
    'Aggregatibacter', 'Alloscardovia', 'Adlercreutzia', 'Oscillibacter',
    'Parvimonas', 'Blautia', 'Butyricimonas', 'Paraprevotella',
    'Pyramidobacter', 'Lachnoanaerobaculum', 'Stomatobaculum', 'Eggerthia',
    'Alloprevotella', 'Lelliottia', 'Coprobacter', 'Intestinimonas',
    'Fusicatenibacter', 'Lachnoclostridium', 'Tyzzerella', 'Faecalitalea',
    'Holdemanella', 'Mageeibacillus', 'Hungatella', 'Pseudopropionibacterium',
    'Peptoanaerobacter', 'Emergencia', 'Prevotellamassilia', 'Criibacterium',
    'Fournierella', 'Negativibacillus', 'Duodenibacillus'
]

# final columns (Added name and label columns)
processed_cols = [
    "name", 'Simonsiella', 'Treponema', 'Campylobacter', 'Helicobacter',
    'Paracoccus', 'Comamonas', 'Pseudomonas', 'Xanthomonas',
    'Agrobacterium', 'Bradyrhizobium', 'Acinetobacter', 'Neisseria',
    'Eikenella', 'Citrobacter', 'Enterobacter', 'Escherichia',
    'Klebsiella', 'Shigella', 'Haemophilus', 'Bacteroides',
    'Butyrivibrio', 'Porphyromonas', 'Prevotella', 'Roseburia',
    'Fusobacterium', 'Desulfovibrio', 'Megasphaera', 'Selenomonas',
    'Capnocytophaga', 'Peptostreptococcus', 'Ruminococcus', 'Staphylococcus',
    'Streptococcus', 'Enterococcus', 'Gemella', 'Atopobium',
    'Clostridium', 'Lactobacillus', 'Actinomyces', 'Bifidobacterium',
    'Corynebacterium', 'Eubacterium', 'Propionibacterium', 'Mycobacterium',
    'Gordonia', 'Mycoplasma', 'Thermosipho', 'Gardnerella',
    'Lachnospira', 'Veillonella', 'Leptotrichia', 'Rothia',
    'Kingella', 'Phascolarctobacterium', 'Coprococcus', 'Bilophila',
    'Dialister', 'Sutterella', 'Tissierella', 'Johnsonella',
    'Catonella', 'Filifactor', 'Abiotrophia', 'Lautropia',
    'Mitsuokella', 'Chryseobacterium', 'Centipeda', 'Eggerthella',
    'Cryptobacterium', 'Pedobacter', 'Mogibacterium', 'Coprobacillus',
    'Collinsella', 'Ensifer', 'Pseudoramibacter', 'Granulicatella',
    'Bulleidia', 'Solobacterium', 'Olsenella', 'Catenibacterium',
    'Anaeroglobus', 'Peptoniphilus', 'Anaerococcus', 'Sneathia',
    'Shuttleworthia', 'Varibaculum', 'Dorea', 'Tannerella',
    'Scardovia', 'Faecalibacterium', 'Ottowia', 'Alistipes',
    'Akkermansia', 'Marvinbryantia', 'Oribacterium', 'Odoribacter',
    'Subdoligranulum', 'Parabacteroides', 'Gulbenkiania', 'Barnesiella',
    'Aggregatibacter', 'Alloscardovia', 'Adlercreutzia', 'Oscillibacter',
    'Parvimonas', 'Blautia', 'Butyricimonas', 'Paraprevotella',
    'Pyramidobacter', 'Lachnoanaerobaculum', 'Stomatobaculum', 'Eggerthia',
    'Alloprevotella', 'Lelliottia', 'Coprobacter', 'Intestinimonas',
    'Fusicatenibacter', 'Lachnoclostridium', 'Tyzzerella', 'Faecalitalea',
    'Holdemanella', 'Mageeibacillus', 'Hungatella', 'Pseudopropionibacterium',
    'Peptoanaerobacter', 'Emergencia', 'Prevotellamassilia', 'Criibacterium',
    'Fournierella', 'Negativibacillus', 'Duodenibacillus', "label"
]

## Pre-processing

* We need to drop the normal samples because we're working with cancer-associated samples "PT".


In [6]:
# Add the sample and label columns from meta_df to the df
df = orig_df.copy()
df["Sample"] = meta_df["Sample"].to_list()
df["label"] = meta_df["project"].to_list()

df = df[df["Sample"] == "PT"].reset_index(drop=True) # Keep all PT samples
df = df[processed_cols] # filter out non-important genera features
df

Unnamed: 0,name,Simonsiella,Treponema,Campylobacter,Helicobacter,Paracoccus,Comamonas,Pseudomonas,Xanthomonas,Agrobacterium,...,Hungatella,Pseudopropionibacterium,Peptoanaerobacter,Emergencia,Prevotellamassilia,Criibacterium,Fournierella,Negativibacillus,Duodenibacillus,label
0,TCGA-CG-5720-01A,0.0,0.0,0.0,0.89505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,STAD
1,TCGA-CN-4741-01A,0.0,0.0,0.01047,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HNSC
2,TCGA-BR-6801-01A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,STAD
3,TCGA-IG-A3I8-01A,0.0,0.0,0.0,0.067717,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ESCA
4,TCGA-L5-A4OT-01A,0.0,0.0,0.012202,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ESCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,TCGA-CG-5719-01A,0.0,0.0,0.0,0.106557,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,STAD
508,TCGA-CQ-5329-01A,0.0,0.175564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.136613,0.0,0.0,0.0,0.0,0.0,0.0,HNSC
509,TCGA-CQ-7068-01A,0.0,0.33506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011534,0.0,0.0,0.0,0.0,0.0,0.0,HNSC
510,TCGA-CG-4455-01A,0.0,0.0,0.0,0.0,0.0,0.0,0.014781,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,STAD


## Generating Experiment-Level Datasets

In [7]:
classes = ["HNSC", "STAD", "COAD", "ESCA", "READ"]

In [8]:
df["label"].value_counts()

label
HNSC    155
STAD    127
COAD    125
ESCA     60
READ     45
Name: count, dtype: int64

In [9]:
def exp_1_data_loader(dataframe, label_column, classes, train_test=True):
    """
    Generate a one-vs-all dataset for a specific class for experiment 1

    Parameters:
    - dataframe: pd.DataFrame, the input DataFrame.
    - label_column: str, the column name representing the labels.
    - classes: list, the classes.

    Returns:
    - dataset_dict: a dictionary where the keys is the targeted class and the values are its corresponding features and labels
    """

    dataset_dict = {}

    for i in classes:
        positive_class = i
        dframe = dataframe.copy()
        dframe['label'] = [1 if x == positive_class else 0 for x in dataframe[label_column]]
        print(dframe.label.value_counts())
        X = dframe.drop(["name", "label"], axis=1)
        y = dframe["label"]
        if train_test:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=seed_value)
            dataset_dict[positive_class] = {"train": (X_train, y_train),
                                            "test": (X_test, y_test)}
        else:
            dataset_dict[positive_class] = {"feature": X, 
                                            "label": y}

    return dataset_dict


In [10]:
exp1_datasets = exp_1_data_loader(df, "label", classes, train_test=True)

label
0    357
1    155
Name: count, dtype: int64
label
0    385
1    127
Name: count, dtype: int64
label
0    387
1    125
Name: count, dtype: int64
label
0    452
1     60
Name: count, dtype: int64
label
0    467
1     45
Name: count, dtype: int64


Hyper-parameter Value Range
n_estimators 1 to 400
criterion Gini impurity or Shannon information gain
min_samples_split 2 to 100
min_samples_leaf 1 to 20
max_depth 1 to 50 when specified
max_features 1 to maximum number of features

### Experiment 1: One-vs-All classification. 
* HNSC vs All
* STAD vs All
* COAD vs All
* ESCA vs All
* READ vs All

Study 1: one‑versus‑All. In the one-versus-all study, a total of 5 independent analyses were conducted. For each analysis, a different cancer class was targeted (positive class) and the samples from the remaining classes were grouped together into one major class (negative class), resulting in a binary classification problem. Results are summarized in Table 1 and can be analyzed in greater detail from Supplementary Tables S3 to S7. From the balanced accuracy of the RF models in the test split, the five cancers form two groups with distinct performances. HNSC, STAD, and COAD achieved balanced accuracy values ranging from 87% to 96% while for ESCA and READ, results show an increase in classification difficulty, with performances for both cases below Figure 2. Experiment pipeline for the learning models development, with the isolated implementation of the RF algorithm (Experiment 1), the implementation of dimensionality reduction (DR) and feature engineering (FE) alongside the tuned RF (Experiments 2 and 3, respectively), and the final application of oversampling (O) with the previous techniques (Experiments 4 and 5). Red squares indicate that hyper-parameter tuning is being performed to a specific technique while red arrows indicate the use of a technique tuned in a former experiment

The microbial composition of COAD was the most discriminative, with all samples of this type of cancer being correctly classified by the model. On the other hand, the confusion matrix from the ESCA analysis reveals that the major factor for the poor balanced accuracy of the RF model comes from the low accuracy of 64% when classifying ESCA samples, a significant difference in performance in comparison with the results from the other cancer types. In part, the results obtained from the one-versus-all study demonstrated that microbial data can be successfully applied to classify distinct cancer types independently with promising reliability. Nonetheless, there are also key discrepancies in performance among the different cancers. Although worse results coincide with classes with lower sample size, they may also suggest distinct degrees of complexity and the necessity to adapt ML implementations and the microbial information provided according to cancer type. Furthermore, the investigation conducted by Poore et al.18 encompassed a similar study of a supervised ML model on a one-versus-all approach discriminating, among others, these respective cancer types. Nevertheless, the large number of different cancer types also included in the analysis allied with the large discrepancy in sample size, in some cases having more than 10 times the number of samples provided by TCMA for specific cancer, invalidated the comparison of results.

In [11]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score, accuracy_score

Class:  HNSC
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 111}
Accuracy on Test Set: 0.8701298701298701

In [12]:
def perform_cross_validation_and_evaluate(dataset_dict, classes, cv_n_splits=5, n_jobs=12):
    report = {}
    for c in classes:
        print("Class: ", c)
        features = dataset_dict[c]["train"][0]
        target = dataset_dict[c]["train"][1]

        # Define the classifier
        rf = RandomForestClassifier()

        # Define the hyperparameters and their potential values for the grid search
        param_grid = {
            'n_estimators': [1, 50, 100, 150, 200, 250, 300, 350, 400],
            'criterion' : ["gini", "entropy"],
            'max_depth': [1,10,20,30,40,50],
            'min_samples_split': [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
            'min_samples_leaf': [1, 10, 20]
        }

        cv = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=seed_value)

        # Create the GridSearchCV object
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid,
            scoring=make_scorer(balanced_accuracy_score),  # Choose an appropriate metric for your problem
            cv=cv,
            n_jobs=-1  # Use all available processors
)

        # Perform the grid search
        grid_search.fit(features, target)

        # Print the best hyperparameters
        print("Best Hyperparameters:", grid_search.best_params_)

        # Get the best model
        best_model = grid_search.best_estimator_

        # Evaluate the best model on your test set
        # Assuming you have X_test and y_test from your data
        # Modify this based on your actual test set
        predictions = best_model.predict(dataset_dict[c]["test"][0])
        accuracy = balanced_accuracy_score(dataset_dict[c]["test"][1], predictions)
        print("Accuracy on Test Set:", accuracy)

        report[c] =  {"model": best_model,
                    "best_hyperparameters": grid_search.best_params_,
                    "accuracy_on_test": accuracy}
        
    return report



In [13]:
exp1_report = perform_cross_validation_and_evaluate(exp1_datasets, classes)
exp1_report

Class:  HNSC


1620 fits failed out of a total of 17820.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-pack

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy on Test Set: 0.7983091787439613
Class:  STAD


1620 fits failed out of a total of 17820.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-pack

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy on Test Set: 0.780852994555354
Class:  COAD


1620 fits failed out of a total of 17820.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-pack

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Accuracy on Test Set: 0.8956442831215972
Class:  ESCA


1620 fits failed out of a total of 17820.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-pack

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 30, 'n_estimators': 1}
Accuracy on Test Set: 0.5408496732026143
Class:  READ
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 1}
Accuracy on Test Set: 0.7571428571428571


1620 fits failed out of a total of 17820.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-pack

{'HNSC': {'model': RandomForestClassifier(max_depth=20, min_samples_split=10),
  'best_hyperparameters': {'criterion': 'gini',
   'max_depth': 20,
   'min_samples_leaf': 1,
   'min_samples_split': 10,
   'n_estimators': 100},
  'accuracy_on_test': 0.7983091787439613},
 'STAD': {'model': RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_split=10,
                         n_estimators=200),
  'best_hyperparameters': {'criterion': 'entropy',
   'max_depth': 30,
   'min_samples_leaf': 1,
   'min_samples_split': 10,
   'n_estimators': 200},
  'accuracy_on_test': 0.780852994555354},
 'COAD': {'model': RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=300),
  'best_hyperparameters': {'criterion': 'gini',
   'max_depth': 20,
   'min_samples_leaf': 1,
   'min_samples_split': 10,
   'n_estimators': 300},
  'accuracy_on_test': 0.8956442831215972},
 'ESCA': {'model': RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_split=30,
          

In [260]:
exp2_report = perform_cross_validation_and_evaluate(exp1_datasets, classes)
exp2_report

Class:  HNSC
Best Hyperparameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 50, 'n_estimators': 400}
Accuracy on Test Set: 0.8701298701298701
Class:  STAD
Best Hyperparameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Accuracy on Test Set: 0.8831168831168831
Class:  COAD
Best Hyperparameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Accuracy on Test Set: 0.8701298701298701
Class:  ESCA
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy on Test Set: 0.8701298701298701
Class:  READ
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy on Test Set: 0.9090909090909091


{'HNSC': {'model': RandomForestClassifier(max_depth=30, min_samples_split=50, n_estimators=400),
  'best_hyperparameters': {'max_depth': 30,
   'min_samples_leaf': 1,
   'min_samples_split': 50,
   'n_estimators': 400},
  'accuracy_on_test': 0.8701298701298701},
 'STAD': {'model': RandomForestClassifier(max_depth=40, n_estimators=400),
  'best_hyperparameters': {'max_depth': 40,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 400},
  'accuracy_on_test': 0.8831168831168831},
 'COAD': {'model': RandomForestClassifier(max_depth=30, n_estimators=300),
  'best_hyperparameters': {'max_depth': 30,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 300},
  'accuracy_on_test': 0.8701298701298701},
 'ESCA': {'model': RandomForestClassifier(max_depth=20),
  'best_hyperparameters': {'max_depth': 20,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 100},
  'accuracy_on_test': 0.8701298701298701},
 'READ': {'model': RandomForestClas

In [14]:
exp1_report

{'HNSC': {'model': RandomForestClassifier(max_depth=20, min_samples_split=10),
  'best_hyperparameters': {'criterion': 'gini',
   'max_depth': 20,
   'min_samples_leaf': 1,
   'min_samples_split': 10,
   'n_estimators': 100},
  'accuracy_on_test': 0.7983091787439613},
 'STAD': {'model': RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_split=10,
                         n_estimators=200),
  'best_hyperparameters': {'criterion': 'entropy',
   'max_depth': 30,
   'min_samples_leaf': 1,
   'min_samples_split': 10,
   'n_estimators': 200},
  'accuracy_on_test': 0.780852994555354},
 'COAD': {'model': RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=300),
  'best_hyperparameters': {'criterion': 'gini',
   'max_depth': 20,
   'min_samples_leaf': 1,
   'min_samples_split': 10,
   'n_estimators': 300},
  'accuracy_on_test': 0.8956442831215972},
 'ESCA': {'model': RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_split=30,
          

### Experiment 2: With dimensionality reduction

Dimensionality reduction is a technique used in machine learning and statistics to reduce the number of features or variables in a dataset while preserving its essential information. The goal is to simplify the data and improve computational efficiency, mitigate the curse of dimensionality, and enhance the performance of machine learning models.

Algorithms:
* Non-Negative Matrix Factorization (NMF): NMF is a factorization technique that decomposes a matrix into two non-negative matrices. It is particularly useful for non-negative data, such as images or text, and is often applied in topic modeling and image processing.
* Latent Dirichlet Allocation (LDA): LDA is a probabilistic generative model used for topic modeling. It assumes that documents are mixtures of topics and that each word's presence is attributable to one of the document's topics. LDA helps discover the underlying topics in a collection of documents.


In [15]:
exp2_datasets = exp1_datasets

In [16]:
exp2_datasets["HNSC"]["train"][0]

Unnamed: 0,Simonsiella,Treponema,Campylobacter,Helicobacter,Paracoccus,Comamonas,Pseudomonas,Xanthomonas,Agrobacterium,Bradyrhizobium,...,Mageeibacillus,Hungatella,Pseudopropionibacterium,Peptoanaerobacter,Emergencia,Prevotellamassilia,Criibacterium,Fournierella,Negativibacillus,Duodenibacillus
155,0.0,0.010944,0.0,0.0,0.0,0.0,0.017489,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.107591,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
367,0.0,0.0,0.0,0.421787,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.023153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.206774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.pipeline import Pipeline

def perform_experiment_2(dataset_dict, classes, use_nmf=True, cv_n_splits=5, n_jobs=12):
    report = {}
    for c in classes:
        print("Class: ", c)
        features = dataset_dict[c]["train"][0]
        target = dataset_dict[c]["train"][1]

        # Define the Random Forest classifier
        rf_classifier = RandomForestClassifier()

        # Create a pipeline with NMF or LDA (based on the use_nmf switch) and Random Forest
        if use_nmf:
            dimensionality_reduction = NMF(init='random', random_state=42, max_iter=20000)  # Initialize NMF without specifying n_components
        else:
            dimensionality_reduction = LatentDirichletAllocation()  # Initialize LDA without specifying n_components

        pipeline = Pipeline([
            ('dimensionality_reduction', dimensionality_reduction),
            ('classifier', rf_classifier)
        ])

        # Define the hyperparameters and their potential values for the grid search
        param_grid = {
            'dimensionality_reduction__n_components': [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64] if use_nmf else list(range(1, 5)),
            'classifier__n_estimators': [50, 100, 150, 200, 250, 300, 350, 400],
            'classifier__criterion': ["gini", "entropy"],
            'classifier__max_depth': [1, 10, 20, 30, 40, 50],
            'classifier__min_samples_split': [2, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
            'classifier__min_samples_leaf': [1, 10, 20]
        }

        cv = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=seed_value)

        # Create the GridSearchCV object
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring=make_scorer(balanced_accuracy_score),  # Choose an appropriate metric for your problem
            cv=cv,
            n_jobs=n_jobs,  # Use all available processors
            return_train_score=True  # Set to True to calculate train scores
        )

        # Perform the grid search
        grid_search.fit(features, target)

        # Access cv_results_ attribute to get detailed results
        cv_results = grid_search.cv_results_

        # Print the best hyperparameters
        print("Best Hyperparameters:", grid_search.best_params_)

        # Get the best model
        best_model = grid_search.best_estimator_

        # Evaluate the best model on your test set
        # Assuming you have X_test and y_test from your data
        # Modify this based on your actual test set
        predictions = best_model.predict(dataset_dict[c]["test"][0])
        accuracy = balanced_accuracy_score(dataset_dict[c]["test"][1], predictions)
        print("Accuracy on Test Set:", accuracy)

        # Calculate mean train score and mean test score
        mean_train_score = cv_results['mean_train_score'][grid_search.best_index_]
        mean_test_score = cv_results['mean_test_score'][grid_search.best_index_]

        print("Mean Train Score:", mean_train_score)
        print("Mean Test Score:", mean_test_score)

        report[c] = {
            "model": best_model,
            "best_hyperparameters": grid_search.best_params_,
            "accuracy_on_test": accuracy,
            "mean_train_score": mean_train_score,
            "mean_test_score": mean_test_score
        }

    return report


In [41]:
exp2_report = perform_experiment_2(exp2_datasets, classes)

Class:  HNSC


In [35]:
## sample 

features = exp1_datasets["HNSC"]["train"][0]
target = exp1_datasets["HNSC"]["train"][1]

nmf_param_grid = {'n_components': [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64]}
lda_param_grid = {'n_components': list(range(1, 5))}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_value)

# Create the GridSearchCV object
# Apply NMF with Grid Search
nmf = NMF()
nmf_grid_search = GridSearchCV(nmf, nmf_param_grid, scoring=make_scorer(balanced_accuracy_score),cv=cv, return_train_score=True)
nmf_grid_search.fit(features, target)
 # Print the best hyperparameters
print("Best Hyperparameters:", nmf_grid_search.best_params_)



Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/utils/_response.py", line 109, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
AttributeError: 'NMF' object 

Best Hyperparameters: {'n_components': 4}


Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/utils/_response.py", line 109, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
AttributeError: 'NMF' object 

In [21]:

lda = LatentDirichletAllocation()
lda_grid_search = GridSearchCV(lda, lda_param_grid, scoring=make_scorer(balanced_accuracy_score),cv=cv)
lda_grid_search.fit(features,target)
 # Print the best hyperparameters
print("Best Hyperparameters:", lda_grid_search.best_params_)

# Get the best model
lda_best_model = lda_grid_search.best_estimator_


Traceback (most recent call last):
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/Users/gbaldonado/miniforge3/envs/ml_env/lib/python3.8/site-packages/sklearn/utils/_response.py", line 109, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
AttributeError: 'LatentDirich

Best Hyperparameters: {'n_components': 1}


In [20]:
print("Best Hyperparameters:", nmf_grid_search.best_params_)

# # Get the best model
# best_model = nmf_grid_search.best_estimator_

Best Hyperparameters: {'n_components': 4}


In [22]:
print("Best Hyperparameters:", lda_grid_search.best_params_)

# # Get the best model
# best_model = nmf_grid_search.best_estimator_

Best Hyperparameters: {'n_components': 1}


In [28]:
nmf_grid_search.cv_results_

{'mean_fit_time': array([0.0316462 , 0.02249198, 0.03770871, 0.04316406, 0.07689729,
        0.17301879, 0.17670169, 0.18681741, 0.31581106, 0.60307341,
        0.66342077, 0.32058043, 0.2849555 , 0.26315608, 0.36188607,
        0.28586683]),
 'std_fit_time': array([0.03339178, 0.0093233 , 0.01459391, 0.02131672, 0.02646758,
        0.05749673, 0.07567968, 0.04803336, 0.19535245, 0.16149356,
        0.14487382, 0.14740197, 0.07585051, 0.11290411, 0.1799658 ,
        0.05114206]),
 'mean_score_time': array([0.0004272 , 0.00010605, 0.00012407, 0.00012813, 0.00018039,
        0.00016875, 0.00017018, 0.00016689, 0.00018187, 0.00021458,
        0.00018687, 0.00017319, 0.00017424, 0.00018916, 0.00018997,
        0.00018401]),
 'std_score_time': array([6.01076614e-04, 1.70701043e-05, 1.92358734e-05, 3.12787003e-05,
        4.50679481e-05, 2.65176342e-05, 1.75728564e-05, 2.60734453e-05,
        2.51354250e-05, 1.57875591e-05, 7.43835400e-06, 2.10962588e-05,
        2.08462386e-05, 2.46200403e-

In [29]:
nmf_grid_search.cv_results_

# Print training and validation scores
for train_score, val_score, params in zip(nmf_grid_search.cv_results_['mean_train_score'], nmf_grid_search.cv_results_['mean_test_score'], nmf_grid_search.cv_results_['params']):
    print(f"Mean Train Score: {train_score:.4f}, Mean Validation Score: {val_score:.4f}, Parameters: {params}")

KeyError: 'mean_train_score'

In [38]:
nmf_features.shape

(435, 4)

In [39]:
features.shape

(435, 131)

In [46]:
features

Unnamed: 0,Simonsiella,Treponema,Campylobacter,Helicobacter,Paracoccus,Comamonas,Pseudomonas,Xanthomonas,Agrobacterium,Bradyrhizobium,...,Mageeibacillus,Hungatella,Pseudopropionibacterium,Peptoanaerobacter,Emergencia,Prevotellamassilia,Criibacterium,Fournierella,Negativibacillus,Duodenibacillus
155,0.0,0.010944,0.0,0.0,0.0,0.0,0.017489,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.107591,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
367,0.0,0.0,0.0,0.421787,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.023153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.206774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
pd.DataFrame(nmf_features)

Unnamed: 0,0,1,2,3
0,0.000000,0.057933,0.011393,0.155267
1,0.000051,0.203558,0.014724,0.035997
2,0.125428,0.006309,0.000000,0.000000
3,0.000000,0.073650,0.004637,0.099532
4,0.284119,0.000000,0.000000,0.000000
...,...,...,...,...
430,0.105157,0.000000,0.000000,0.011533
431,0.145236,0.000000,0.221684,0.000000
432,0.000000,0.184698,0.190928,0.000000
433,0.189743,0.004490,0.000000,0.000000


In [None]:
# Create a DataFrame with the NMF features and set the index to match 'features'
nmf_df = pd.DataFrame(nmf_features, columns=column_names, index=features.index)

In [48]:
features.index

Index([155, 414, 172, 367, 462, 150, 400, 168, 394, 292,
       ...
       480, 385, 131, 319,  51,  33,  15, 198, 211, 494],
      dtype='int64', length=435)

In [54]:
# Assuming 'features' is your original feature matrix
# Assuming 'seed_value' is the seed for reproducibility
n_components = 4
nmf_model = NMF(n_components=4, random_state=seed_value)
nmf_features = nmf_model.fit_transform(features)

# Create columns from 'NMF_1' to 'NMF_n'
column_names = [f'NMF_{i}' for i in range(1, n_components+1)]

# # Create a DataFrame with the NMF features and set the index to match 'features'
nmf_df = pd.DataFrame(nmf_features, columns=column_names, index=features.index)

# # Concatenate the new features with the original dataset
augmented_dataset = pd.concat([features, nmf_df], axis=1)
# nmf_df
# column_names

In [55]:
augmented_dataset

Unnamed: 0,Simonsiella,Treponema,Campylobacter,Helicobacter,Paracoccus,Comamonas,Pseudomonas,Xanthomonas,Agrobacterium,Bradyrhizobium,...,Emergencia,Prevotellamassilia,Criibacterium,Fournierella,Negativibacillus,Duodenibacillus,NMF_1,NMF_2,NMF_3,NMF_4
155,0.0,0.010944,0.0,0.0,0.0,0.0,0.017489,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.057933,0.011393,0.155267
414,0.0,0.0,0.0,0.107591,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000051,0.203558,0.014724,0.035997
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.125428,0.006309,0.000000,0.000000
367,0.0,0.0,0.0,0.421787,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.073650,0.004637,0.099532
462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.284119,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.105157,0.000000,0.000000,0.011533
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.145236,0.000000,0.221684,0.000000
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.184698,0.190928,0.000000
211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.189743,0.004490,0.000000,0.000000


In [28]:
def perform_feature_engineering_and_evaluate(dataset_dict, classes, cv_n_splits=5, n_jobs=12):
    report = {}
    for c in classes:
        print("Class: ", c)
        features = dataset_dict[c]["train"][0]
        target = dataset_dict[c]["train"][1]

        # Define the classifier
        rf = RandomForestClassifier()

        # Define the hyperparameters and their potential values for the grid search
        nmf_param_grid = {'n_components': [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64]}
        lda_param_grid = {'n_components': list(range(1, 5))}

        cv = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=seed_value)

        # Create the GridSearchCV object
        # Apply NMF with Grid Search
        nmf = NMF()
        nmf_grid_search = GridSearchCV(nmf, 
                                       nmf_param_grid, 
                                       scoring=make_scorer(balanced_accuracy_score),
                                       cv=5)
        nmf_grid_search.fit(features, target)
        best_nmf_model = nmf_grid_search.best_estimator_

        # Apply LDA with Grid Search
        lda = LatentDirichletAllocation()
        lda_grid_search = GridSearchCV(lda, lda_param_grid, cv=5)
        lda_grid_search.fit(features, target)
        best_lda_model = lda_grid_search.best_estimator_
        
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid,
            scoring=make_scorer(balanced_accuracy_score),  # Choose an appropriate metric for your problem
            cv=cv,
            n_jobs=-1  # Use all available processors
)

        # Perform the grid search
        grid_search.fit(features, target)

        # Print the best hyperparameters
        print("Best Hyperparameters:", grid_search.best_params_)

        # Get the best model
        best_model = grid_search.best_estimator_

        # Evaluate the best model on your test set
        # Assuming you have X_test and y_test from your data
        # Modify this based on your actual test set
        predictions = best_model.predict(dataset_dict[c]["test"][0])
        accuracy = balanced_accuracy_score(dataset_dict[c]["test"][1], predictions)
        print("Accuracy on Test Set:", accuracy)

        report[c] =  {"model": best_model,
                    "best_hyperparameters": grid_search.best_params_,
                    "accuracy_on_test": accuracy}
        
    return report



['HNSC', 'STAD', 'COAD', 'ESCA', 'READ']

In [18]:
import pandas as pd
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def apply_nmf_and_lda(datasetdict, nmf_param_grid=None, lda_param_grid=None):
    """
    Apply NMF and LDA for both dimensionality reduction and feature engineering using grid search.

    Parameters:
    - dataset: A pandas DataFrame containing the data.
    - nmf_param_grid: Parameter grid for NMF hyperparameter search.
    - lda_param_grid: Parameter grid for LDA hyperparameter search.

    Returns:
    - best_nmf_model: Best NMF model based on grid search.
    - best_lda_model: Best LDA model based on grid search.
    """

    report = {}

    for c in classes:
        print("Class: ", c)
        features = datasetdict[c]["train"][0]
        target = datasetdict[c]["train"][1]

        # Apply NMF with Grid Search
        nmf = NMF()
        nmf_grid_search = GridSearchCV(nmf, nmf_param_grid, cv=5)
        nmf_grid_search.fit(features)
        best_nmf_model = nmf_grid_search.best_estimator_

        # Apply LDA with Grid Search
        lda = LatentDirichletAllocation()
        lda_grid_search = GridSearchCV(lda, lda_param_grid, cv=5)
        lda_grid_search.fit(features)
        best_lda_model = lda_grid_search.best_estimator_

        
        
        
        
        return best_nmf_model, best_lda_model

    # Example usage:
    # Assuming 'your_numerical_dataset.csv' is your dataset file with numerical columns
    your_numerical_dataset = pd.read_csv('your_numerical_dataset.csv')

    # Define parameter grids for NMF and LDA
    nmf_param_grid = {'n_components': [4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64]}
    lda_param_grid = {'n_components': list(range(1, 5))}

# Applying NMF and LDA with Grid Search
best_nmf_model, best_lda_model = apply_nmf_and_lda(exp2_datasets, nmf_param_grid=nmf_param_grid, lda_param_grid=lda_param_grid)

# Use best_nmf_model and best_lda_model for further analysis or machine learning modeling


NameError: name 'nmf_param_grid' is not defined