In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.svm import SVR
from sklearn.linear_model import Lasso

In [35]:
import pandas as pd
import numpy as np

# --- Simulation settings -------------------------------------------------
# Every tunable value for this cell lives here, so the sample count, noise
# level and target range cannot drift out of sync with each other.
# NOTE(review): this is an absolute, machine-specific path; consider moving
# it to a configurable data directory so the notebook runs elsewhere.
ABUNDANCE_CSV = "/home/dermot.kelly/Dermot_analysis/Phd/Paper_2/rumen_microbiome_pipeline/exported/genus_relative_abundance.csv"
N_SAMPLES = 50              # number of simulated samples (rows)
NOISE_SD = 0.1              # standard deviation of the Gaussian noise added per genus
METHANE_RANGE = (15, 25)    # lower/upper bound of the synthetic methane target
SEED = 42                   # seed for NumPy's global random state

# Step 1: Read in original abundance data (first CSV column becomes the index)
genus_rel_abundance_original = pd.read_csv(ABUNDANCE_CSV, index_col=0)

# Step 2: Simulate N_SAMPLES noisy copies of the first row, clipped to [0, 1]
np.random.seed(SEED)
base_sample = genus_rel_abundance_original.iloc[0]

# Each draw adds independent Gaussian noise to every genus of the base row.
# NOTE: rows are clipped but not re-normalised afterwards, so a simulated
# row is not guaranteed to sum to 1.
samples = [
    np.clip(base_sample + np.random.normal(0, NOISE_SD, size=base_sample.shape[0]), 0, 1)
    for _ in range(N_SAMPLES)
]

# Step 3: Create DataFrame; labels are derived from the actual row count
genus_rel_abundance_sim = pd.DataFrame(samples, columns=genus_rel_abundance_original.columns)
genus_rel_abundance_sim.index = [
    f"Sample{i+1:02d}" for i in range(len(genus_rel_abundance_sim))
]

# Step 4: Add synthetic methane variable (e.g., 15–25 g/day)
# The target is drawn uniformly at random, independently of the abundance
# columns, so it carries no signal that a model could learn from the features.
genus_rel_abundance_sim["Methane_g_day"] = np.random.uniform(
    *METHANE_RANGE, size=len(genus_rel_abundance_sim)
)

genus_rel_abundance_sim.head()

Unnamed: 0,Acidaminococcus,Acinetobacter,Actinomyces,Alistipes,Allisonella,Anaerovibrio,Bacillus,Bacteroides,Bifidobacterium,Butyricimonas,...,Syntrophococcus,Treponema,Turicibacter,Unclassified,Veillonella,Weissella,[Eubacterium]_coprostanoligenes_group,[Ruminococcus]_gauvreauii_group,uncultured,Methane_g_day
Sample01,0.489671,0.0,0.124769,0.162303,0.0,0.006586,0.197921,0.376743,0.143053,0.064256,...,0.312408,0.0,0.0,1.0,0.1431,0.123128,0.0,0.099079,0.083126,21.619136
Sample02,0.537555,0.0,0.041434,0.0,0.0,0.111253,0.175624,0.292799,0.290353,0.046164,...,0.272555,0.0,0.007349,1.0,0.286324,0.010764,0.100155,0.126529,0.0,22.511771
Sample03,0.554282,0.085193,0.139103,0.0,0.160279,0.0,0.098686,0.519046,0.090946,0.0,...,0.255461,0.0,0.0,1.0,0.03229,0.064115,0.097669,0.212718,0.0513,22.538681
Sample04,0.585353,0.0,0.332017,0.072567,0.0,0.0,0.088247,0.277654,0.2614,0.057324,...,0.172226,0.0,0.07796,1.0,0.061646,0.034557,0.00484,0.344394,0.113392,20.778911
Sample05,0.237486,0.028645,0.0,0.095243,0.0,0.018526,0.090499,0.386576,0.06997,0.0,...,0.264821,0.068832,0.038099,1.0,0.019188,0.0,0.011064,0.21496,0.085702,24.043598


In [36]:
# Split features and target.
# The frame that carries the "Methane_g_day" column is `genus_rel_abundance_sim`
# (built in the simulation cell). The previous name `genus_rel_abundance` is not
# defined anywhere in the notebook and raises NameError on a fresh kernel.
x = genus_rel_abundance_sim.drop(columns="Methane_g_day")    # features: every column except the target
y = genus_rel_abundance_sim["Methane_g_day"]    # target: synthetic methane values

In [37]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# Candidate regressors, keyed by the label that appears in the results table.
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regression": SVR(),
    "Lasso Regression": Lasso(alpha=0.1),
}

# Fit every candidate on the training split, predict the held-out split, and
# record R² and RMSE (both rounded to three decimals) as one row per model.
results = []
for name, model in models.items():
    # scikit-learn's fit() returns the fitted estimator, so the calls chain.
    y_pred = model.fit(x_train, y_train).predict(x_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    row = {"Model": name}
    row["R²"] = round(r2, 3)
    row["RMSE"] = round(rmse, 3)
    results.append(row)

# One row per model, columns in insertion order: Model, R², RMSE.
results_df = pd.DataFrame(results)
print(results_df)

                       Model     R²   RMSE
0              Random Forest -0.213  2.898
1  Support Vector Regression -0.063  2.713
2           Lasso Regression -0.991  3.712
