In [1]:
import pandas as pd
import numpy as np

In [2]:
#### Load in the dataset

# Comma-separated file whose first row holds the column names
input_path = "/home/dermot.kelly/Dermot_analysis/Phd/Paper_2/microbiome_ml/data/ct_full_growings_microbiome.csv"
micro_growing = pd.read_csv(input_path, sep=",", header=0)

# Preview the first rows as the cell's displayed output
micro_growing.head()

Unnamed: 0,VID,X0319.6G20,Abditibacterium,Absconditabacteriales_.SR1.,Acetanaerobacterium,Acetitomaculum,Acetobacteroides,Acinetobacter,Actinobacillus,Actinomyces,...,ct_IMF,ct_rumen,fat_kg,muscle_kg,bone_kg,gigot,EMA,INF,rumen,ct_date_diff
0,498,0.08,0,0.11,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,4.046,0.5,5.0,2.0,2.5,9.5,1.0,4.0,3.0
1,796,0.03,0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,...,1.9,5.21,1.5,6.5,2.5,3.5,11.0,2.0,5.0,3.0
2,847,0.0,0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,...,1.2,6.045,1.0,7.0,2.5,4.0,11.0,1.0,6.0,2.0
3,875,0.0,0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,...,1.4,5.692,1.5,6.5,2.5,3.5,10.0,1.5,5.5,3.0
4,880,0.14,0,0.13,0.0,0.0,0.0,0.02,0.0,0.0,...,1.6,6.018,1.5,6.5,2.5,3.5,9.5,1.5,6.0,2.0


In [3]:
## List every column with its positional index so the block of
## microbial genus columns can be located by position
column_names = micro_growing.columns
for position in range(len(column_names)):
    print(f"{position}: {column_names[position]}")

0: VID
1: X0319.6G20
2: Abditibacterium
3: Absconditabacteriales_.SR1.
4: Acetanaerobacterium
5: Acetitomaculum
6: Acetobacteroides
7: Acinetobacter
8: Actinobacillus
9: Actinomyces
10: Agathobacter
11: Agromyces
12: Akkermansia
13: Aliterella
14: Alkalibacter
15: Allisonella
16: Alloprevotella
17: Allorhizobium.Neorhizobium.Pararhizobium.Rhizobium
18: Alysiella
19: Amnipila
20: Anaerobiospirillum
21: Anaerofustis
22: Anaeroplasma
23: Anaerostipes
24: Anaerovibrio
25: Anaerovorax
26: Asteroleplasma
27: Atopobium
28: Bacillus
29: Bacteroidales_BS11_gut_group
30: Bacteroidales_RF16_group
31: Bacteroidales_UCG.001
32: Bacteroides
33: Bacteroidetes_BD2.2
34: Barnesiella
35: Bdellovibrio
36: Bergeyella
37: Bibersteinia
38: Blastocatellaceae
39: Blautia
40: Bosea
41: Bradymonadales
42: Brevundimonas
43: Buchnera
44: Buttiauxella
45: Butyrivibrio
46: CAG.352
47: COB_P4.1_termite_group
48: CPla.4_termite_group
49: Campylobacter
50: Candidatus_Methanomethylophilus
51: Candidatus_Saccharimonas
5

In [4]:
#### Filtering the dataset #####

# Microbial genus abundances occupy positional columns 1-309 (column 0 is VID)
genus_cols = micro_growing.columns[1:310]
abund = micro_growing[genus_cols].copy()

# Carry the animal identifier along, selected by label so the lookup does not
# silently pick up a different column if the file layout changes
abund["ANI_ID"] = micro_growing["ANI_ID"].values

# ------------------------------------------------------------------
# 3.  PREVALENCE FILTER  (keep genera that are non-zero in >= 10 % of samples)
# ------------------------------------------------------------------
prev = (abund[genus_cols] > 0).mean()          # fraction of samples with a non-zero value, per genus
keep = prev[prev >= 0.10].index.tolist()

print(f"Kept {len(keep)} genera out of {len(genus_cols)} (≥10 % prevalence)")


Kept 158 genera out of 309 (≥10 % prevalence)


In [5]:
# Show the first/last genus column labels to confirm the positional slice boundaries
print(genus_cols)

Index(['X0319.6G20', 'Abditibacterium', 'Absconditabacteriales_.SR1.',
       'Acetanaerobacterium', 'Acetitomaculum', 'Acetobacteroides',
       'Acinetobacter', 'Actinobacillus', 'Actinomyces', 'Agathobacter',
       ...
       'X.Eubacterium._xylanophilum_group', 'X.Ruminococcus._gauvreauii_group',
       'X.Ruminococcus._gnavus_group', 'hoa5.07d05_gut_group', 'p.251.o5',
       'p.2534.18B5_gut_group', 'possible_genus_Sk018', 'probable_genus_10',
       'uncultured', 'vadinBE97'],
      dtype='object', length=309)


In [6]:
# ------------------------------------------------------------------
# 4. CLR TRANSFORM with small pseudocount (replaced the plain log transform on 15/08)
# ------------------------------------------------------------------
pseudocount = 1e-6

# Only the prevalence-filtered genera are transformed; the pseudocount keeps
# log() finite for zero abundances
X = abund[keep].astype(float) + pseudocount
logX = np.log(X)
row_means = logX.mean(axis=1)            # per-sample mean of logs = log of the geometric mean
clrX = logX.sub(row_means, axis=0)       # CLR: centre each sample (row) on its own log-mean

# Final table: CLR-transformed genera followed by ANI_ID
abund_clr = clrX.copy()
abund_clr["ANI_ID"] = abund["ANI_ID"]

# Sanity check (displayed as the cell output): every CLR row sums to ~0
np.allclose(abund_clr[keep].sum(axis=1).values, 0.0, atol=1e-8)

True

In [7]:
# Replace the raw genus abundances with the prevalence-filtered CLR values

# Drop the raw genus columns (positional columns 1-309)
micro_clean = micro_growing.drop(columns=micro_growing.columns[1:310])

# Force ANI_ID to the same type in both DataFrames so the join keys compare equal
micro_clean["ANI_ID"] = micro_clean["ANI_ID"].astype(str)
abund_clr["ANI_ID"]   = abund_clr["ANI_ID"].astype(str)

# Merge on ANI_ID. validate="one_to_one" raises MergeError if an ANI_ID is
# duplicated on either side, instead of silently multiplying rows.
micro_final = micro_clean.merge(abund_clr, on="ANI_ID", validate="one_to_one")

micro_final.head()

Unnamed: 0,VID,EID,micro_date,ANI_ID,ch4_g_day2_1v3,source,date,animal_type,diet_type,hr_off_feed,...,X.Eubacterium._ruminantium_group,X.Eubacterium._ventriosum_group,X.Eubacterium._xylanophilum_group,X.Ruminococcus._gauvreauii_group,X.Ruminococcus._gnavus_group,hoa5.07d05_gut_group,p.251.o5,probable_genus_10,uncultured,vadinBE97
0,498,372043357200498,2023-09-08,1092082,9.985896,growings_match,2023-09-08,Hill Lamb,Grass,2,...,3.033856,3.439304,4.132435,-6.869682,-6.869682,-6.869682,-6.869682,-6.869682,7.942778,4.738563
1,796,372043357200796,2023-09-08,1090424,8.123172,growings_match,2023-09-08,Hill Lamb,Grass,3,...,5.335184,-6.966204,-6.966204,-6.966204,-6.966204,-6.966204,-6.966204,-6.966204,7.610113,3.630456
2,847,372043357200847,2023-09-05,1090450,11.273459,growings_match,2023-09-05,Hill Lamb,Grass,2,...,3.740465,3.047335,-7.261651,-7.261651,-7.261651,3.558147,-7.261651,-7.261651,8.378409,4.251284
3,875,372043357200875,2023-09-08,1098931,10.125939,growings_match,2023-09-08,Hill Lamb,Grass,3,...,5.101412,-6.411524,-6.411524,-6.411524,-6.411524,-6.411524,-6.411524,-6.411524,8.648142,2.798917
4,880,372045803100880,2023-09-08,1099728,11.976333,growings_match,2023-09-08,Hill Lamb,Grass,1,...,5.16549,-6.347445,3.961541,-6.347445,-6.347445,4.472353,3.556092,-6.347445,8.985388,6.473816


In [8]:
# Report (rows, columns) of the merged frame
print(micro_final.shape)

(357, 501)


In [9]:
# Save the merged frame (remaining original columns + CLR-transformed genera) as CSV
# Define output path
output_path = "/home/dermot.kelly/Dermot_analysis/Phd/Paper_2/microbiome_ml/data/CLR_micro.csv"

# Save without the index column
micro_final.to_csv(output_path, index=False)

print(f"Saved to: {output_path}")


Saved to: /home/dermot.kelly/Dermot_analysis/Phd/Paper_2/microbiome_ml/data/CLR_micro.csv


In [10]:
# Also save abund_clr (CLR-transformed genera + ANI_ID only), as it will be useful as the matrix for the LMM

# NOTE: this rebinds output_path, which previously pointed at CLR_micro.csv
output_path = "/home/dermot.kelly/Dermot_analysis/Phd/Paper_2/microbiome_ml/data/CLR_counts_only.csv"

# Save without the index column
abund_clr.to_csv(output_path, index=False)

print(f"Saved to: {output_path}")

Saved to: /home/dermot.kelly/Dermot_analysis/Phd/Paper_2/microbiome_ml/data/CLR_counts_only.csv
