# CLAW Lipidomics Analysis Tutorial

# Import CLAW Module

In [5]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

import scripts.CLAW as CLAW
import scripts.CLAW_preedgeR as preedger
from scripts.CLAW_pathwayanalysis import build_group_matrix

# Run CLAW MRM Pipeline

In [6]:


# The pipeline call needs four locations/names plus a switch for saving.
matched_df = CLAW.run_full_analysis(
    database_path='lipid_database/Lipid_Database.xlsx',   # lipid database workbook
    data_folder='projects/lipid_load/mzml/',              # folder holding the mzML files
    results_folder='projects/lipid_load/results/',        # folder the CSV is saved to
    output_filename='12012025_tutorial',                  # name of the output file
    save_data=True,                                       # enable saving
)

Loading MRM database...
Loaded 3256 transitions

Parsing mzML files...
Parsed: projects/lipid_load/mzml/CAR_Blank_Blank_Blank_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD199_m1_66_B_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD199_m1_66_D_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD199_m3_66_B_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD199_m3_66_D_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD201_m5_61_B_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD201_m5_61_D_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD261_m3_81_A_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD261_m3_81_C_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD265_m2_74_A_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CAR_FAD265_m2_74_C_N1_loading_size_2_13_25.mzML
Parsed: projects/lipid_load/mzml/CA

# 📊 PreEdgeR and Pathway Analysis

## Overview
These modules format your data for **EdgeR** and **pathway enrichment** analysis.

## Requirements

### Experimental Design
- **Minimum**: 3 biological replicates per group
- **Recommended**: 5+ biological replicates per group for increased statistical power
- **Groups**: At least 2 groups (e.g., Treatment vs Control, Disease vs Healthy)

> ⚠️ **Important**: More replicates = more robust statistical results and better detection of true differences

### Data Format
Your data must include:
- **Labels file** (`labels.csv`): Sample metadata with group assignments
  - Example columns: `Sample_ID`, `Genotype` (or other grouping factor)
- **Intensity file**: Lipid measurements across samples
  - Must contain: `Lipid`, `Sample_ID`, `Intensity` columns

### Example Group Comparison
```
Group A (Control): WT samples      → 3-8 replicates
Group B (Treatment): 5XFAD samples → 3-8 replicates
```

# PreEdgeR

In [7]:
# Load the results CSV and the sample labels
data_df = pd.read_csv("projects/lipid_load/results/12012025_tutorial.csv")
labels_df = pd.read_csv("projects/lipid_load/labels/labels.csv")

# Build the pre-EdgeR matrix for the Genotype factor (group1 = 5XFAD, group2 = WT)
preedger_df = preedger.build_preedger_matrix(
    data_df=data_df,
    labels_df=labels_df,
    sample_col="Sample Name",
    factor_col="Genotype",
    group1="5XFAD",
    group2="WT"
)

# Save output. to_csv does not create missing folders (it raises OSError),
# so make sure the destination directory exists first.
preedger_out = Path("projects/lipid_load/pre_edger/Genotype_5XFAD_vs_WT_preedger.csv")
preedger_out.parent.mkdir(parents=True, exist_ok=True)
preedger_df.to_csv(preedger_out, index=False)

# Or just view it
preedger_df

  Group1 (5XFAD) samples: ['FAD265_m3_59_B', 'FAD199_m1_66_B', 'FAD199_m3_66_B', 'FAD201_m5_61_B', 'FAD265_m3_59_D', 'FAD199_m1_66_D', 'FAD199_m3_66_D', 'FAD201_m5_61_D']
  Group2 (WT) samples: ['FAD265_m2_74_A', 'FAD261_m3_81_A', 'FAD309_55_A', 'FAD325_39_A', 'FAD265_m2_74_C', 'FAD261_m3_81_C', 'FAD309_55_C', 'FAD325_39_C']
  Blank sample: Blank_Blank_Blank


matched_sample,Lipid,Class,FAD265_m3_59_B,FAD199_m1_66_B,FAD199_m3_66_B,FAD201_m5_61_B,FAD265_m3_59_D,FAD199_m1_66_D,FAD199_m3_66_D,FAD201_m5_61_D,...,FAD261_m3_81_C,FAD309_55_C,FAD325_39_C,Blank_Blank_Blank,Title1,Title2,Title,length1,length2,Blank_name
0,CAR,CAR,1965.080132,1892.300140,3503.540237,1950.960148,3088.500217,2822.600189,2178.240150,3656.060230,...,2400.740185,2699.160175,1963.960133,1858.720119,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
1,CAR(10:0),CAR,1936.740135,2107.400150,4046.240280,1887.420124,3012.440216,2424.460182,2052.880146,3898.300270,...,2574.920193,2616.260170,1954.320145,2661.900169,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
2,CAR(10:0)_QUAL,CAR,1995.500153,2382.660168,5342.220375,2593.640186,3706.280247,3308.240238,2574.680164,4765.580360,...,3252.360233,3535.200275,2258.740166,2898.900215,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
3,CAR(10:1),CAR,2039.460133,1795.640125,4520.740330,2224.580154,3514.520233,3281.220219,2258.860161,3818.060295,...,2610.260185,2822.220192,2092.460155,2162.000149,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
4,CAR(10:1)_QUAL,CAR,2242.840164,2163.660141,4756.640293,2552.840199,3681.680267,3253.720242,2809.120193,5163.160358,...,3015.320198,3776.580227,2012.940140,3336.540257,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,[TG(66:7)]_FA16:1,TAG,1092.500084,1092.580074,1097.780079,1094.120075,1096.320072,1094.620075,1089.180073,1088.320076,...,1092.780087,1093.560081,1132.080067,1089.460068,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
641,"[TG(66:8),TG(65:1)]_FA16:0",TAG,1005.940067,1007.980080,1007.180069,1004.220074,1005.020069,1006.920067,1007.340073,1006.600063,...,999.560074,1002.320076,1017.520061,1011.700077,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
642,"[TG(66:8),TG(65:1)]_FA16:1",TAG,1050.260075,1050.420071,1055.840080,1051.900070,1054.480068,1083.220070,1046.560066,1046.040073,...,1049.840069,1052.700073,1049.340076,1047.240070,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank
643,"[TG(66:9),TG(65:2)]_FA16:0",TAG,1006.160072,1008.400074,1006.880066,1004.080070,1004.700062,1043.240074,1039.240070,1035.780071,...,1000.160076,1003.160072,1003.220070,1011.660072,Group: 5XFAD,Group: WT,Group: 5XFAD vs Group: WT,8,8,Blank_Blank_Blank


# Pathway Analysis Formatting

In [8]:
# Sample labels (group assignments)
labels_file = "projects/lipid_load/labels/labels.csv"
labels_df = pd.read_csv(labels_file)

# Intensity table saved by the pipeline cell
data_file = "projects/lipid_load/results/12012025_tutorial.csv"
int_df = pd.read_csv(data_file)

def extract_sample_name(sid):
    """Extract the sample name embedded in a raw sample identifier.

    The identifier is expected to look like ``<UPPERCASE>_<sample>_N<digits>_...``;
    the ``<sample>`` part is returned. For example, the string
    ``"CAR_FAD199_m1_66_B_N1_loading_size_2_13_25.mzML"`` yields
    ``"FAD199_m1_66_B"``.

    Parameters
    ----------
    sid : str
        Raw identifier. Non-string values (e.g. NaN from an empty CSV cell)
        are tolerated and passed through untouched.

    Returns
    -------
    The captured sample name when the pattern matches; otherwise ``sid``
    unchanged.
    """
    # re.match raises TypeError on non-strings; pd.read_csv represents empty
    # cells as float NaN, so pass such values through instead of crashing.
    if not isinstance(sid, str):
        return sid
    match = re.match(r'^[A-Z]+_(.+?)_N\d+_', sid)
    return match.group(1) if match else sid

# Derive the "Sample Name" key from each Sample_ID so it can be used as sample_col below
int_df["Sample Name"] = int_df["Sample_ID"].apply(extract_sample_name)

# Build the group matrix for the Genotype factor (group1 = 5XFAD, group2 = WT)
group_df = build_group_matrix(
    labels_df=labels_df,
    intensities_df=int_df,
    sample_col="Sample Name",
    factor_col="Genotype",
    group1="5XFAD",
    group2="WT",
    lipid_col="Lipid",
    intensity_col="Intensity",
    group1_col_name="Group_B",
    # NOTE(review): "Group__A" has a double underscore while group1 uses "Group_B";
    # confirm whether downstream tooling expects this exact name before changing it.
    group2_col_name="Group__A",
)

# to_csv does not create missing folders (it raises OSError),
# so make sure the destination directory exists before saving.
group_out = Path("projects/lipid_load/pathway_analysis/Genotype_5XFAD_vs_WT_GroupMatrix.csv")
group_out.parent.mkdir(parents=True, exist_ok=True)
group_df.to_csv(group_out, index=False)
group_df

Unnamed: 0,Lipid,Group_B,Group_B.1,Group_B.2,Group_B.3,Group_B.4,Group_B.5,Group_B.6,Group_B.7,Group__A,Group__A.1,Group__A.2,Group__A.3,Group__A.4,Group__A.5,Group__A.6,Group__A.7
0,CAR,1.965080e+03,1.892300e+03,3.503540e+03,1.950960e+03,3.088500e+03,2.822600e+03,2.178240e+03,3.656060e+03,2.958800e+03,2.004820e+03,2742.140182,3.412620e+03,2.317040e+03,2.400740e+03,2699.160175,1.963960e+03
1,CAR(10:0),1.936740e+03,2.107400e+03,4.046240e+03,1.887420e+03,3.012440e+03,2.424460e+03,2.052880e+03,3.898300e+03,3.377340e+03,1.994220e+03,3290.200222,3.180340e+03,2.155460e+03,2.574920e+03,2616.260170,1.954320e+03
2,CAR(10:0)_QUAL,1.995500e+03,2.382660e+03,5.342220e+03,2.593640e+03,3.706280e+03,3.308240e+03,2.574680e+03,4.765580e+03,4.588060e+03,2.130540e+03,4821.200333,5.206300e+03,2.701720e+03,3.252360e+03,3535.200275,2.258740e+03
3,CAR(10:1),2.039460e+03,1.795640e+03,4.520740e+03,2.224580e+03,3.514520e+03,3.281220e+03,2.258860e+03,3.818060e+03,2.971320e+03,2.141700e+03,3747.080238,3.518900e+03,3.164260e+03,2.610260e+03,2822.220192,2.092460e+03
4,CAR(10:1)_QUAL,2.242840e+03,2.163660e+03,4.756640e+03,2.552840e+03,3.681680e+03,3.253720e+03,2.809120e+03,5.163160e+03,4.871080e+03,2.558920e+03,4702.100346,4.794360e+03,2.605080e+03,3.015320e+03,3776.580227,2.012940e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,"PC(O-13:1),LPC(P-14:0)",1.116860e+03,1.195160e+03,1.245020e+03,1.074840e+03,1.019660e+03,1.225480e+03,1.150400e+03,1.035760e+03,1.082700e+03,1.042740e+03,1072.340080,1.075840e+03,1.136860e+03,1.181880e+03,1002.600067,1.004780e+03
292,"PC(O-38:8),PC(36:1),PC(O-37:1),PC(P-37:0)",3.374009e+05,3.992548e+05,4.743829e+05,6.162900e+05,4.799063e+05,4.953147e+05,7.574985e+05,4.587679e+05,3.266732e+05,2.311841e+05,195178.455147,2.339695e+05,6.095674e+05,5.541692e+05,185572.615257,2.578481e+05
293,"PC(O-38:9),PC(36:2),PC(O-37:2),PC(P-37:1)",1.850748e+06,2.036451e+06,2.499762e+06,3.197355e+06,2.788119e+06,2.523777e+06,4.021934e+06,2.410971e+06,1.562019e+06,1.155728e+06,857714.721817,1.211875e+06,3.011284e+06,2.863088e+06,868259.554024,1.232604e+06
294,"PC(O-40:9),PC(38:2),PC(P-39:1)",1.467479e+05,3.227118e+05,3.720170e+05,2.849273e+05,2.301117e+05,4.067000e+05,3.313217e+05,2.540254e+05,2.565545e+05,1.102785e+05,66417.484470,7.917984e+04,2.858770e+05,2.930462e+05,62924.065308,8.810536e+04
