In [None]:
import pandas as pd
from dataclasses import dataclass
from library_utils import read_mgf
from pathlib import Path
import pandas_utils as pu

- Filter single spectra and group by collision energy
- Filter single spectra and group by MS level
- Group by MS level (MS2, MS3, MS4, MS5)
- Group by spectrum type (all energies, same energy, pseudo MS2, best)
- Group by adduct
- Group by chimeric quality (`quality_chimeric`, e.g. passed)
- 
The result should be a DataFrame of statistics with one row per MGF library file.

In [None]:
# Folder that holds the MGF library files
directory = r"C:\git\msn_library\library"
# Glob pattern selecting which files in that folder are loaded; adjust as needed
pattern = "*MSn.mgf"

@dataclass
class Library:
    """One loaded library: the name of its source file and its data table."""

    # Name of the file the table was read from
    file: str
    # Table holding the content of that file
    df: pd.DataFrame

libraries = []
# Load every file matching the pattern.
# The matches are sorted because Path.glob() yields them in an arbitrary,
# OS-dependent order, while both `libraries[0]` and the row order of the
# result table depend on the order in which the files are loaded.
for file_path in sorted(Path(directory).glob(pattern)):
    print(f'Processing file: {file_path}')

    # `df` deliberately stays a notebook-level name: the next cell displays it
    df = read_mgf(file_path)
    libraries.append(Library(file_path.name, df))

In [None]:
# Display the frame left over from the loading loop, i.e. the library that was read last
df

In [None]:
# Take the first loaded library as a sample frame for ad-hoc inspection
test = libraries[0].df
test

In [None]:
# Mean of the quality_explained_intensity column of the sample library
test["quality_explained_intensity"].mean()

In [None]:
def count_values_to_one_row(df: pd.DataFrame, columns, prefix=None) -> dict:
    """Count rows per distinct value (combination) and flatten the counts into one dict.

    Args:
        df: Table to count rows in.
        columns: A column name, or a list of column names whose value
            combinations are counted.
        prefix: Optional string put in front of every generated key.

    Returns:
        A dict mapping ``"<col>_<value>"`` (several columns joined with ``":"``,
        e.g. ``"ionmode_positive:mslevel_2"``) to the integer number of rows
        with that value combination. Rows with a missing value in one of the
        grouping columns are not counted (pandas' default ``groupby`` behaviour).

    Raises:
        KeyError: If one of ``columns`` is not a column of ``df``.
    """
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    # Iterate over the group sizes directly instead of using iterrows():
    # iterrows() casts each row to one common dtype, which turned an integer key
    # next to a float key into "2.0" in the header and could turn counts into floats.
    group_sizes = df.groupby(columns).size()

    counts = {}
    for key, count in group_sizes.items():
        # With a single grouping column the index holds scalars, otherwise tuples
        values = (key,) if len(columns) == 1 else key
        header = ":".join(f"{col}_{value}" for col, value in zip(columns, values))
        if prefix is not None:
            header = prefix + header
        counts[header] = int(count)
    return counts

def average_values_to_one_row(df: pd.DataFrame, columns, prefix=None) -> dict:
    """Average the given columns and return the means as one flat dict.

    Args:
        df: Table to read the values from.
        columns: A column name, or a list of column names to average.
        prefix: Optional string put in front of every key.

    Returns:
        A dict mapping each column name (with ``prefix`` prepended when given)
        to the mean of that column. The function previously returned the raw
        Series despite its ``-> dict`` annotation and ignored ``prefix``;
        without a prefix the keys equal the labels of that Series.

    Raises:
        KeyError: If one of ``columns`` is not a column of ``df``.
    """
    if isinstance(columns, str):
        columns = [columns]
    means = df[columns].mean()
    if prefix is None:
        return means.to_dict()
    return {f"{prefix}{column}": value for column, value in means.items()}


def extract_row(library: Library) -> dict:
    """Build the statistics row of one library as a flat dict.

    The row starts with the library's file name under ``"file"``. It is then
    extended with value counts taken from the ``SINGLE_BEST_SCAN`` spectra
    (keys prefixed with ``best_scan_``), followed by value counts taken from
    all rows of the library.
    """
    spectra = library.df
    best_scans = spectra[spectra["spectype"] == "SINGLE_BEST_SCAN"]

    row = {"file": library.file}

    # Counts restricted to the best-scan spectra
    for grouping in ("collision_energy", "mslevel", ["ionmode", "mslevel"]):
        row |= count_values_to_one_row(best_scans, grouping, prefix="best_scan_")

    # Counts over every row of the library
    for column in ("mslevel", "spectype", "adduct", "quality_chimeric", "other_matched_compounds"):
        row |= count_values_to_one_row(spectra, column)

    return row

In [None]:
# One statistics dict per loaded library
results = [extract_row(library) for library in libraries]

# A category missing from a library becomes 0; all counts are stored as integers
df = (
    pd.DataFrame(results)
    .set_index("file")
    .fillna(0)
    .astype(int)
)
# Add a totals row across all libraries, then turn the file names back into a column
df.loc["Summary"] = df.sum(numeric_only=True)
df = df.reset_index()

df

In [None]:
# Write the per-library summary table (including its "Summary" totals row) to disk
# via the project helper; presumably tab-separated given the .tsv extension — confirm in pandas_utils
pu.save_dataframe(df, r"C:\git\msn_library\library\lib_results\20241003_7libraries_results.tsv")

In [None]:
# Keep only the best-scan-per-MS-level counts and the spectrum-type counts.
# Both prefixes sit inside one group so that ^ anchors each of them; in
# '^(a)|(b)' the anchor applies to the first alternative only.
# assign() adds the file column on a fresh copy instead of writing into the
# filter() result, which avoids pandas' SettingWithCopyWarning.
filtered_df = df.filter(regex=r'^(best_scan_mslevel|spectype)').assign(file=df["file"])
filtered_df

In [None]:
# Write the reduced table (selected count columns plus the file column) to disk
# via the project helper; presumably tab-separated given the .tsv extension — confirm in pandas_utils
pu.save_dataframe(filtered_df, r"C:\git\msn_library\library\lib_results\20241003_7libraries_results_filtered.tsv")

In [None]:

# Count the rows of a raw library table per (mslevel, spectype) combination.
# This must run on a library frame: in top-to-bottom order `df` is the summary
# table at this point, whose columns are count headers such as "mslevel_<value>",
# so grouping it by 'mslevel' / 'spectype' raises a KeyError.
# NOTE(review): the first library is used, like the `test` sample cell — confirm
# which library (or a concatenation of all of them) is wanted here.
combination_counts = (
    libraries[0].df
    .groupby(['mslevel', 'spectype'])
    .size()
    .reset_index(name='count')
)
combination_counts

In [None]:

# Reshape the counts to a wide layout with one column per (mslevel, spectype) pair;
# combinations without a value are filled with 0
pivot_table = combination_counts.pivot_table(
    values='count',
    columns=['mslevel', 'spectype'],
    index=None,
    fill_value=0,
)
# Collapse the two column levels into plain "<mslevel>_<spectype>" labels
pivot_table.columns = [f'{mslevel}_{spectype}' for mslevel, spectype in pivot_table.columns]
pivot_table