In [1]:
import os
import re
import pandas as pd

def extract_details_from_filename(filename):
    """Parse sample metadata out of a filename.

    The filename is expected to contain a segment shaped like
    ``_<Genotype>-<Mouse>-<Biology>-<Cage>_`` where ``Mouse`` is ``m``
    followed by digits and ``Cage`` is ``FAD`` followed by digits.

    Args:
        filename: Name of the file to inspect.

    Returns:
        A ``(genotype, mouse, biology, cage)`` tuple of strings, or
        ``(None, None, None, None)`` when the filename does not contain
        the expected segment.
    """
    # The genotype token deliberately excludes "_" ([^\W_] is "\w minus
    # underscore"). With plain \w+, the leftmost-match rule makes the group
    # absorb any earlier underscore-separated fields of the filename
    # (e.g. "run_01_WT-..." would give "01_WT" instead of "WT").
    pattern = (
        r'_(?P<Genotype>[^\W_]+)'
        r'-(?P<Mouse>m\d+)'
        r'-(?P<Biology>\w+)'
        r'-(?P<Cage>FAD\d+)_'
    )
    match = re.search(pattern, filename)
    if match is None:
        return None, None, None, None
    return (
        match.group('Genotype'),
        match.group('Mouse'),
        match.group('Biology'),
        match.group('Cage'),
    )

def get_file_details(directory):
    """Collect parsed filename metadata for every ``.mzML`` file in a directory.

    Args:
        directory: Path of the directory whose entries are listed.

    Returns:
        A dict with the keys ``'Biology'``, ``'Genotype'``, ``'Cage'`` and
        ``'Mouse'``, each mapping to a list. The lists are index-aligned:
        position ``i`` in every list comes from the same file. Files whose
        names cannot be fully parsed contribute nothing.
    """
    collected = {'Biology': [], 'Genotype': [], 'Cage': [], 'Mouse': []}
    # Order in which extract_details_from_filename returns its fields.
    field_order = ('Genotype', 'Mouse', 'Biology', 'Cage')

    for entry in os.listdir(directory):
        # Skip anything that is not an mzML file.
        if not entry.endswith('.mzML'):
            continue

        parsed = dict(zip(field_order, extract_details_from_filename(entry)))
        # Skip files where any field is missing so the lists stay aligned.
        if not all(parsed.values()):
            continue

        for column, values in collected.items():
            values.append(parsed[column])

    return collected

def main():
    """Summarise the metadata found in the project directory and print it.

    Builds a DataFrame from the parsed file details, removes duplicate
    rows, prints a dict of the distinct values per column, then prints the
    list of group columns.
    """
    directory = 'Projects/AMP/AMP_OFF'  # Specify the directory here

    # One row per parsed file, then collapsed to the distinct combinations.
    details_frame = pd.DataFrame(get_file_details(directory))
    distinct_rows = details_frame.drop_duplicates().reset_index(drop=True)

    # Distinct values of each metadata column, in first-seen order.
    new_columns = {
        column: distinct_rows[column].unique().tolist()
        for column in ('Biology', 'Genotype', 'Cage', 'Mouse')
    }
    print(new_columns)

    # Columns to group by.
    group_columns = ['Lipid', 'Sample_ID', 'Biology', 'Genotype', 'Mouse', 'Cage']
    print("Group columns:", group_columns)

# Run the summary only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


{'Biology': ['cortex', 'dienc', 'cereb', 'hippo'], 'Genotype': ['5xFAD', 'WT'], 'Cage': ['FAD231', 'FAD259', 'FAD257', 'FAD263', 'FAD249', 'FAD246', 'FAD245'], 'Mouse': ['m2', 'm5', 'm1', 'm3', 'm4']}
Group columns: ['Lipid', 'Sample_ID', 'Biology', 'Genotype', 'Mouse', 'Cage']
