# Combining `.csv` Files: Overall Annotations Data

In [1]:
import pandas as pd
import os
from glob import glob
from pathlib import Path
import numpy as np

cwd = Path.cwd()
print(cwd)
annotations_dir = cwd / "Annotations"

# Every later cell reads and writes files relative to the "Annotations" folder,
# so make it the working directory unless we are already inside it.
if cwd.name == "Annotations":
    print("Already in Annotations directory.")
elif annotations_dir.exists():
    os.chdir(annotations_dir)
    print(f"Changed working directory to: {annotations_dir}")
else:
    raise FileNotFoundError(f"'Annotations' directory not found at: {annotations_dir}")

/home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis
Changed working directory to: /home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis/Annotations


In [2]:
def preprocess(df, features_transform, log=True):
    """
    TIC-normalise the selected feature columns and optionally log1p-transform them.

    Each row of the selected columns is divided by that row's sum (its total ion
    current). Rows whose sum is exactly zero are divided by 1e-10 instead, so
    no division by zero occurs.

    Parameters:
    - df (pd.DataFrame): Input data with features and metadata. Not modified.
    - features_transform (list of str): Names of the columns to transform.
    - log (bool): If True, applies log(1 + x) after normalisation.

    Returns:
    - pd.DataFrame: Copy of `df` with the selected columns replaced by their
      transformed values; all other columns are left as they were.

    Raises:
    - ValueError: If the log-transformed values contain NaN or +/-Inf.
    """
    result = df.copy()

    # Work on a plain float matrix: one row per spectrum, one column per feature
    intensities = result[features_transform].values.astype(float)

    # Per-row total, kept 2-D so it broadcasts across the columns
    row_totals = intensities.sum(axis=1, keepdims=True)
    safe_totals = np.where(row_totals == 0, 1e-10, row_totals)  # avoid division by zero
    scaled = intensities / safe_totals

    if not log:
        transformed = scaled
    else:
        transformed = np.log1p(scaled)
        if not np.isfinite(transformed).all():
            raise ValueError("NaNs or Infs detected after log normalisation.")

    # Write the transformed block back, aligned on the original index
    result[features_transform] = pd.DataFrame(
        transformed, columns=features_transform, index=result.index
    )
    return result

In [3]:
def compile_csvs(csv_paths, apply_preprocess, log_transform, savename,
                 tag_sample=True, return_df=False, verbose=True):
    """
    Read several per-sample CSVs, optionally preprocess each, and stack them into one table.

    Files whose stem starts with "combined_" are treated as outputs of an earlier
    run and are skipped. No other filtering is done here.

    Parameters:
    - csv_paths (iterable of Path or str): CSV files to combine.
    - apply_preprocess (bool): If True, each file is passed through `preprocess`
      before concatenation, using every non-metadata column as a feature.
    - log_transform (bool): Forwarded to `preprocess` as its `log` argument.
    - savename (str or None): If truthy, the combined table is written to
      `Path.cwd() / savename` without the index.
    - tag_sample (bool): If True, adds a "sample_id" column holding the file stem.
    - return_df (bool): If True, the combined DataFrame is returned.
    - verbose (bool): If True, prints per-file and overall summaries.

    Returns:
    - pd.DataFrame when return_df is True, otherwise None. Metadata columns
      ("sample_id", "ID", "Label", whichever exist) come first, then the features.

    Raises:
    - FileNotFoundError: If `csv_paths` is empty, or nothing is left after
      skipping "combined_*" files.
    """
    if not csv_paths:
        raise FileNotFoundError("No non-imputed CSVs found in Annotations.")

    # Accept plain strings as well as Path objects, and drop already-combined CSVs
    # up front so an all-skipped input fails with a clear message instead of
    # pandas' "No objects to concatenate".
    source_paths = [Path(p) for p in csv_paths]
    source_paths = [p for p in source_paths if not p.stem.startswith("combined_")]
    if not source_paths:
        raise FileNotFoundError(
            "No source CSVs left to combine after skipping 'combined_*' files."
        )

    df_list = []
    for p in source_paths:
        df = pd.read_csv(p, low_memory=False)

        if verbose:
            print(f"File: {p.stem}")
            print(f"Total NaNs in dataframe: {df.isna().sum().sum()}")
            # "Label" is optional everywhere else in this function, so the
            # summary must not assume it exists either.
            if "Label" in df.columns:
                unique_labels = df["Label"].unique()
                label_counts = df["Label"].value_counts()
                print(f"Unique labels: {unique_labels} with counts: {label_counts}\n")
            else:
                print("No 'Label' column in this file.\n")

        if tag_sample:
            df["sample_id"] = p.stem

        # Define meta columns present in this file; everything else is a feature
        meta_cols = [col for col in ["sample_id", "ID", "Label"] if col in df.columns]
        features_transform = [c for c in df.columns if c not in meta_cols]

        if apply_preprocess:
            df = preprocess(df, features_transform=features_transform, log=log_transform)

        df_list.append(df)

    combined_df = pd.concat(df_list, ignore_index=True)

    # Final meta/features ordering
    meta_cols_final = [col for col in ["sample_id", "ID", "Label"] if col in combined_df.columns]
    features = [c for c in combined_df.columns if c not in meta_cols_final]
    combined_df = combined_df[meta_cols_final + features]

    if verbose:
        if "Label" in combined_df.columns:
            print(f"Unique labels in the combined dataset: {combined_df['Label'].unique()}")
        print(f"Loaded {len(df_list)} CSVs, total rows = {combined_df.shape[0]}\n")

    if savename:
        combined_df.to_csv(Path.cwd() / savename, index=False)

    if return_df:
        return combined_df

In [4]:
# Point to Annotations folder
base_dir = Path.cwd()

# Collect every CSV in the folder. Path.glob yields entries in arbitrary
# (filesystem-dependent) order, so sort by name to make the row order of the
# combined files reproducible across machines and re-runs.
# NOTE(review): nothing is filtered here, so files starting with 'imputed' ARE
# included (the recorded output shows 'imputed_MK478_CSV' being loaded). Confirm
# that this is intended. 'combined_*' outputs are skipped inside compile_csvs.
csv_paths = sorted(base_dir.glob("*.csv"))

# One entry per output file: (apply_preprocess, log_transform, savename)
compilation_settings = [(False, False, 'combined_annotations.csv'),
                        (True, False, 'combined_annotationsTIC.csv'),
                        (True, True, 'combined_annotationsTIC&Log.csv')]

for compilation_setting in compilation_settings:
    apply_preprocess, log_transform, savename = compilation_setting
    compile_csvs(csv_paths, apply_preprocess, log_transform, savename)

File: MK484_CSV
Total NaNs in dataframe: 0
Unique labels: ['CIN1' 'NORMAL'] with counts: Label
CIN1      196
NORMAL     85
Name: count, dtype: int64

File: MK534_CSV
Total NaNs in dataframe: 0
Unique labels: ['CGIN' 'NORMAL'] with counts: Label
CGIN      73
NORMAL    33
Name: count, dtype: int64

File: MK483_CSV
Total NaNs in dataframe: 0
Unique labels: ['CIN1' 'Normal' 'normal'] with counts: Label
CIN1      460
Normal    196
normal    171
Name: count, dtype: int64

File: MK503_CSV
Total NaNs in dataframe: 0
Unique labels: ['CIN2' 'NORMAL'] with counts: Label
CIN2      164
NORMAL     27
Name: count, dtype: int64

File: imputed_MK478_CSV
Total NaNs in dataframe: 0
Unique labels: ['CIN2'] with counts: Label
CIN2    167
Name: count, dtype: int64

File: MK531_CSV
Total NaNs in dataframe: 0
Unique labels: ['CIN1' 'CIN2' 'Normal' 'normal'] with counts: Label
normal    74
Normal    60
CIN1      51
CIN2      32
Name: count, dtype: int64

File: MK482_CVS
Total NaNs in dataframe: 0
Unique labels

# Label Harmonisation

In [5]:
def label_harmonise(filename, column, harmonised_map, savename=None, return_df=False):
    """
    Replace label variants in one column of a CSV with their canonical spelling.

    Parameters:
    - filename: CSV file to read.
    - column (str): Name of the column whose values are replaced.
    - harmonised_map (dict): Maps each variant value to its replacement.
      Values that are not keys of the map are left unchanged.
    - savename: If truthy, the result is written there as CSV without the index.
    - return_df (bool): If True, the harmonised DataFrame is returned.

    Returns:
    - pd.DataFrame when return_df is True, otherwise None.
    """
    harmonised = pd.read_csv(filename)
    harmonised[column] = harmonised[column].replace(harmonised_map)

    if savename:
        harmonised.to_csv(savename, index=False)

    return harmonised if return_df else None

In [6]:
# Spelling variants of the "Normal" label seen across the annotation files
harmonised_map = {
    'NORMAL': 'Normal',
    'normal': 'Normal',
    'normal 1': 'Normal',
}

# Combined files to harmonise
combined_csv_names = [
    "combined_annotations.csv",
    "combined_annotationsTIC.csv",
    "combined_annotationsTIC&Log.csv"
]

# Write "<name>_harmonised.csv" next to each combined file
for filename in combined_csv_names:
    stem = filename.removesuffix(".csv")
    savename = f"{stem}_harmonised.csv"
    label_harmonise(
        filename=filename,
        column='Label',
        harmonised_map=harmonised_map,
        savename=savename,
    )

# Preliminary Analysis
## Counts per Class

In [7]:
from collections import defaultdict

# One label -> integer-class map per configuration, keyed by configuration name
label_mapping_list = {'config0&4': {'Normal': 0, 'CIN1': 1, 'CIN2': 2, 'CGIN': 3, 'HPV INFECTION': 4}, # config0&4
                      'config1':   {'Normal': 0, 'CIN1': 1, 'CIN2': 1, 'CGIN': 1, 'HPV INFECTION': 0}, # config1
                      'config2':   {'Normal': 0, 'CIN1': 1, 'CIN2': 1, 'CGIN': 1, 'HPV INFECTION': 1}, # config2
                      'config3':   {'Normal': 0, 'CIN1': 1, 'CIN2': 2, 'CGIN': 2, 'HPV INFECTION': 0}} # config 3

# Invert every configuration: integer class -> display name made of all original
# labels sharing that class, joined with " + " in the order they appear in the map.
inv_label_mapping_list = {}
for config_name, forward_map in label_mapping_list.items():
    names_by_class = {}
    for label_name, class_idx in forward_map.items():
        names_by_class.setdefault(class_idx, []).append(label_name)
    inv_label_mapping_list[config_name] = {
        class_idx: " + ".join(label_names)
        for class_idx, label_names in names_by_class.items()
    }

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

def counts_per_class(combined_csv, label_mapping_list, inv_label_mapping_list, save=True, show=False):
    """
    Draw one bar chart of row counts per (grouped) label for every label configuration.

    For each configuration the 'Label' column is mapped to its integer class and
    back to the joined display name, so labels that share a class are counted
    together. Labels missing from the configuration end up as 'Unknown'.

    Parameters:
    - combined_csv (pd.DataFrame): Table with a 'Label' column. Not modified.
    - label_mapping_list (dict): configuration name -> {label: integer class}.
    - inv_label_mapping_list (dict): configuration name -> {integer class: display name}.
    - save (bool): If True, writes 'pixelcountperclass_<configuration>.png' at 300 dpi.
    - show (bool): If True, displays each figure.

    Returns:
    - None
    """
    for config_name, forward_map in label_mapping_list.items():
        inverse_map = inv_label_mapping_list[config_name]

        plot_df = combined_csv.copy()
        plot_df['Label'] = (
            plot_df['Label']
            .map(forward_map)
            .map(inverse_map)
            .fillna('Unknown')
        )

        # Bars ordered from most to least frequent label
        bar_order = plot_df['Label'].value_counts().index

        fig, ax = plt.subplots(figsize=(10, 12))
        sns.countplot(data=plot_df, x='Label', order=bar_order, ax=ax)
        ax.bar_label(ax.containers[0])  # print the count above each bar
        ax.set_title(f'Pixel Count per Label, Mapping: {config_name}')
        plt.xticks(rotation=45)
        fig.tight_layout()

        if save:
            fig.savefig(f'pixelcountperclass_{config_name}.png', dpi=300)
        if show:
            plt.show()
        plt.close(fig)

In [9]:
# All combined files have the same counts per label, so any one of them will do
combined_csv = pd.read_csv('combined_annotationsTIC&Log_harmonised.csv')
counts_per_class(
    combined_csv,
    label_mapping_list,
    inv_label_mapping_list,
)

## Total Intensities per Pixel by Label

In [10]:
def total_intensities(filename, label_mapping_list, inv_label_mapping_list,
                      stat, savename, show):
    """
    Plot a histogram of per-row summed intensity, coloured by label, for every label configuration.

    Parameters:
    - filename (str): Label-harmonised combined CSV. Its name, stripped of the
      'combined_annotations' prefix and '_harmonised.csv' suffix, is used in
      the plot title and output file name.
    - label_mapping_list (dict): configuration name -> {label: integer class}.
    - inv_label_mapping_list (dict): configuration name -> {integer class: display name}.
    - stat (str): 'count' or 'density'; forwarded to seaborn's histplot.
    - savename (str or None): If truthy, each figure is saved as
      '<file_base>_<savename>_<unnormalised|normalised>_<configuration>.png'.
    - show (bool): If True, displays each figure.

    Returns:
    - None

    Raises:
    - AssertionError: If `stat` is not 'count' or 'density'.
    - ValueError: If the file has no feature columns.
    """
    # Check the cheap argument first so a typo does not cost a full CSV read
    assert stat in ['count', 'density'], "Parameter 'stat' must be one of 'count' or 'density'."

    combined = pd.read_csv(filename)  # label-harmonised combined .csv

    # Everything except the metadata columns is treated as a feature
    feature_cols = [col for col in combined.columns if col not in ['ID', 'Label', 'sample_id']]
    if not feature_cols:
        raise ValueError("No feature columns found for intensity calculation.")

    # Clean filename base once
    file_base = filename.removeprefix('combined_annotations').removesuffix('_harmonised.csv')

    # These depend only on `stat`, not on the label configuration
    normalisation = 'Unnormalised' if stat == 'count' else 'Normalised'
    y_axis_label = 'Raw count' if stat == 'count' else 'Density'

    # The row sums are identical for every configuration: compute them once
    # instead of copying the whole feature matrix per configuration.
    total_intensity = combined[feature_cols].sum(axis=1)

    for label_mapping in label_mapping_list:
        # Label -> integer class -> joined display name; unmapped labels become 'Unknown'
        grouped_labels = (combined['Label']
                          .map(label_mapping_list[label_mapping])
                          .map(inv_label_mapping_list[label_mapping])
                          .fillna('Unknown'))

        # Only the two columns the plot needs
        plot_df = pd.DataFrame({'Label': grouped_labels,
                                'total_intensity': total_intensity})

        plt.figure(figsize=(10, 12))
        sns.histplot(data=plot_df, x='total_intensity', hue='Label',
                     element='step', stat=stat, common_norm=True)
        plt.title(f"({normalisation}) Total Intensity per Pixel by Label\n{file_base}, {label_mapping}")
        plt.xlabel('Total Intensities per Pixel')
        plt.ylabel(y_axis_label)
        plt.tight_layout()
        if savename:
            plt.savefig(f"{file_base}_{savename}_{normalisation.lower()}_{label_mapping}.png", dpi=300)
        if show:
            plt.show()
        plt.close()

In [11]:
# Label-harmonised combined files: raw, TIC-normalised, TIC + log
combined_csv_names = ["combined_annotations_harmonised.csv",
                      "combined_annotationsTIC_harmonised.csv",
                      "combined_annotationsTIC&Log_harmonised.csv"]

# Every statistic for every file, statistic-major
plot_jobs = [(stat, filename)
             for stat in ['count', 'density']
             for filename in combined_csv_names]

for stat, filename in plot_jobs:
    total_intensities(
        filename, label_mapping_list, inv_label_mapping_list,
        stat=stat, savename='totalintensitiesperpixel', show=False,
    )

## Mean Spectra per Label

In [12]:
def mean_spectra(filename, label_mapping_list, inv_label_mapping_list,
                 savename, show):
    """
    Plot the mean spectrum of every (grouped) label, once per label configuration.

    Parameters:
    - filename (str): Label-harmonised combined CSV. Must be named
      'combined_annotations[TIC|TIC&Log]_harmonised.csv'; the middle part
      selects the plot title and is included in the output file name.
    - label_mapping_list (dict): configuration name -> {label: integer class}.
    - inv_label_mapping_list (dict): configuration name -> {integer class: display name}.
    - savename: Only checked for truthiness. If truthy, each figure is saved as
      'MeanSpectraPerLabel_<Raw|TIC|TIC&Log>_<configuration>.png'.
    - show (bool): If True, displays each figure.

    Returns:
    - None

    Raises:
    - AssertionError: If `filename` does not follow the expected naming.
    - ValueError: If the file has no feature columns.
    """
    # Clean filename base once, and reject unexpected names before the CSV read
    file_base = filename.removeprefix('combined_annotations').removesuffix('_harmonised.csv')

    assert file_base in ['TIC&Log', 'TIC', ''], f"File base must be one of ['TIC&Log', 'TIC', ''], got '{file_base}'."

    combined = pd.read_csv(filename)  # label harmonised combined .csv

    # Everything except the metadata columns is treated as a feature
    feature_cols = [col for col in combined.columns if col not in ['ID', 'Label', 'sample_id']]
    if not feature_cols:
        raise ValueError("No feature columns found for intensity calculation.")

    # Both depend only on the input file, not on the label configuration
    plot_title_add = {
        'TIC&Log': 'TIC Normalised + Log Transformed',
        'TIC': 'TIC Normalised',
        '': 'Raw',
    }[file_base]
    # Tag the output with the preprocessing variant; otherwise the raw, TIC and
    # TIC&Log runs write to the same file names and overwrite each other.
    file_tag = file_base if file_base else 'Raw'

    # Select the feature matrix once instead of copying the whole table per configuration
    feature_values = combined[feature_cols]

    for label_mapping in label_mapping_list:
        # Label -> integer class -> joined display name; unmapped labels become 'Unknown'
        grouped_labels = (combined['Label']
                          .map(label_mapping_list[label_mapping])
                          .map(inv_label_mapping_list[label_mapping])
                          .fillna('Unknown'))

        # Rows: features, columns: labels
        spectra_by_label = feature_values.groupby(grouped_labels).mean().T

        # Plot average spectra; feature column names are parsed as the x (m/z) values
        plt.figure(figsize=(12, 6))
        for label in spectra_by_label.columns:
            plt.plot(spectra_by_label.index.astype(float), spectra_by_label[label], label=label)

        plt.title(f'Average {plot_title_add} Spectrum by Label {label_mapping}')
        plt.xlabel('m/z')
        plt.ylabel('Mean Intensity')
        plt.legend()
        plt.tight_layout()

        if savename:
            plt.savefig(f'MeanSpectraPerLabel_{file_tag}_{label_mapping}.png', dpi=300)
        if show:
            plt.show()
        plt.close()

In [13]:
# Label-harmonised combined files: raw, TIC-normalised, TIC + log
combined_csv_names = ["combined_annotations_harmonised.csv",
                      "combined_annotationsTIC_harmonised.csv", 
                      "combined_annotationsTIC&Log_harmonised.csv"]

# mean_spectra only checks `savename` for truthiness (any non-empty value turns
# saving on), so pass a value that names this plot rather than the leftover
# 'totalintensitiesperpixel' from the total-intensity cell.
for filename in combined_csv_names:
    mean_spectra(filename, label_mapping_list, inv_label_mapping_list,
                 savename='MeanSpectraPerLabel', show=False)

# Saving Label Configured Files

In [14]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

def save_configured_files(filename, label_mapping, savename, savefolder):
    """
    Encode the labels of a combined CSV with one label configuration and save the result as a compressed .npz.

    The archive holds three arrays:
    - X: values of every column that is not one of 'global_pixel_id', 'ID',
      'Label', 'sample_id', 'total_intensity'.
    - y: integer class of each row according to `label_mapping`.
    - sample_ids: 'sample_id' of each row as a fixed-width unicode array.

    Parameters:
    - filename: Combined CSV to read; must contain 'Label' and 'sample_id' columns.
    - label_mapping (dict): {label: integer class}.
    - savename (str): File name of the archive.
    - savefolder (str): Folder, created if needed, next to the current working
      directory (i.e. inside its parent) that receives the archive.

    Returns:
    - None

    Raises:
    - ValueError: If any label in the file is missing from `label_mapping`.
    """
    combined = pd.read_csv(filename)
    non_feature_cols = ['global_pixel_id', 'ID', 'Label', 'sample_id', 'total_intensity']
    features = [col for col in combined.columns if col not in non_feature_cols]

    # Validate the labels before touching the filesystem
    y_encoded = combined['Label'].map(label_mapping)
    unmapped = y_encoded.isnull()
    if unmapped.any():
        missing = combined.loc[unmapped, 'Label'].unique()
        raise ValueError(f"Found unmapped labels: {missing}")

    save_dir = Path.cwd().parent / savefolder
    os.makedirs(save_dir, exist_ok=True)

    X = combined[features].values
    y_encoded = y_encoded.astype(int).values

    # Store sample ids as a real unicode ('<U') array. An object-dtype array
    # would be pickled inside the archive and could then only be read back
    # with np.load(..., allow_pickle=True).
    sample_ids = combined['sample_id'].astype(str).to_numpy(dtype=str)

    np.savez_compressed(save_dir / savename, X=X, y=y_encoded, sample_ids = sample_ids)
    print(f"Saved compressed data to {save_dir / savename}")

In [15]:
# Only the TIC + log file is needed for the rest of the analysis;
# write one .npz per label configuration.
for label_mapping_name, label_mapping in label_mapping_list.items():
    save_configured_files(
        filename="combined_annotationsTIC&Log_harmonised.csv",
        label_mapping=label_mapping,
        savename=f'combined_log_transformed_{label_mapping_name}.npz',
        savefolder='PreppedData',
    )

Saved compressed data to /home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis/PreppedData/combined_log_transformed_config0&4.npz
Saved compressed data to /home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis/PreppedData/combined_log_transformed_config1.npz
Saved compressed data to /home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis/PreppedData/combined_log_transformed_config2.npz
Saved compressed data to /home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis/PreppedData/combined_log_transformed_config3.npz
