In [1]:
import pandas as pd
import numpy as np
import os
import glob
from config import *

def add_label(dataframe, label, meta):
    """
    Add a label column to the given dataframe based on metadata.

    Parameters:
    dataframe (pd.DataFrame): Dataframe containing gene expression data.
    label (str): The label column name to be added.
    meta (pd.DataFrame): Metadata dataframe where labels are stored.

    Returns:
    pd.DataFrame: Updated dataframe with the new label column.
    """
    label_list = []
    for spot in dataframe.index.values:
        # Extract sample ID by removing the last underscore-separated segment
        sample_id = '_'.join(spot.split('_')[:-1])
        # Retrieve the corresponding label from metadata
        spot_label = meta.loc[sample_id, label]
        label_list.append(spot_label)
    # Add the extracted labels to the dataframe
    dataframe[label] = label_list
    return dataframe


if __name__ == '__main__':
    # Load metadata file, assuming tab-separated values
    meta_mouse = pd.read_csv(META_PATH, header=0, sep='\t', index_col=0)
    # Extract sample names from the metadata index
    sample_name = list(meta_mouse.index)
    
    total_counts = pd.DataFrame()  # Initialize an empty dataframe to store combined data
    
    # Iterate over all text files in the CM_PATH directory
    for file in glob.glob(CM_PATH+'*.txt'):
        # Extract sample name from the filename
        sample_n = '_'.join(os.path.basename(file).split("_")[0:-4])
        
        # Process only if the sample is present in metadata
        if sample_n in sample_name:
            # Load the count matrix (genes as columns, spots as rows)
            cm = pd.read_csv(file, header=0, sep='\t', index_col=0)
            
            # Reformat spot names to include sample name as prefix
            new_spots = ["{0}_{1}".format(sample_n, spot) for spot in cm.index]
            cm.index = new_spots
            
            # Concatenate current sample data with total_counts dataframe
            total_counts = pd.concat([total_counts, cm], sort=False)

    # Replace infinite values with NaN and fill missing values with 0
    total_counts.replace([np.inf, -np.inf], np.nan)
    total_counts.fillna(0.0, inplace=True)

    # Store initial counts for logging purposes
    num_spots = len(total_counts.index)
    num_genes = len(total_counts.columns)

    # Remove low-quality spots
    min_genes_spot = round((total_counts != 0).sum(axis=1).quantile(THRESHOLD_SPOT))
    print("Number of expressed genes a spot must have to be kept ({}% of total expressed genes) {}".format(THRESHOLD_SPOT, min_genes_spot))
    
    # Filter out spots that express fewer genes than the threshold
    total_counts = total_counts[(total_counts != 0).sum(axis=1) >= min_genes_spot]
    print("Dropped {} spots".format(num_spots - len(total_counts.index)))

    # Transpose the dataframe to have spots as columns and genes as rows
    total_counts = total_counts.transpose()

    # Remove low-quality genes
    min_spots_gene = round(len(total_counts.columns) * THRESHOLD_GENE)
    print("Removing genes that are expressed in less than {} spots with a count of at least {}".format(min_spots_gene, MIN_EXP))
    
    # Filter out genes that do not meet the minimum expression criteria
    total_counts = total_counts[(total_counts >= MIN_EXP).sum(axis=1) >= min_spots_gene]
    print("Dropped {} genes".format(num_genes - len(total_counts.index)))

    # Transpose back to have genes as columns and spots as rows
    total_counts = total_counts.transpose()

    # Normalize gene expression data by dividing each value by the total sum per spot
    row_sum = total_counts.sum(axis=1)
    normal_total_counts = total_counts.div(row_sum, axis=0)

    # Add labels to the normalized data
    normal_total_counts = add_label(normal_total_counts, LABEL_COLUMN, meta_mouse)
    if CONDITION_COLUMN:
        normal_total_counts = add_label(normal_total_counts, CONDITION_COLUMN, meta_mouse)

    # Save the final processed data
    output_path = os.path.join(DATASET_PATH, 'cm_norm.tsv')
    print(f"Saving processed dataset to: {output_path}")
    normal_total_counts.to_csv(output_path, sep='\t')
    print("Dataset saved successfully!\n")

    print("Processing completed! ✅")


Number of expressed genes a spot must have to be kept (0.01% of total expressed genes) 132
Dropped 21 spots
Removing genes that are expressed in less than 22 spots with a count of at least 1
Dropped 6542 genes
Saving processed dataset to: ../dataset/cm_norm.tsv
Dataset saved successfully!

Processing completed! ✅


In [3]:
# Load the TSV file as a DataFrame
df = pd.read_csv(output_path, sep="\t")

# Print the first 10 column names
print(df.columns[0:10])

Index(['Unnamed: 0', 'Fam234a', 'Nefl', 'Sema5a', 'Tom1l2', 'Nbea', 'Mif',
       'Pcsk1n', 'Tsfm', 'Zfp706'],
      dtype='object')


In [25]:
# Get the last two column names
last_two_columns = df.columns[-2:]

# Print the column names
print("Last two columns:", last_two_columns.tolist())

Last two columns: ['age', 'breed']


In [5]:

df = pd.read_csv(output_path, sep="\t")
# Extract the number of rows (spots) and columns (genes)
num_rows, num_columns = df.shape

# Subtract 2 from the number of columns since they are not part of the genes
num_columns -= 2

# Print formatted output
print(f"Row (Spots) = {num_rows}, Column (Genes) = {num_columns}")

Row (Spots) = 2239, Column (Genes) = 11640


In [7]:

import re  # For regex pattern matching

# Define paths
#output_file = os.path.join(DATASET_PATH, "cm_norm.tsv")
spatco_path = os.path.join(DATASET_PATH, "spatial_coords.tsv")

def extract_spatial_coords_from_file(output_path, spatco_path):
    """
    Extract spatial X, Y coordinates from cm_norm.tsv and save to a file.

    Parameters:
    output_file (str): Path to the count matrix file (cm_norm.tsv).
    spatco_path (str): Path to save the extracted spatial coordinates.

    Returns:
    pd.DataFrame: DataFrame with extracted X, Y coordinates.
    """
    # Load the count matrix (only the index is needed)
    cm = pd.read_csv(output_path, sep="\t", index_col=0)

    # Extract X, Y coordinates from spot names
    spatial_coords = cm.index.to_series().str.extract(r'(?P<X>[\d.]+)x(?P<Y>[\d.]+)').astype(float)

    # Save spatial coordinates
    spatial_coords.to_csv(spatco_path, sep="\t")
    print(f"✅ Spatial coordinates extracted and saved to {spatco_path}")

    return spatial_coords

# Run the extraction function
spatial_coords = extract_spatial_coords_from_file(output_path, spatco_path)


✅ Spatial coordinates extracted and saved to ../dataset/spatial_coords.tsv


In [9]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1
