In [None]:
# Importing libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import shutil

# Exploring labels in the PanopTILs dataset

folder_path = "/Users/ashishsingh/Desktop/csv"

# Accumulators for the distinct labels seen across every CSV in the folder
all_raw_group, all_group = set(), set()

for file_name in os.listdir(folder_path):
    # Anything that is not a CSV is ignored
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, file_name)
    data = pd.read_csv(file_path)

    # Merge this file's distinct labels into the running sets
    all_raw_group |= set(data['raw_group'].unique())
    all_group |= set(data['group'].unique())

# Report the pooled labels, one heading line followed by the set itself
print("Concatenated unique values in 'raw_group' column across all files:", all_raw_group, sep="\n")
print("\nConcatenated unique values in 'group' column across all files:", all_group, sep="\n")

In [None]:
# Checking the number of files in each of the four PanopTILs folder

rgb_dir = "/Users/ashishsingh/Desktop/rgbs"  # path to RGB WSI folder
mask_dir = "/Users/ashishsingh/Desktop/masks"  # path to masks folder
csv_dir = "/Users/ashishsingh/Desktop/csv"  # path to CSV folder
vis_dir = "/Users/ashishsingh/Desktop/vis"  # path to visualisation images folder

# Report how many directory entries each folder holds (every entry is counted,
# whatever its type or extension)
for _label, _directory in (
    ("RGB images", rgb_dir),
    ("masks", mask_dir),
    ("CSV files", csv_dir),
    ("visualizations", vis_dir),
):
    print(f"Number of {_label}: {len(os.listdir(_directory))}")

In [None]:
# Exploring the CSV folder and files in more detail

csv_file_path = "/Users/ashishsingh/Desktop/ALL_FOV_LOCATIONS.csv"
data = pd.read_csv(csv_file_path)

# Distinct values of the 'roiname' column, and how many of them there are
# (nunique() leaves missing values out of the count)
roiname_column = data['roiname']
unique_roiname = roiname_column.unique()
number_of_unique_roiname = roiname_column.nunique()

# Each heading is followed by its value on the next line
print("Number of unqiue values in 'roiname' column:", number_of_unique_roiname, sep="\n")
print("Unique values in 'roiname' column:", unique_roiname, sep="\n")

In [None]:
# Creating and verifying four 'filtered' folders, each containing 1316 files

# Destination folders live directly under the current user's Desktop
desktop_path = os.path.expanduser("~/Desktop")
filtered_rgb_dir, filtered_mask_dir, filtered_csv_dir, filtered_vis_dir = (
    os.path.join(desktop_path, folder)
    for folder in ("filtered_rgbs", "filtered_masks", "filtered_csvs", "filtered_visualizations")
)

# Make sure every destination exists (a no-op when it is already there)
for _destination in (filtered_rgb_dir, filtered_mask_dir, filtered_csv_dir, filtered_vis_dir):
    os.makedirs(_destination, exist_ok=True)


def _base_names(directory, extension, marker=None):
    """Collect extension-less names of the entries in ``directory`` ending in ``extension``.

    When ``marker`` is given, every occurrence of it is removed from each name.
    """
    names = set()
    for entry in os.listdir(directory):
        if entry.endswith(extension):
            stem = os.path.splitext(entry)[0]
            names.add(stem if marker is None else stem.replace(marker, ""))
    return names


# Base names per source folder; masks and visualisations have their markers removed
rgb_files = _base_names(rgb_dir, ".png")
mask_files = _base_names(mask_dir, ".png", "-mask")
csv_files = _base_names(csv_dir, ".csv")
vis_files = _base_names(vis_dir, ".png", "-viz")

# Only names present in all four folders are kept
common_files = rgb_files.intersection(mask_files, csv_files, vis_files)

print(f"Number of common files across all folders: {len(common_files)}")

# Helper function to filter files based on common names
def filter_and_copy_files(source_dir, dest_dir, file_extension, suffix="", common=None):
    """Copy the files of ``source_dir`` whose base name is in the common set.

    Parameters
    ----------
    source_dir : str
        Folder to read from.
    dest_dir : str
        Existing folder to copy into; files keep their original names.
    file_extension : str
        Only entries whose name ends with this extension (e.g. ``".png"``)
        are considered. The comparison is case-sensitive.
    suffix : str, optional
        Marker such as ``"-mask"`` that is removed from the extension-less
        name before the lookup.
    common : collection of str, optional
        Base names to keep. Defaults to the module-level ``common_files``.

    Returns
    -------
    int
        Number of files copied.
    """
    wanted = common_files if common is None else common
    copied = 0
    for file_name in os.listdir(source_dir):
        # Entries of any other type (hidden files, stray exports) are skipped
        if not file_name.endswith(file_extension):
            continue
        base_name = os.path.splitext(file_name)[0].replace(suffix, "")
        if base_name in wanted:
            shutil.copy(os.path.join(source_dir, file_name), os.path.join(dest_dir, file_name))
            copied += 1
    return copied

# Copy the files shared by all four sources into their filtered folders:
# (source folder, destination folder, extension, name marker)
for _src, _dst, _ext, _marker in (
    (rgb_dir, filtered_rgb_dir, ".png", ""),
    (mask_dir, filtered_mask_dir, ".png", "-mask"),
    (csv_dir, filtered_csv_dir, ".csv", ""),
    (vis_dir, filtered_vis_dir, ".png", "-viz"),
):
    filter_and_copy_files(_src, _dst, _ext, suffix=_marker)

# Tell the reader where each filtered folder ended up
print("\nFiltered files have been saved to the following directories on your Desktop:")
for _title, _folder in (
    ("Filtered RGBs", filtered_rgb_dir),
    ("Filtered Masks", filtered_mask_dir),
    ("Filtered CSVs", filtered_csv_dir),
    ("Filtered Visualizations", filtered_vis_dir),
):
    print(f"{_title}: {_folder}")

In [None]:
# Further verifying all roinames in csv file correspond to RGB WSI files

csv_file_path = "/Users/ashishsingh/Desktop/ALL_FOV_LOCATIONS.csv"
rgb_folder_path = "/Users/ashishsingh/Desktop/filtered_rgbs"


def _png_stem(name):
    """Return ``name`` as a string with one trailing ``.png`` removed, if present."""
    name = str(name)
    return name[:-len(".png")] if name.endswith(".png") else name


try:
    # Load the main CSV file and check for the "roiname" column
    csv_data = pd.read_csv(csv_file_path)
    if "roiname" not in csv_data.columns:
        raise ValueError("The 'roiname' column is not found in the CSV file.")

    # Unique "roiname" values, as strings
    csv_roinames = {f"{roiname}" for roiname in csv_data['roiname'].unique()}

    # File names (with extensions) of the PNGs in the RGB folder
    rgb_files = {file for file in os.listdir(rgb_folder_path) if file.endswith(".png")}

    # Compare on extension-less names so the check gives the same answer
    # whether or not the CSV stores the ".png" extension in "roiname"
    rgb_stems = {_png_stem(file) for file in rgb_files}
    non_matching_roinames = sorted(
        roiname for roiname in csv_roinames if _png_stem(roiname) not in rgb_stems
    )

    # Display the results
    if non_matching_roinames:
        print(f"Number of non-matching 'roiname' entries: {len(non_matching_roinames)}")
        print("Non-matching 'roiname' entries:")
        for roiname in non_matching_roinames:
            print(roiname)
    else:
        print("All 'roiname' entries match the file names in the RGB folder.")
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


In [None]:
# Final check of all files across the four PanopTILs folder

# NOTE: these assignments rebind rgb_dir / mask_dir / csv_dir / vis_dir, which an
# earlier cell pointed at the unfiltered folders, to the filtered copies.
rgb_dir = "/Users/ashishsingh/Desktop/filtered_rgbs"  # Replace with the path to your RGB images folder
mask_dir = "/Users/ashishsingh/Desktop/filtered_masks"  # Replace with the path to your masks folder
csv_dir = "/Users/ashishsingh/Desktop/filtered_csvs"  # Replace with the path to your CSV folder
vis_dir = "/Users/ashishsingh/Desktop/filtered_visualizations"  # Replace with the path to your visualisation images folder

# Report how many directory entries each filtered folder holds
for _label, _directory in (
    ("RGB images", rgb_dir),
    ("masks", mask_dir),
    ("CSV files", csv_dir),
    ("visualizations", vis_dir),
):
    print(f"Number of {_label}: {len(os.listdir(_directory))}")

In [None]:
# Exploring and displaying segmented WSIs with ROIs and masks

# Folder paths for the display task (kept under their own names, separate from
# the *_dir variables used by the other cells)
rgb_folder, mask_folder, csv_folder, vis_folder = (
    "/Users/ashishsingh/Desktop/filtered_rgbs",
    "/Users/ashishsingh/Desktop/filtered_masks",
    "/Users/ashishsingh/Desktop/filtered_csvs",
    "/Users/ashishsingh/Desktop/filtered_visualizations",
)

# Function to load an image with error handling
def load_image(file_path, convert_color=True):
    """Read an image with OpenCV, reporting problems instead of raising.

    Parameters
    ----------
    file_path : str
        Location of the image on disk.
    convert_color : bool, optional
        When true (the default) the loaded array is passed through
        ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)``; otherwise it is returned
        exactly as ``cv2.imread`` produced it.

    Returns
    -------
    The image array, or ``None`` when the path does not exist or
    ``cv2.imread`` could not decode it (a message is printed in both cases).
    """
    if os.path.exists(file_path):
        pixels = cv2.imread(file_path)
        if pixels is not None:
            if convert_color:
                return cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)
            return pixels

        # imread signals an unreadable file by returning None
        print(f"Failed to load image: {file_path}. Check the file format or integrity.")
        return None

    print(f"File not found: {file_path}")
    return None

# Process each RGB image in the folder

# Columns a per-image CSV must provide before its ROIs are drawn
required_columns = {"raw_group", "group", "type", "left", "right", "top", "bottom", "coords_x", "coords_y"}

# Colours are RGB triplets: load_image() converts the RGB image out of OpenCV's
# BGR order, and matplotlib displays arrays as RGB
ROI_BOX_COLOUR = (0, 0, 255)   # blue
ROI_TEXT_COLOUR = (0, 255, 0)  # green


def _show_image(image, title, cmap=None):
    """Display ``image`` in its own 10x10 inch figure with the axes hidden."""
    plt.figure(figsize=(10, 10))
    plt.imshow(image, cmap=cmap)
    plt.title(title)
    plt.axis("off")
    plt.show()


# Sorted so the figures appear in the same order on every run
for rgb_file in sorted(os.listdir(rgb_folder)):
    if not rgb_file.endswith(".png"):
        continue

    # The mask and visualisation are looked up under the same file name as the
    # RGB image; the CSV shares the base name with a ".csv" extension
    rgb_image_path = os.path.join(rgb_folder, rgb_file)
    mask_image_path = os.path.join(mask_folder, rgb_file)
    vis_image_path = os.path.join(vis_folder, rgb_file)
    csv_file_path = os.path.join(csv_folder, os.path.splitext(rgb_file)[0] + ".csv")

    print(f"Processing file: {rgb_file}")

    # Load images (each is None when missing or unreadable)
    rgb_image = load_image(rgb_image_path)
    mask_image = load_image(mask_image_path, convert_color=False)
    vis_image = load_image(vis_image_path)

    # Load CSV
    if os.path.exists(csv_file_path):
        csv_data = pd.read_csv(csv_file_path)
    else:
        print(f"CSV file not found: {csv_file_path}")
        csv_data = None

    # A CSV without the expected columns means this image is skipped entirely
    if csv_data is not None and not required_columns.issubset(csv_data.columns):
        print(f"The CSV file does not contain the required columns: {required_columns}")
        continue

    # Display images
    if rgb_image is not None:
        _show_image(rgb_image, f"RGB Image: {rgb_file}")

    if mask_image is not None:
        _show_image(mask_image, f"Segmentation Mask: {rgb_file}", cmap="gray")

    if vis_image is not None:
        _show_image(vis_image, f"Segmentation Mask Visualisation: {rgb_file}")

    # Draw ROIs on a copy of the RGB image
    if csv_data is not None and rgb_image is not None:
        rgb_with_rois = rgb_image.copy()

        # Rows without a complete bounding box cannot be drawn
        boxes = csv_data.dropna(subset=["left", "right", "top", "bottom"])
        for _, roi in boxes.iterrows():
            # OpenCV drawing calls need plain Python ints and a str label;
            # values coming out of pandas rows may be NumPy scalars, floats or NaN
            x1, x2, y1, y2 = (int(roi[edge]) for edge in ("left", "right", "top", "bottom"))
            group = str(roi["group"])

            cv2.rectangle(rgb_with_rois, (x1, y1), (x2, y2), ROI_BOX_COLOUR, 2)
            # Annotate the box with its group just above the top-left corner
            cv2.putText(rgb_with_rois, group, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, ROI_TEXT_COLOUR, 1)

        _show_image(rgb_with_rois, f"RGB Image with ROIs: {rgb_file}")

In [None]:
# Exploratory analysis of TILs

# Paths to folders (different assigned variables as for a separate task)
filtered_rgb_dir = "/Users/ashishsingh/Desktop/filtered_rgbs"
filtered_vis_dir = "/Users/ashishsingh/Desktop/filtered_visualizations"
filtered_mask_dir = "/Users/ashishsingh/Desktop/filtered_masks"
filtered_csv_dir = "/Users/ashishsingh/Desktop/filtered_csvs"

# Loop through all CSV files in the filtered_csv folder (sorted for a stable order)
for csv_file in sorted(os.listdir(filtered_csv_dir)):
    if not csv_file.endswith(".csv"):
        continue

    csv_path = os.path.join(filtered_csv_dir, csv_file)
    csv_data = pd.read_csv(csv_path)

    # Independent copy of the TILsCell rows, so the derived columns added below
    # are not written onto a slice of csv_data (avoids SettingWithCopyWarning)
    tils_data = csv_data.loc[csv_data['group'] == 'TILsCell'].copy()

    if tils_data.empty:
        print(f"No TILsCell regions found in {csv_file}.")
        continue

    # Centroid of each bounding box
    tils_data['centroid_x'] = (tils_data['left'] + tils_data['right']) / 2
    tils_data['centroid_y'] = (tils_data['top'] + tils_data['bottom']) / 2

    # Plot spatial distribution
    plt.figure(figsize=(10, 10))
    plt.scatter(tils_data['centroid_x'], tils_data['centroid_y'], alpha=0.7, color='green')
    plt.title(f"Spatial Distribution of TILsCell Centroids in {csv_file}")
    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    plt.gca().invert_yaxis()  # Invert y-axis to match image coordinates
    plt.grid(True)
    plt.show()

    # Bounding-box area of each region
    tils_data['area'] = (tils_data['right'] - tils_data['left']) * (tils_data['bottom'] - tils_data['top'])

    # Heterogeneity as the coefficient of variation of the areas.
    # std() is NaN for a single region; a zero mean would divide by zero.
    area_mean = tils_data['area'].mean()
    area_std = tils_data['area'].std()
    area_coefficient_of_variation = area_std / area_mean if area_mean else np.nan

    # Print analysis results
    print(f"TILsCell Spatial Analysis for {csv_file}:")
    print(f"Number of TILsCell regions: {len(tils_data)}")
    print(f"Mean area: {area_mean:.2f}")
    print(f"Standard deviation of area: {area_std:.2f}")
    print(f"Coefficient of variation (heterogeneity): {area_coefficient_of_variation:.2f}")

# Summary of processing
print("\nProcessing complete. Check the visualizations and results for each file.")

In [None]:
# Measuring spatial distribution of cancer cells, TILs and stroma on a scatter plot

# Define groups of interest
groups_of_interest = [
    'ActiveStromalCellNOS',
    'ActiveTILsCell',
    'CancerEpithelium',
    'StromalCellNOS',
    'TILsCell'
]

# Loop through all CSV files in the filtered_csv folder (sorted for a stable order)
for csv_file in sorted(os.listdir(filtered_csv_dir)):
    if not csv_file.endswith(".csv"):
        continue

    csv_path = os.path.join(filtered_csv_dir, csv_file)
    csv_data = pd.read_csv(csv_path)

    # One figure per CSV, one scatter series per group that is present
    plt.figure(figsize=(10, 10))
    plotted_any = False

    for group in groups_of_interest:
        group_data = csv_data[csv_data['group'] == group]
        if group_data.empty:
            continue

        # Bounding-box centroids, computed as standalone Series so nothing is
        # assigned onto a filtered slice of csv_data
        centroid_x = (group_data['left'] + group_data['right']) / 2
        centroid_y = (group_data['top'] + group_data['bottom']) / 2

        plt.scatter(centroid_x, centroid_y, alpha=0.7, label=group)
        plotted_any = True

    # Customise plot
    plt.title(f"Spatial Distribution of Groups in {csv_file}")
    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    plt.gca().invert_yaxis()  # Invert y-axis to match image coordinates
    if plotted_any:
        # legend() warns when there is no labelled series to list
        plt.legend(loc='best')
    plt.grid(True)
    plt.show()


In [None]:
# Assessing spread of cancer epithelium (spatial heterogeneity) using a simple coefficient of variation

# Loop through all CSV files in the filtered_csv folder (sorted for a stable order)
for csv_file in sorted(os.listdir(filtered_csv_dir)):
    if not csv_file.endswith(".csv"):
        continue

    csv_path = os.path.join(filtered_csv_dir, csv_file)
    csv_data = pd.read_csv(csv_path)

    # Independent copy of the CancerEpithelium rows, so the derived columns added
    # below are not written onto a slice of csv_data (avoids SettingWithCopyWarning)
    cancer_data = csv_data.loc[csv_data['group'] == 'CancerEpithelium'].copy()

    if cancer_data.empty:
        print(f"No CancerEpithelium regions found in {csv_file}.")
        continue

    # Centroid of each bounding box
    cancer_data['centroid_x'] = (cancer_data['left'] + cancer_data['right']) / 2
    cancer_data['centroid_y'] = (cancer_data['top'] + cancer_data['bottom']) / 2

    # Plot spatial distribution
    plt.figure(figsize=(10, 10))
    plt.scatter(cancer_data['centroid_x'], cancer_data['centroid_y'], alpha=0.7, color='blue')
    plt.title(f"Spatial Distribution of CancerEpithelium Centroids in {csv_file}")
    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    plt.gca().invert_yaxis()  # Invert y-axis to match image coordinates
    plt.grid(True)
    plt.show()

    # Bounding-box area of each region
    cancer_data['area'] = (cancer_data['right'] - cancer_data['left']) * (cancer_data['bottom'] - cancer_data['top'])

    # Heterogeneity as the coefficient of variation of the areas.
    # std() is NaN for a single region; a zero mean would divide by zero.
    area_mean = cancer_data['area'].mean()
    area_std = cancer_data['area'].std()
    area_coefficient_of_variation = area_std / area_mean if area_mean else np.nan

    # Print analysis results
    print(f"CancerEpithelium Spatial Analysis for {csv_file}:")
    print(f"Number of CancerEpithelium regions: {len(cancer_data)}")
    print(f"Mean area: {area_mean:.2f}")
    print(f"Standard deviation of area: {area_std:.2f}")
    print(f"Coefficient of variation (heterogeneity): {area_coefficient_of_variation:.2f}")

# Summary of processing
print("\nProcessing complete. Check the visualizations and results for each file.")

In [None]:
# Manipulating BCSS (Breast Cancer Semantic Segmentation) dataset to obtain breast cancer patients

input_csv_path = "/Users/ashishsingh/Desktop/bcss.csv"

# Where the BRCA-only rows are written
output_csv_path = "~/Desktop/filtered_brca.csv"

try:
    data = pd.read_csv(input_csv_path)

    # Without a "type" column there is nothing to filter on
    if "type" not in data.columns:
        raise ValueError("The 'type' column is not found in the CSV file.")

    # Keep the rows whose "type" equals "BRCA" and write them out without the index
    filtered_data = data.loc[data["type"] == "BRCA"]
    filtered_data.to_csv(output_csv_path, index=False)
except FileNotFoundError:
    print(f"File not found: {input_csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"Filtered data has been saved to: {output_csv_path}")
    print(f"Number of rows in the filtered data: {len(filtered_data)}")

In [None]:
# Manipulating available PanopTILs data aiming to identify and verify BCSS data

csv_folder_path = "/Users/ashishsingh/Desktop/filtered_csvs"

# Where the list of trimmed names is written
output_csv_path = "~/Desktop/panoptils_to_bcss.csv"

try:
    # First 12 characters of every CSV file name found in the folder
    trimmed_names = [
        entry[:12]
        for entry in os.listdir(csv_folder_path)
        if entry.endswith(".csv")
    ]

    # One-column table of those names, written without the index
    trimmed_names_df = pd.DataFrame({"trimmed_names": trimmed_names})
    trimmed_names_df.to_csv(output_csv_path, index=False)
except FileNotFoundError:
    print(f"Folder not found: {csv_folder_path}")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"New CSV file with trimmed names has been saved to: {output_csv_path}")

In [None]:
# Identifying relevant data points from BCSS

filtered_brca_path = "~/Desktop/filtered_brca.csv"
panoptils_to_bcss_path = "~/Desktop/panoptils_to_bcss.csv"

# The verification cell further down reads this file (column "Similar Values"),
# so the intersection is persisted here rather than only printed
similar_values_output_path = os.path.expanduser("~/Desktop/similar_values_output.csv")

try:
    # Load the CSV files
    filtered_brca_data = pd.read_csv(filtered_brca_path)
    panoptils_to_bcss_data = pd.read_csv(panoptils_to_bcss_path)

    # Check if the required columns exist
    if "bcr_patient_barcode" not in filtered_brca_data.columns:
        raise ValueError("The 'bcr_patient_barcode' column is not found in the filtered_brca CSV file.")
    if "trimmed_names" not in panoptils_to_bcss_data.columns:
        raise ValueError("The 'trimmed_names' column is not found in the panoptils_to_bcss CSV file.")

    # Drop missing entries (so NaN can never count as a shared value), remove
    # spaces and keep the distinct values
    bcr_patient_barcodes = filtered_brca_data["bcr_patient_barcode"].dropna().str.replace(" ", "").unique()
    trimmed_names = panoptils_to_bcss_data["trimmed_names"].dropna().str.replace(" ", "").unique()

    # Values present in both files
    similar_values = set(bcr_patient_barcodes) & set(trimmed_names)

    # Persist the result, sorted for a stable file
    pd.DataFrame({"Similar Values": sorted(similar_values)}).to_csv(similar_values_output_path, index=False)

    # Display the result
    print(f"Number of similar values: {len(similar_values)}")
    if similar_values:
        print("Similar values:")
        print(similar_values)
    else:
        print("No similar values found.")
    print(f"Similar values saved to: {similar_values_output_path}")

except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Verifying relevant data from BCSS

filtered_csvs_path = "/Users/ashishsingh/Desktop/filtered_csvs"
inter_csvs_path = os.path.expanduser("~/Desktop/inter_csvs")  # Path to the inter_csvs folder
similar_values_path = os.path.expanduser("~/Desktop/similar_values_output.csv")  # Path to the similar_values_output.csv file

# shutil.copy does not create missing folders, so make sure the destination exists
os.makedirs(inter_csvs_path, exist_ok=True)

# Copy every CSV into inter_csvs under the first 12 characters of its name.
# Files sharing the same 12-character prefix map to the same destination, so a
# later copy replaces an earlier one.
for csv_file in os.listdir(filtered_csvs_path):
    if csv_file.endswith(".csv"):
        new_file_name = f"{csv_file[:12]}.csv"

        # Paths for the source and destination
        source_file = os.path.join(filtered_csvs_path, csv_file)
        destination_file = os.path.join(inter_csvs_path, new_file_name)

        # Copy and rename the file
        shutil.copy(source_file, destination_file)
        print(f"Processed: {csv_file} -> {new_file_name}")

print("Renaming and processing complete. Files saved to the inter_csvs folder.")

try:
    # Check if the folder exists
    if not os.path.exists(inter_csvs_path):
        raise FileNotFoundError(f"The folder '{inter_csvs_path}' does not exist.")

    # Names of the CSV files in the folder, without the '.csv' extension
    file_names = [
        os.path.splitext(file)[0] for file in os.listdir(inter_csvs_path)
        if os.path.isfile(os.path.join(inter_csvs_path, file)) and file.endswith(".csv")
    ]

    # Load similar_values_output.csv
    similar_values_df = pd.read_csv(similar_values_path)

    if "Similar Values" not in similar_values_df.columns:
        raise ValueError("The 'Similar Values' column is not found in the similar_values_output file.")

    # Extract the values from the column and strip surrounding whitespace
    similar_values = similar_values_df["Similar Values"].str.strip().tolist()

    # Compare the two collections as sets (order and duplicates are ignored)
    file_names_set = set(file_names)
    similar_values_set = set(similar_values)

    if file_names_set == similar_values_set:
        print("The file names in 'inter_csvs' are equal to the rows in 'similar_values_output.csv'.")
    else:
        print("The file names in 'inter_csvs' do not match the rows in 'similar_values_output.csv'.")
        print("Files in 'inter_csvs' but not in 'similar_values_output.csv':")
        print(file_names_set - similar_values_set)
        print("Values in 'similar_values_output.csv' but not in 'inter_csvs':")
        print(similar_values_set - file_names_set)

except FileNotFoundError as e:
    print(e)
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Manipulating BCSS csv file to extract the patient ID and the "PFI.time.1" column
# (presumably the progression-free interval, as it is labelled in the later plotting cell)

filtered_brca_path = os.path.expanduser("~/Desktop/filtered_brca.csv")  # Path to the filtered_brca file
output_path = os.path.expanduser("~/Desktop/bcss-outcomes.csv")  # Path to save the new file

try:
    filtered_brca_df = pd.read_csv(filtered_brca_path)

    # Both columns must be present; the first absent one (in this order) is reported
    required_columns = ["bcr_patient_barcode", "PFI.time.1"]
    absent_columns = [name for name in required_columns if name not in filtered_brca_df.columns]
    if absent_columns:
        raise ValueError(f"The column '{absent_columns[0]}' is not found in the filtered_brca file.")

    # Keep just those two columns and write them out without the index
    extracted_df = filtered_brca_df.loc[:, required_columns]
    extracted_df.to_csv(output_path, index=False)
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print("New file 'bcss-outcomes.csv' saved to the Desktop.")

In [None]:
# For 1318 WSI
# Calculating coefficients of variability for cancer epithelium in each region of interest

# NOTE(review): this cell reads the images in filtered_rgb_dir and treats every
# non-zero value of their first channel as cancer epithelium. Confirm that this
# folder/channel really encodes the epithelium label (rather than, e.g., the mask
# images) before relying on the numbers.

# List of per-image result records
results = []

# Iterate through all .png files in the filtered_rgbs folder (sorted for a stable row order)
for image_file in sorted(os.listdir(filtered_rgb_dir)):
    if not image_file.endswith(".png"):
        continue

    image_path = os.path.join(filtered_rgb_dir, image_file)

    # convert("RGB") guarantees a (height, width, 3) array, so the channel
    # indexing below also works for grayscale or palette PNGs
    with Image.open(image_path) as img:
        image_array = np.array(img.convert("RGB"))

    # Non-zero values of the first channel
    first_channel = image_array[:, :, 0]
    epithelium_values = first_channel[first_channel > 0]

    # Coefficient of variation = standard deviation / mean of those values
    if epithelium_values.size > 0:
        coefficient_of_variation = np.std(epithelium_values) / np.mean(epithelium_values)
    else:
        coefficient_of_variation = np.nan  # Assign NaN if no non-zero value is present

    # Store results
    results.append({"File Name": image_file, "Coefficient of Variation": coefficient_of_variation})

# Save results to a CSV file
output_csv_path = os.path.expanduser("~/Desktop/cancer_epithelium_heterogeneity_all.csv")
results_df = pd.DataFrame(results)
results_df.to_csv(output_csv_path, index=False)

print(f"Results saved to {output_csv_path}.")

In [None]:
# Manipulating names to match with BCSS

# Per-image results in, trimmed-name results out
input_csv_path = os.path.expanduser("~/Desktop/cancer_epithelium_heterogeneity_all.csv")
output_csv_path = os.path.expanduser("~/Desktop/cancer_epithelium_heterogeneity_modified.csv")

try:
    heterogeneity_df = pd.read_csv(input_csv_path)

    if "File Name" not in heterogeneity_df.columns:
        raise ValueError("The 'File Name' column is not found in the input CSV file.")

    # Cut every "File Name" down to its first 12 characters
    heterogeneity_df["File Name"] = heterogeneity_df["File Name"].str.slice(stop=12)

    # Write the updated table out without the index
    heterogeneity_df.to_csv(output_csv_path, index=False)
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"File names modified and saved to {output_csv_path}.")

In [None]:
# For 118 patients
# Calculating mean coefficients of variability for cancer epithelium, grouping by BCSS patient ID

# Trimmed-name results in, one averaged row per name out
input_csv_path = os.path.expanduser("~/Desktop/cancer_epithelium_heterogeneity_modified.csv")
output_csv_path = os.path.expanduser("~/Desktop/cancer_epithelium_heterogeneity_combined.csv")

try:
    heterogeneity_df = pd.read_csv(input_csv_path)

    # Both columns are needed for the grouping below
    if not {"File Name", "Coefficient of Variation"}.issubset(heterogeneity_df.columns):
        raise ValueError("The required columns ('File Name' and 'Coefficient of Variation') are not found in the input CSV file.")

    # Average "Coefficient of Variation" over all rows sharing a "File Name"
    grouped = heterogeneity_df.groupby("File Name", as_index=False)
    combined_df = grouped["Coefficient of Variation"].mean()

    # Write the averaged table out without the index
    combined_df.to_csv(output_csv_path, index=False)
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"Combined results saved to {output_csv_path}.")

In [None]:
# Combining coefficients of variability from PanopTILs with PFI from BCSS

# Inputs: averaged heterogeneity per name, and the outcomes table
heterogeneity_combined_path = os.path.expanduser("~/Desktop/cancer_epithelium_heterogeneity_combined.csv")
bcss_outcomes_path = os.path.expanduser("~/Desktop/bcss-outcomes.csv")

# Output: the joined table used by the modelling cells
output_csv_path = os.path.expanduser("~/Desktop/ml-data.csv")

try:
    heterogeneity_df = pd.read_csv(heterogeneity_combined_path)
    bcss_outcomes_df = pd.read_csv(bcss_outcomes_path)

    required_columns_heterogeneity = ["File Name", "Coefficient of Variation"]
    required_columns_outcomes = ["bcr_patient_barcode", "PFI.time.1"]

    # Heterogeneity columns are checked first, then the outcome columns; the
    # first absent column found is the one reported
    for _frame, _wanted, _description in (
        (heterogeneity_df, required_columns_heterogeneity, "heterogeneity"),
        (bcss_outcomes_df, required_columns_outcomes, "outcomes"),
    ):
        _absent = next((name for name in _wanted if name not in _frame.columns), None)
        if _absent is not None:
            raise ValueError(f"The column '{_absent}' is not found in the {_description} file.")

    # Inner join: only barcodes that equal a "File Name" survive
    merged_df = bcss_outcomes_df.merge(
        heterogeneity_df,
        left_on="bcr_patient_barcode",
        right_on="File Name",
        how="inner",
    )

    # Keep the identifier, the outcome and the feature, then write without the index
    combined_df = merged_df.loc[:, ["bcr_patient_barcode", "PFI.time.1", "Coefficient of Variation"]]
    combined_df.to_csv(output_csv_path, index=False)
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"Combined data saved to {output_csv_path}.")

In [None]:
#Applying machine learning

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Load the dataset, dropping rows where the feature or the outcome is missing
# (the estimators cannot fit NaN features, and a NaN outcome has no class)
data_path = "~/Desktop/ml-data.csv"
data = pd.read_csv(data_path).dropna(subset=["Coefficient of Variation", "PFI.time.1"])

# Feature (independent variable) and target (dependent variable)
X = data[["Coefficient of Variation"]].values

# Binary target: 1 when PFI.time.1 is above the cohort median, otherwise 0.
# The median is computed once instead of once per row.
pfi_median = data["PFI.time.1"].median()
y = (data["PFI.time.1"] > pfi_median).astype(int).values

# Split the data into training (80%) and testing (20%) sets, keeping class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define models
models = {
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42)
}

# Hyperparameter grids for each model
param_grids = {
    "Support Vector Machine": {
        "C": [0.1, 1, 10, 100],
        "kernel": ["linear", "rbf"],
        "gamma": ["scale", "auto"]
    },
    "Logistic Regression": {
        "C": [0.1, 1, 10, 100],
        "solver": ["liblinear", "lbfgs"]
    }
}

# Exhaustive grid search with 5-fold cross-validation on the training set,
# scored by F1, to pick each model's hyperparameters
best_models = {}
best_f1_scores = {}
confusion_matrices = {}
test_predictions = {}

for model_name, model in models.items():
    print(f"Tuning hyperparameters for {model_name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring="f1",
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

    # Evaluate the tuned model on the held-out test set
    y_pred = grid_search.best_estimator_.predict(X_test)
    test_predictions[model_name] = y_pred
    f1 = f1_score(y_test, y_pred)
    best_f1_scores[model_name] = f1
    confusion_matrices[model_name] = confusion_matrix(y_test, y_pred)
    print(f"Best F1 Score for {model_name}: {f1:.4f}")

# Select the best model based on test-set F1 score.
# NOTE(review): choosing between models on the test set makes the reported score
# of the winner optimistic; consider comparing grid_search.best_score_ instead.
best_model_name = max(best_f1_scores, key=best_f1_scores.get)
print(f"\nBest Model: {best_model_name} with F1 Score: {best_f1_scores[best_model_name]:.4f}")

# Print confusion matrices for all models
for model_name, matrix in confusion_matrices.items():
    print(f"\nConfusion Matrix for {model_name}:")
    print(matrix)

# Final results (reuses the predictions computed above)
print("\nClassification Report for Best Model:")
print(classification_report(y_test, test_predictions[best_model_name]))

In [None]:
# Plotting survival curves

from lifelines import KaplanMeierFitter

# Load the dataset
data_path = "~/Desktop/ml-data.csv"
data = pd.read_csv(data_path)

# Check for required columns
if "Coefficient of Variation" not in data.columns or "PFI.time.1" not in data.columns:
    raise ValueError("Required columns are not present in the dataset.")

MAX_PFI_DAYS = 1500   # rows with a longer PFI.time.1 are excluded from the fit
PLOT_MAX_DAYS = 1000  # right-hand limit of the plotted time axis

# Exclude rows whose PFI.time.1 exceeds MAX_PFI_DAYS. The copy makes the result
# an independent frame, so adding "Risk Group" below does not write onto a
# slice of the loaded table (avoids SettingWithCopyWarning).
data = data[data["PFI.time.1"] <= MAX_PFI_DAYS].copy()

# Split at the median Coefficient of Variation: values up to and including the
# median are "Low", the rest "High"
threshold = data["Coefficient of Variation"].median()
data["Risk Group"] = np.where(data["Coefficient of Variation"] <= threshold, "Low", "High")

low_group = data[data["Risk Group"] == "Low"]
high_group = data[data["Risk Group"] == "High"]

# Create Kaplan-Meier fitter objects
kmf_low = KaplanMeierFitter()
kmf_high = KaplanMeierFitter()

# Fit survival curves for each group.
# Every row is passed as an observed event (no censoring), because the dataset
# loaded here carries no event-indicator column.
kmf_low.fit(durations=low_group["PFI.time.1"], event_observed=np.ones(len(low_group)), label="Low Coefficient of Variation of Cancer Epithelium")
kmf_high.fit(durations=high_group["PFI.time.1"], event_observed=np.ones(len(high_group)), label="High Coefficient of Variation of Cancer Epithelium")

# Plot Kaplan-Meier survival curves
plt.figure(figsize=(10, 6))
kmf_low.plot_survival_function(ci_show=False)
kmf_high.plot_survival_function(ci_show=False)

# Add plot details
plt.title("Kaplan-Meier Curves for Progression-Free Interval")
plt.xlabel("Progression-Free Interval in Days")
plt.ylabel("Survival Probability")
plt.xlim(0, PLOT_MAX_DAYS)
plt.grid(True)
plt.legend()
plt.show()