#### Data extraction

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the CSV file
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
file_path = '/Users/diyasharma/Documents/LabProject/features_filtered.csv'
data = pd.read_csv(file_path)
# Drop incomplete rows, then renumber the remaining rows 0..n-1.
# Without the reset, dropna() leaves gaps in the index, and any later
# label-based assignment into a freshly built frame (which has a plain
# 0..n-1 index) would pair rows with the wrong 'Marker' or with NaN.
data = data.dropna().reset_index(drop=True)

# 'Marker' is the target class column: keep it aside and exclude it from the
# feature matrix that is handed to UMAP.
features = data.drop(columns=['Marker'])
target = data['Marker']

features.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/diyasharma/Documents/LabProject/features_filtered.csv'

#### UMAP

In [None]:
import umap

# Run UMAP
# random_state pins the otherwise stochastic embedding so that the plots,
# clusters and percentages below are reproducible after Restart & Run All.
# (umap-learn runs single-threaded when a seed is set, so this can be slower.)
umap_model = umap.UMAP(n_neighbors=5, min_dist=0.0, random_state=42)
umap_embedding = umap_model.fit_transform(features)

# Add UMAP results to the dataframe
umap_df = pd.DataFrame(umap_embedding, columns=['UMAP1', 'UMAP2'])
# Assign the labels by position, not by index label: umap_df has a fresh
# 0..n-1 index, while `target` may still carry the gapped index left behind by
# dropna(). A label-aligned assignment would then shift markers or insert NaN.
umap_df['Marker'] = target.to_numpy()
umap_df.head()

# Save UMAP results to a new CSV file if needed
#output_file_path = '/Users/diyasharma/Documents/LabProject/umap_results.csv'
#umap_df.to_csv(output_file_path, index=False)


#### Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations

# Marker labels to visualise, in panel order
markers = ["GFP", "CCK", "PV", "SST", "NPY", "VIP"]

# One colour of the "bright" palette per marker, looked up by marker name
palette = sns.color_palette("bright", len(markers))
marker_color_map = dict(zip(markers, palette))

# Overview figure: every embedded point, coloured by its marker
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=umap_df,
    x='UMAP1',
    y='UMAP2',
    hue='Marker',
    palette=marker_color_map,
    s=5,
    edgecolor=None,
)
plt.title('UMAP Projection of All Markers', fontsize=16)
plt.legend(title='Marker')
plt.show()

# One panel per marker (2 x 3 grid): all points as a pale backdrop, with the
# points of the panel's marker drawn on top in that marker's colour
plt.figure(figsize=(15, 10))
for i, marker in enumerate(markers):
    plt.subplot(2, 3, i + 1)

    # Backdrop: the full embedding
    plt.scatter(umap_df['UMAP1'], umap_df['UMAP2'], c='wheat', s=1)

    # Foreground: only the rows labelled with this marker
    highlighted = umap_df.loc[umap_df['Marker'] == marker]
    plt.scatter(
        highlighted['UMAP1'],
        highlighted['UMAP2'],
        c=[marker_color_map[marker]],
        s=.5,
    )

    plt.gca().set_aspect('equal', 'datalim')
    plt.title(f'UMAP projection - {marker}', fontsize=10)

plt.tight_layout()
plt.show()

# Helper that draws a single marker-vs-marker panel
def plot_one_vs_one(marker1, marker2, ax):
    """Scatter the UMAP points of two markers on ``ax``, coloured per marker.

    Reads the notebook-level ``umap_df`` and ``marker_color_map``.

    Parameters
    ----------
    marker1, marker2 : str
        Values of the 'Marker' column to show; all other rows are left out.
    ax : matplotlib.axes.Axes
        Axes to draw on. It also receives the title and the legend.
    """
    pair = [marker1, marker2]
    pair_rows = umap_df[umap_df['Marker'].isin(pair)]
    pair_colors = {name: marker_color_map[name] for name in pair}

    sns.scatterplot(
        data=pair_rows,
        x='UMAP1',
        y='UMAP2',
        hue='Marker',
        palette=pair_colors,
        s=5,
        edgecolor=None,
        ax=ax,
    )
    ax.set_title(f'{marker1} vs {marker2}', fontsize=14)
    ax.legend(title='Marker')

# Create subplots for each pair of markers
# The grid is derived from the number of pairs instead of a fixed 5 x 3, so
# changing `markers` cannot overflow the axes array. For the six markers above
# this is still 15 pairs -> 5 rows x 3 columns at figsize (15, 20).
marker_pairs = list(combinations(markers, 2))
n_cols = 3
# Ceiling division; at least one row so plt.subplots stays valid with no pairs
n_rows = max(1, -(-len(marker_pairs) // n_cols))
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows))
axes = axes.flatten()

# Generate plots for each pair of markers
for ax, (marker1, marker2) in zip(axes, marker_pairs):
    plot_one_vs_one(marker1, marker2, ax)

# Hide any unused subplots (sliced by count, so this also works when the loop
# above never ran)
for ax in axes[len(marker_pairs):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


#### Clustering

In [None]:
from sklearn.cluster import HDBSCAN

# Build the HDBSCAN clusterer and fit it on the two embedding columns in one
# step (fit() returns the fitted estimator itself)
clusterer = HDBSCAN(min_cluster_size=500).fit(umap_df[['UMAP1', 'UMAP2']])

# Keep each point's cluster label next to its UMAP coordinates
umap_df['Cluster'] = clusterer.labels_

#### Visualization HDBSCAN

In [None]:
import numpy as np

# One "bright" colour per distinct cluster label (np.unique returns the labels sorted)
unique_clusters = np.unique(clusterer.labels_)
num_clusters = unique_clusters.size
palette = sns.color_palette("bright", num_clusters)

# If label -1 is present, draw that group in grey instead of a palette colour
for position, label in enumerate(unique_clusters):
    if label == -1:
        palette[position] = (0.7, 0.7, 0.7)

# Single scatter of the whole embedding, coloured by cluster label
plt.figure(figsize=(10, 8))
ax = sns.scatterplot(
    data=umap_df,
    x='UMAP1',
    y='UMAP2',
    hue='Cluster',
    palette=palette,
    s=3,
    legend='full',
)
ax.set_aspect('equal', 'datalim')
ax.set_title('UMAP projection with HDBSCAN clusters', fontsize=18)
# Anchor the legend outside the axes, to the right of the plot
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

#### Calculating Percentages per Cluster

In [None]:
from collections import defaultdict

def count_markers(df):
    """Count the rows of ``df`` for every cluster/marker combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'Cluster' and a 'Marker' column.

    Returns
    -------
    collections.defaultdict
        Maps the key ``"Cluster <cluster> - <marker>"`` to the number of rows
        with that combination. Combinations that never occur read as 0.
    """
    count_dict = defaultdict(int)

    # Walk the two columns directly instead of df.iterrows(): iterrows builds a
    # Series per row (slow on large frames) and does not preserve dtypes across
    # the row, so an integer cluster label can be upcast and end up formatted
    # as e.g. "1.0" in the key.
    for cluster, marker in zip(df['Cluster'], df['Marker']):
        count_dict[f"Cluster {cluster} - {marker}"] += 1

    return count_dict

def _cluster_sort_key(cluster_name):
    """Sort key ordering "Cluster <label>" names by numeric label; non-numeric labels go last."""
    label = cluster_name.split(" ", 1)[-1]
    try:
        value = float(label)
    except ValueError:
        return (1, 0.0, cluster_name)
    if value != value:  # NaN compares unequal to itself; treat it as non-numeric
        return (1, 0.0, cluster_name)
    return (0, value, cluster_name)


def create_summary_dataframe(count_dict, markers=('GFP', 'PV', 'CCK', 'SST', 'NPY', 'VIP')):
    """Build a per-cluster table of marker counts and percentages.

    Parameters
    ----------
    count_dict : dict
        Maps keys of the form ``"Cluster <cluster> - <marker>"`` to counts.
    markers : sequence of str, optional
        Markers to report, in column order. Counts stored under any other
        marker are ignored, including in 'Sum'.

    Returns
    -------
    pandas.DataFrame
        One row per cluster, with the columns 'Cluster', one '#<marker>' count
        column per marker, 'Sum' (total over the reported markers) and one
        '%<marker>' column per marker (share of 'Sum' in percent, rounded to
        two decimals). Rows are ordered by numeric cluster label, so
        "Cluster 2" comes before "Cluster 10". A cluster whose 'Sum' is 0 gets
        0.0 for every percentage instead of raising ZeroDivisionError.
    """
    markers = list(markers)

    # The cluster name is the part of each key before the " - " separator
    clusters = sorted(
        {key.split(" - ")[0] for key in count_dict},
        key=_cluster_sort_key,
    )

    summary = {'Cluster': clusters}

    # Absolute counts, defaulting to 0 for combinations that never occurred
    for marker in markers:
        summary[f'#{marker}'] = [
            count_dict.get(f'{cluster} - {marker}', 0) for cluster in clusters
        ]

    # Row total over the reported markers
    summary['Sum'] = [
        sum(summary[f'#{marker}'][i] for marker in markers)
        for i in range(len(clusters))
    ]

    # Share of each marker within its cluster, guarded against an empty total
    for marker in markers:
        summary[f'%{marker}'] = [
            round((count / total) * 100, 2) if total else 0.0
            for count, total in zip(summary[f'#{marker}'], summary['Sum'])
        ]

    return pd.DataFrame(summary)


In [None]:
# Build the per-cluster summary. The counts do not depend on row order, so
# umap_df is not sorted first (the previous sort_values() call discarded its
# result and had no effect).
percents = create_summary_dataframe(count_markers(umap_df))
# sort_values() returns a new frame, so the result has to be assigned. The
# 'Cluster' column holds strings such as "Cluster 10"; sort on the numeric
# label so that "Cluster 2" comes before "Cluster 10". Labels that are not
# numeric are coerced to NaN and placed last.
percents = percents.sort_values(
    by="Cluster",
    key=lambda names: pd.to_numeric(
        names.str.replace("Cluster ", "", regex=False), errors="coerce"
    ),
).reset_index(drop=True)
# Keep the cluster name plus every column that has '%' in its name
percentage_columns = ['Cluster'] + [col for col in percents.columns if '%' in col]

# Display the head of the DataFrame with only percentage columns
percents[percentage_columns].head()
#percents.to_csv('percents.csv')