In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os.path
import warnings
from natsort import natsorted

# Suppress FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter('ignore', FutureWarning)
# Disable the pandas chained-assignment warning (pandas' default setting is 'warn')
pd.set_option('mode.chained_assignment', None)

# Global plot styling: 'talk' context on a white figure background, then the 'ticks' axes style
sns.set(context='talk', style='white', font_scale=1, rc={'figure.facecolor': 'white'})
sns.set_style('ticks')

# functions

In [None]:
def count_atlas(values):
    """
    Number of distinct ATLAS-only individuals in a cluster.

    A row is treated as ATLAS when the first character of its ``split_id``
    is not alphabetic; the distinct ``split_id`` values among those rows are counted.

    Parameters
    ----------
    values : pd.DataFrame
        Rows of a single cluster; must contain a string column ``split_id``.

    Returns
    -------
    int
        Count of unique non-alphabetic-prefixed ``split_id`` values.
    """
    first_char = values["split_id"].str[0]
    starts_with_letter = first_char.str.isalpha()

    # keep only the ids whose leading character is NOT a letter
    atlas_ids = values.loc[~starts_with_letter, "split_id"]
    return atlas_ids.nunique()

In [None]:
def remove_common_demo(top_values):
    """
    Drop uninformative demographic labels from a list of (label, count) entries.

    Entries whose first element is one of the generic categories below are
    removed; the order of the remaining entries is preserved.

    Parameters
    ----------
    top_values : iterable of sequences
        Each entry's element 0 is the demographic label.

    Returns
    -------
    list
        The entries whose label is not a generic category.
    """
    # labels considered too common / non-specific to help distinguish clusters
    generic_categories = (
        "White or Caucasian",
        "Not Hispanic or Latino",
        "English",
        "None",
        "No Religious Preference",
        "Unknown",
        "*Unspecified",
        "Patient Refused",
    )

    informative = []
    for entry in top_values:
        if entry[0] in generic_categories:
            continue
        informative.append(entry)
    return informative

In [None]:
def count_demo(values, column, keep):
    """
    Count individuals per value of a demographic column and report the most common ones.

    Each individual (``sample_id``) is counted once. The ``keep`` most frequent
    values are selected and then passed through ``remove_common_demo``, so the
    result may contain fewer than ``keep`` entries (or none) when the most
    frequent values are generic categories.

    Parameters
    ----------
    values : pd.DataFrame
        Rows of a single cluster; must contain ``sample_id`` and ``column``.
    column : str
        Name of the demographic column to summarise.
    keep : int
        Maximum number of most-frequent values to consider.

    Returns
    -------
    list of (value, count) tuples
        Ordered by descending count, generic categories removed.
    """
    # one row per individual so nobody is counted twice
    per_person = values[["sample_id", column]].drop_duplicates(subset="sample_id")

    # missing values carry no demographic information
    value_counts = per_person[column].dropna().value_counts()

    # Always truncate to the `keep` most frequent values. Slicing past the end is
    # safe, so no length check is needed. (The previous check compared
    # value_counts.nunique() -- the number of distinct *frequencies* -- with
    # `keep`, which skipped the truncation whenever categories tied in count.)
    top_values = value_counts.iloc[:keep].items()

    return remove_common_demo(list(top_values))


In [None]:
def load_louvain(filename):
    """
    Load Louvain cluster assignments and derive the id columns used for annotation.

    The input is read as a header-less two-column CSV (sample id, cluster).

    Parameters
    ----------
    filename : str or file-like
        Passed directly to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Columns ``sample_id``, ``cluster``, ``split_id`` (third "_"-separated
        field of ``sample_id``, cast to str) and ``sampleid`` (``sample_id``
        with its first two characters removed).
    """
    assignments = pd.read_csv(filename, header=None, names=["sample_id", "cluster"])
    raw_ids = assignments["sample_id"]

    # expand=True gives one column per "_"-separated field; column 2 is the third field
    id_fields = raw_ids.str.split("_", expand=True)
    assignments["split_id"] = id_fields[2].astype(str)

    # strip the two-character prefix from the sample id
    assignments["sampleid"] = raw_ids.str[2:]

    return assignments


In [None]:
def load_annotations(communities_id, reference, self_report):
    """
    Attach reference labels and self-reported demographics to cluster assignments.

    Parameters
    ----------
    communities_id : pd.DataFrame
        Cluster assignments with ``sampleid`` and ``split_id`` columns
        (the columns ``load_louvain`` adds).
    reference : str or file-like
        Tab-delimited reference file; its columns are renamed to
        ID, Population, Region, Continent, Region2 (so it must have five columns).
    self_report : pd.DataFrame
        Demographics table with a ``Bank_sample_id`` column.

    Returns
    -------
    pd.DataFrame
        Cluster assignments joined with reference labels (on ``sampleid``) and
        with self-reported demographics (on ``split_id``).
    """
    reference_labels = pd.read_csv(reference, delimiter="\t")
    reference_labels.columns = ["ID", "Population", "Region", "Continent", "Region2"]

    # right join: every clustered sample is kept, whether or not it has a reference label
    with_reference = reference_labels.merge(
        communities_id,
        how="right",
        left_on="ID",
        right_on="sampleid",
    )

    # outer join: self-report rows without a clustered sample are kept as well
    return with_reference.merge(
        self_report,
        how="outer",
        left_on="split_id",
        right_on="Bank_sample_id",
    )

# run annotation pipeline

## Self-reported demographic characteristics

In [None]:
# Load the self-reported demographics. The last row of the file is discarded
# (presumably a footer/summary row -- confirm), then rows lacking a
# UniqueSampleID are dropped.
self_report = (
    pd.read_csv("../biobank_demographics_new2.csv", low_memory=False)
    .iloc[:-1, :]
    .dropna(subset=["UniqueSampleID"])
)
# String form of the integer sample id; load_annotations joins on this column
self_report["Bank_sample_id"] = self_report["UniqueSampleID"].astype(int).astype(str)

## reference data

In [None]:
# Path to the reference label file (presumably the tab-delimited file that
# load_annotations reads through its `reference` argument -- confirm)
reference_data = "../reference_labels"

## run for specific level of louvain clustering 

In [None]:
max_number_of_clusters = 10 # the max number of louvain clusters you have
num_demo_to_report = 5 # the number of demographic characteristics you want output

# Summarise every subcluster level whose assignment file exists in the working directory.
for i in range(max_number_of_clusters):

    fname = f"louvain_subcluster{i}.csv"

    if os.path.isfile(fname):

        communities = load_louvain(fname) # read in louvain file
        # annotate with reference data and self-report demographics
        # (the reference path lives in `reference_data`; a bare `reference` is not defined in this notebook)
        labeled_communities = load_annotations(communities, reference_data, self_report)

        # group once and reuse the grouping for every per-cluster statistic
        by_cluster = labeled_communities.groupby("cluster")

        # calculate overall cluster size and atlas only cluster size
        cluster_size = by_cluster["sample_id"].nunique().values
        atlas_size = by_cluster.apply(count_atlas).values

        # find the top demographic characteristics per cluster to generate a summary
        language = by_cluster.apply(count_demo, "PreferredLanguage", num_demo_to_report).values
        population = by_cluster.apply(count_demo, "Population", num_demo_to_report).values
        ethnicity = by_cluster.apply(count_demo, "Ethnicity", num_demo_to_report).values
        race = by_cluster.apply(count_demo, "FirstRace", num_demo_to_report).values
        religion = by_cluster.apply(count_demo, "SimpleReligion", num_demo_to_report).values

        # make a summary dataframe with the top characteristics and statistics per cluster
        # NOTE: the "cluster" column is a 1..n position in sorted group order,
        # not the original cluster label from the louvain file
        cluster_demo = pd.DataFrame([list(range(1, len(cluster_size)+1)), cluster_size, atlas_size, population, race, ethnicity, language, religion]).T
        cluster_demo.columns = ["cluster", "total size", "atlas size", "reference", "race", "ethnicity", "language", "religion"]
        cluster_demo["subcluster"] = i

        # output to csv in append mode so that every subcluster level ends up in the same file
        # NOTE: no header row is written, and re-running this cell appends the same rows again;
        # delete the summary file before a re-run
        cluster_demo.to_csv("louvain_cluster_demographics_summary.csv", mode="a", index=False, header=None)
