In [1]:
import math
import pickle as pkl
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore")

# functions

In [None]:
def check_in_list(pop1, pop2, list_of_clusters):
    """Find the first cluster that contains either population.

    Parameters
    ----------
    pop1, pop2 : hashable
        The two population labels of a pairwise comparison.
    list_of_clusters : list of collections
        Clusters built so far; each one is tested with ``in``.

    Returns
    -------
    int
        Position of the first cluster holding ``pop1`` or ``pop2``,
        or ``-1`` when neither label appears anywhere. Only the first
        hit is reported, even if the labels sit in different clusters.
    """
    matching_positions = (
        position
        for position, members in enumerate(list_of_clusters)
        if pop1 in members or pop2 in members
    )
    return next(matching_positions, -1)

In [None]:
def add_single_clusters(pop, list_of_clusters):
    """Tell whether ``pop`` already belongs to one of the clusters.

    Despite its name, this function adds nothing: it is a pure
    membership check whose result the caller uses to decide whether a
    population still needs a cluster of its own.

    Parameters
    ----------
    pop : hashable
        Population label to look for.
    list_of_clusters : list of collections
        Clusters built so far; each one is tested with ``in``.

    Returns
    -------
    bool
        ``True`` if any cluster contains ``pop``, otherwise ``False``.
    """
    return any(pop in members for members in list_of_clusters)

In [None]:
def create_index_map(list_of_sets):
    """Map every member of every group to a 1-based group label.

    Parameters
    ----------
    list_of_sets : list of iterables
        Groups of IDs; the group at position ``k`` (0-based) is labelled
        ``"cluster{k+1}"``.

    Returns
    -------
    dict
        ``{member: "cluster<N>"}``. If a member appears in more than one
        group, the label of the last such group wins.
    """
    return {
        sid: f"cluster{number}"
        for number, set_of_ids in enumerate(list_of_sets, start=1)
        for sid in set_of_ids
    }

# clusters 
Read in the Louvain clusters.

In [None]:
# Load the Louvain cluster table and overwrite whatever header the file has
# with fixed column names. Later cells use "sid" as the ID column in the
# exported CSV and "name" as the cluster label that gets merged.
clusters = pd.read_csv("louvain_clusters.csv")
clusters.columns = [
    "sid",
    "l1",  # l1-l3: presumably the Louvain hierarchy levels -- TODO confirm
    "l2",
    "l3",
    "name",
]

# fst 
Read in the PLINK FST output.

In [None]:
# Tab-separated pairwise FST summary; the merge step below reads its
# "#POP1", "POP2" and "HUDSON_FST" columns.
fst = pd.read_csv(
    "../../fst/fst.fst.summary",
    sep="\t",
)

# merge 
Merge clusters whose pairwise Hudson FST falls below a chosen threshold.

In [None]:
fst_threshold = 0.001  # threshold of choice: pairs with Hudson FST strictly below it are merged
min_cluster_size = 30  # an unmerged cluster needs at least this many samples to be kept on its own
# NOTE(review): min_cluster_size is only applied to clusters that were NOT
# merged with anything; below-threshold pairs are merged regardless of size,
# even though small clusters can have unstable FST. Confirm this is intended.

# Number of samples per original cluster. Naming the axis and the counts
# explicitly gives "name"/"count" columns on every pandas version
# (a bare value_counts().reset_index() only labels them so from pandas 2.0 on).
cluster_sizes = (
    clusters["name"]
    .value_counts()
    .rename_axis("name")
    .reset_index(name="count")
)

# Build groups of populations linked by at least one below-threshold pair.
# The comparison is pairwise, so a new pair can bridge two groups that were
# created separately (cluster1-cluster2, cluster3-cluster4, then
# cluster2-cluster3). Every group touched by the pair is therefore folded
# into one, so no population ever ends up in two groups.
list_of_clusters = []
below_threshold = fst[fst["HUDSON_FST"] < fst_threshold]
for pop1, pop2 in zip(below_threshold["#POP1"], below_threshold["POP2"]):
    touched = [
        position
        for position, members in enumerate(list_of_clusters)
        if pop1 in members or pop2 in members
    ]

    if not touched:
        # neither population has been seen yet: start a new group
        list_of_clusters.append({pop1, pop2})
        continue

    # keep the earliest group and absorb the pair plus every other touched group
    merged = list_of_clusters[touched[0]]
    merged.update([pop1, pop2])
    for position in reversed(touched[1:]):  # pop from the back so positions stay valid
        merged |= list_of_clusters.pop(position)

# A cluster that is large enough but did not pass the FST threshold with any
# other cluster is kept as a group of its own.
already_grouped = set().union(*list_of_clusters)
for name, count in zip(cluster_sizes["name"], cluster_sizes["count"]):
    if count >= min_cluster_size and name not in already_grouped:
        list_of_clusters.append({name})


In [None]:
# Give each merged group a "cluster<N>" label and store it per sample.
# Names that are not a key of index_map are left unchanged by replace(),
# so clusters outside every group keep their original name here.
index_map = create_index_map(list_of_clusters)
clusters["merged_cluster"] = clusters["name"].replace(to_replace=index_map)

In [None]:
# Number of distinct labels after merging (displayed as the cell output).
clusters["merged_cluster"].nunique()

# finalize clusters 
For my analysis, I preferred clusters with at least 30 (or 50) individuals; the cell below keeps merged clusters with 30 or more.

In [None]:
min_cluster_size = 30  # keep merged clusters with at least this many samples

# Samples per merged cluster. The axis and the counts are named explicitly so
# the columns are "merged_cluster"/"count" on every pandas version; before
# pandas 2.0 a bare value_counts().reset_index() would instead put the counts
# in a column called "merged_cluster" and provide no "count" column at all.
new_cluster_counts = (
    clusters["merged_cluster"]
    .value_counts()
    .rename_axis("merged_cluster")
    .reset_index(name="count")
)

# Labels of the merged clusters that meet the size requirement.
is_large_enough = new_cluster_counts["count"] >= min_cluster_size
largest_clusters = new_cluster_counts.loc[is_large_enough, "merged_cluster"].values

In [None]:
# Keep only the samples whose merged cluster passed the size filter.
updated_clusters = clusters[clusters["merged_cluster"].isin(largest_clusters)]

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails with OSError).
output_path = Path("fst_merged/louvain_original_001.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
updated_clusters[["sid", "name", "merged_cluster"]].to_csv(output_path, index=False)