In [1]:
import pandas as pd 
import pickle as pkl 
import numpy as np

In [2]:
def find_clusters_with_identical_hom_samples_as_df(data_dict):
    # Dictionary to store arrays of homozygous sample IDs
    hom_samples_dict = {}
    
    # Convert numpy arrays to tuples for hashability
    for cluster_id, variant_dict in data_dict.items():
        hom_samples = tuple(variant_dict['hom'])
        
        # Skip if there are fewer than 2 homozygous samples
        if len(hom_samples) < 2:
            continue
            
        if hom_samples in hom_samples_dict:
            hom_samples_dict[hom_samples].append(cluster_id)
        else:
            hom_samples_dict[hom_samples] = [cluster_id]
    
    # Filter for groups with more than one cluster (identical hom samples)
    identical_hom_clusters = {samples: clusters for samples, clusters in hom_samples_dict.items() 
                             if len(clusters) >= 1}
    
    # Create data for DataFrame
    rows = []
    for hom_samples, cluster_list in identical_hom_clusters.items():
        # Convert the tuple of samples back to a list
        samples_list = list(hom_samples)
        
        for cluster_id in cluster_list:
            rows.append({
                "cluster_id": cluster_id,
                "hom_samples": samples_list,
                "group_id": "_".join(samples_list),  # Create a group identifier based on samples
                "num_samples": len(samples_list)
            })
    
    # Create DataFrame
    if rows:
        return pd.DataFrame(rows)
    else:
        # Return empty DataFrame with columns if no matches
        return pd.DataFrame(columns=["cluster_id", "hom_samples", "group_id", "num_samples"])

In [3]:
df_list = []

for i in range(1, 23): 
    
    d = pkl.load(open(f"../icurl/libd/dr/dr_{i}_clique_ind.pkl", "rb"))
    df = find_clusters_with_identical_hom_samples_as_df(d) 
    
    df_list.append(df)

In [4]:
merged_df = pd.concat(df_list)

In [5]:
merged_df

Unnamed: 0,cluster_id,hom_samples,group_id,num_samples
0,c234221_1,"[21133038, 17362710]",21133038_17362710,2
1,c237814_1,"[21133038, 17362710]",21133038_17362710,2
2,c240465_1,"[21133038, 17362710]",21133038_17362710,2
3,c242498_1,"[21133038, 17362710]",21133038_17362710,2
4,c244850_1,"[21133038, 17362710]",21133038_17362710,2
...,...,...,...,...
27,c149603_22,"[27164320, 28703190]",27164320_28703190,2
28,c156939_22,"[27164320, 28703190]",27164320_28703190,2
29,c170242_22,"[27164320, 30625846, 32559105]",27164320_30625846_32559105,3
30,c186210_22,"[4375468, 4684522]",4375468_4684522,2


In [6]:
merged_df.to_csv("merged_dr_clusters.csv", index=False)