# Fusion

Attempt to apply the approach used by [Nir Diamant](https://github.com/NirDiamant/RAG_Techniques/blob/main/all_rag_techniques/fusion_retrieval.ipynb) to the ragsc problem.

## Strategy

Consider each cluster as a "document".  Using a random sample of the cluster data and associated embeddings, create a vector database
using FAISS or Chroma.  At the same time, use Lucene to create an index for the "documents".  Score matches on both semantic (vector) and keyword (BM25) and combine the scores to see if we can get more success matching to clusters.

In [1]:
#
# import libraries
#
import pandas as pd
from pathlib import Path
from rank_bm25 import BM25Okapi
import numpy as np
from itertools import chain
from functools import partial, reduce
from typing import Union
from loguru import logger

In [2]:
#
# set constants
#
input_path = Path("../results")
output_path = Path("../results")
training_fraction = 0.5

In [3]:
#
# load the data along with embeddings
#
master_df = pd.read_csv(input_path / Path("ragsc_00_all_large.csv"))
master_n_cells = master_df.shape[0]

train_df = master_df.sample(frac=training_fraction)
test_df= master_df.drop(train_df.index) #.sample(frac=training_fraction) 
print(f"total rows: {master_df.shape[0]}")
print(f"training set has {train_df.shape[0]} rows")
print(f"test set has {test_df.shape[0]} rows")

total rows: 9370
training set has 4685 rows
test set has 4685 rows


In [4]:
for cluster in test_df.groupby('cluster'):
    print(cluster[0], cluster[1].shape[0])

0 586
1 548
2 377
3 375
4 354
5 317
6 287
7 275
8 242
9 200
10 200
11 184
12 169
13 155
14 108
15 99
16 104
17 62
18 43


In [5]:
def get_gene_bags(df: pd.DataFrame, max_genes:int, sort_by_cluster_names=True) -> dict:
    """
    Produces "bags of words" for each cluster to use as documents in BM25 analysis.
    
    Returns a dictionary with cluster name as the keys and a list of gene names as the values.
    """
    clusters = df.groupby("cluster", sort=False)
    word_dict = {}
    for cluster in clusters:
        # each cluster is a tuple (cluster name, cluster dataframe)
        words = []
        cluster_df = cluster[1] # the dataframe
        # convert each signature into a list of string
        word_series = cluster_df.signature.apply(lambda x: x.split(" "))
        # create a bag of words based containing the gene names for this cluster
        for sig in word_series:
            # retain only max_genes gene names to add to the bag of words
            words.extend(sig[:max_genes]) 
        word_dict[cluster[0]] = words
    if sort_by_cluster_names:
        word_dict = {k: word_dict[k] for k in sorted(word_dict)}
    return word_dict


In [168]:
#
# chunking
#
def chunk(s:Union[str,list], size:int, step=1) -> list[str]:
    """
    Takes a string or list of strings and creates a list of overlapping chunks of a given size.

    Args
        size: The number of words (gene names) in each chunk.
        step: The number of words to advance before the next chunk (defaults to 1).
    Returns
        A list of strings representing the chunks.
    """
    if isinstance(s,str):
        a = s.split()
    else:
        a = s
    results = []
    max = len(a)
    for i in range(max):
        if i+size < max:
            results.append(" ".join(a[i:i+size]))
        else:
            results.append(" ".join(a[i:]))
        i += step
    return results
    
# chunks = chunk("this is a test of the splitter", 3)
# print(chunks)
chunk2 = partial(chunk, size=2,step=1)

In [170]:
word_dict = get_gene_bags(train_df,max_genes=120)

#
# create index from the cluster "documents" which are stored in word_dict
#
docs = [chunk2(" ".join(x)) for x in word_dict.values()]
bm25_index = BM25Okapi(docs)


In [171]:
def get_score(bm25, gene_list, max_genes=25, normalized=True) -> list[float]:
    """
    Returns a list containing the scores for a particular list of genes
    """
    query = chunk2(gene_list)[:max_genes]
    scores = bm25.get_scores(query)
    if normalized:
        scores = (scores - np.min(scores))/(np.max(scores)-np.min(scores))
    return scores

In [172]:
def create_score_column(df:pd.DataFrame, bm25, max_genes) -> pd.DataFrame:
    """
    Add a column to the provided dataframe containing the BM25 scores.

    Args:
        df - the dataframe whose signatures will be used to generate the scores
        bm25 - the index to use fo comparison
        max_genes - the maximum number of genes to include from each signature

    Returns a reference to the original dataframe
    """
    df['scores'] = df.signature.apply(lambda x: get_score(bm25, x, max_genes))
    return df

In [173]:
df_test = create_score_column(test_df, bm25_index, 120)

In [174]:
n=121
cluster = test_df.cluster.iloc[n]
scores = test_df.scores.iloc[n]
rating = scores[cluster]
print(cluster, scores, rating)

0 [1.         0.15734795 0.96343006 0.69476747 0.45294369 0.70264113
 0.24852422 0.25640868 0.56046134 0.46370086 0.29132395 0.
 0.03921402 0.         0.         0.4040511  0.         0.29088538
 0.        ] 1.0


In [175]:
row = 25
clusters = df_test.groupby('cluster')
for cluster in clusters:
    # print(cluster[1].shape)
    local_df = cluster[1]
    no = cluster[0]
    if no> 0:
        bad = no-1
    else:
        bad = no+1
    scores = local_df.scores.iloc[row]
    # print(scores)
    print(f"{no:5} {scores[no]:8.2f} {scores[bad]:8.2f}")


    0     0.87     0.74
    1     0.45     0.71
    2     0.37     0.47
    3     0.38     0.64
    4     0.79     0.00
    5     1.00     0.00
    6     1.00     0.56
    7     0.75     0.00
    8     0.52     0.36
    9     1.00     0.00
   10     0.28     0.00
   11     1.00     0.42
   12     0.00     0.00
   13     0.68     0.00
   14     0.00     0.00
   15     0.00     0.00
   16     0.40     0.64
   17     0.00     0.06
   18     0.00     0.00


In [176]:
#
# explore the effect of sample size on mean score
#
print(test_df.shape[0])
print()
sum=0
n_clusters = 19
count=0
# test_df['scores_sum'] = test_df.scores.apply(lambda x: np.sum(x))
for cluster in test_df.groupby('cluster'):
    cluster_no = cluster[0]
    cluster_df = cluster[1]
    cluster_df['predicted_score'] = cluster_df.scores.apply(lambda x: x[cluster_no])
    # assume score is normalized
    avg_score_for_cluster = cluster_df.predicted_score.sum() / cluster_df.shape[0]
    print(f"Cluster: {cluster_no:02}: {avg_score_for_cluster:8.3f} ({cluster_df.shape[0]:02})")

4685

Cluster: 00:    0.855 (586)
Cluster: 01:    0.568 (548)
Cluster: 02:    0.644 (377)
Cluster: 03:    0.756 (375)
Cluster: 04:    0.538 (354)
Cluster: 05:    0.848 (317)
Cluster: 06:    0.711 (287)
Cluster: 07:    0.486 (275)
Cluster: 08:    0.663 (242)
Cluster: 09:    0.774 (200)
Cluster: 10:    0.732 (200)
Cluster: 11:    0.773 (184)
Cluster: 12:    0.296 (169)
Cluster: 13:    0.391 (155)
Cluster: 14:    0.432 (108)
Cluster: 15:    0.482 (99)
Cluster: 16:    0.239 (104)
Cluster: 17:    0.188 (62)
Cluster: 18:    0.156 (43)


In [177]:
#
# explore the effect of training set size on mean score
#
for cluster in train_df.groupby('cluster'):
    cluster_no = cluster[0]
    cluster_df = cluster[1]
    print(f"Cluster: {cluster_no:02}:({cluster_df.shape[0]:02})")

Cluster: 00:(646)
Cluster: 01:(526)
Cluster: 02:(414)
Cluster: 03:(352)
Cluster: 04:(341)
Cluster: 05:(298)
Cluster: 06:(284)
Cluster: 07:(266)
Cluster: 08:(249)
Cluster: 09:(206)
Cluster: 10:(198)
Cluster: 11:(184)
Cluster: 12:(145)
Cluster: 13:(138)
Cluster: 14:(112)
Cluster: 15:(102)
Cluster: 16:(95)
Cluster: 17:(86)
Cluster: 18:(43)


In [157]:
#
# save intermediate results
#
train_df.to_csv("data/train.csv")
test_df.to_csv("data/test.csv")

## Vector database strategy

Each cluster is a text document.
Each cell signature is a sentence.
Need to chunk the cluster documents and restrict sentences to the highest expression genes. A reasonable cut point is 120 genes based on the BM25 analysis, which showed plateauing in the matches at around this number of "words".

In [17]:
#
# docs contains the chunked gene names by cluster
#

total = reduce(lambda x,y: x+y, [len(x) for x in docs],0)
print(total)

562200


In [18]:
#
# given that current form of docs is prohibitively large (n=562200 chunks),
# will use current embeddings as a first attempt
#

In [19]:
import chromadb
import json

In [20]:
def store_embeddings(
    collection: chromadb.Collection,
    df: pd.DataFrame,
    min_item=0,
    max_item=-1,
    embeddings_column: str = "embeddings",
    docs_column: str = "cluster",
) -> int:
    """
    Stores embeddings in the provided ChromaDB collection.

    Args
    collection: the collection to receive the data
    df : the dataframe from which the data is derived
    min_item: the minimum row number to use
    max_item: the maximum row number to use, defaults to -1 (all rows)
    embeddings_column: the column containing the embeddings, defaults to "embeddings"
    docs_column: the column containing the document name, defaults to "cluster"

    Rerturns the number of embeddings added to the database
    """
    if max_item == -1:
        max_item = df.shape[0]
    if max_item <= min_item:
        logger.error("max_item must be greater than min_item")
        return 0
    docs = [] # clusters
    embeds = [] # embeddings
    ids = [] # cell ids
    for i in range(min_item, max_item):
        docs.append(str(df[docs_column].iloc[i]))
        embeds.append(json.loads(df[embeddings_column].iloc[i]))
        ids.append(str(df.index[i]))
    try:
        collection.add(documents=docs, embeddings=embeds, ids=ids)
    except Exception as e:
        logger.error("unable to load data into database")
        logger.exception(e)
        return 0
    else:
        return max_item - min_item

In [21]:
def initialize_database(collection_name: str = "ragsc") -> chromadb.Collection:
    client = chromadb.Client()
    try:
        c = client.get_collection(collection_name)
        client.delete_collection(collection_name)
    except ValueError:
        pass
    c = client.create_collection(collection_name)
    return c

In [22]:
def setup_database(df: pd.DataFrame) -> chromadb.Collection:
    """
    creates an in memory ChromaDB collection  based on the data in the 
    provided dataframe.
    """
    collection = initialize_database()
    df = df[~df.signature.isnull()]  # clean any empty signatures
    store_embeddings(collection, df)
    return collection

In [23]:
def test_embeddings(embeddings:str, collection:chromadb.Collection, n_results=100):
    results = collection.query(
        query_embeddings=[json.loads(embeddings)],
        n_results=n_results,
        include=["documents","distances"])
    return results

In [24]:
#
# need to test results based on cluster orientation
#
def test_item(df: pd.DataFrame, row: int, collection: chromadb.Collection, n_results=100):
    # print(f"Original cluster: {df.cluster.iloc[row]}")
    results = collection.query(
        query_embeddings=[json.loads(df.embeddings.iloc[row])],
        n_results=n_results,
        include=["documents","distances"]
    )
    # print(results)
    # return zip(results['documents'],results['distances'])
    return results


In [25]:
#
# first attempt
#
coll = setup_database(train_df)
results = test_item(test_df,0,coll)


In [26]:
def distance_score(data: list[float], offset = 0.01)->float:
    if len(data) == 0:
        return 0 
    a = np.array(data)
    a = np.log10(1.0/(a+offset))
    return a.sum()
        

In [27]:
def distance_score_per_row(embeddings:str, coll, n_results=100, max_clusters=19):
    results = test_embeddings(embeddings,coll)
    pairs = list(zip(results['documents'][0],results['distances'][0]))
    table ={}
    for k in range(max_clusters): 
        table[k] = []
    for item in pairs:
        table[int(item[0])].append(item[1])
    scores = []
    for k in table:
        scores.append(distance_score(table[k]))
    return scores

In [28]:
def apply_distance_score(df: pd.DataFrame, coll: chromadb.Collection, n_results=100) -> None:
    """
    Calculates distance_score on a row-wise basis, storing the results
    in column "d_score" and normalized (x-min/max-min) in "n_score".
    
    This works in-place, modifying the input dataframe by adding two columns.
    """
    # first create columns of lists to store the results
    df['d_score'] = [[] for i in range(df.shape[0])]
    df['n_score'] = [[] for i in range(df.shape[0])]

    # now calculate the scores and normalized scores
    df.d_score = df.embeddings.apply(lambda x: distance_score_per_row(x, coll))
    df.n_score = df.d_score.apply(lambda x: (x-np.min(x))/(np.max(x) - np.min(x)))

In [29]:
#
# calculate the distance scores
#
apply_distance_score(test_df, coll)

In [30]:
test_df.head(2)

Unnamed: 0,cell_no,cluster,signature,cell_id,embeddings,scores,d_score,n_score
0,0,0,RPL11 CEP350 GNLY PTPN4 ZBTB20 SMARCA5 KIAA082...,TGACCAAGTAGACAAA,"[0.022202235, 0.0039805206, -0.006030237, 0.01...","[1.0, 0.5899505384390359, 0.7178018238587968, ...","[36.66195147104413, 0.919602741008284, 23.6148...","[1.0, 0.02508330037299277, 0.644125315373835, ..."
3,3,0,RIPOR2 ATXN1 HIBADH JAZF1 PDE3B ARID1A TUT4 TR...,AATCATCCAGTTTACG,"[0.027102128, 0.016445303, -0.009962541, 0.008...","[1.0, 0.7138399779046296, 0.5878873131841577, ...","[3.1389357931347757, 36.37367301229885, 10.385...","[0.0862969156860629, 1.0, 0.2855244271063499, ..."


In [31]:
#
# calculate the combined index
#
alpha = 0.5 # alpha is the proportion to assign to each score

df_test['overall'] = df_test.scores * alpha + (1 - alpha) * df_test.n_score

In [32]:
clusters = df_test.groupby('cluster')
for cluster in clusters:
    cluster_no = cluster[0]
    cluster_df = cluster[1]
    cluster_df['accuracy_score'] = cluster_df.overall.apply(lambda x: x[cluster_no])
    # assume score is normalized
    avg_score_for_cluster = cluster_df.accuracy_score.sum() / cluster_df.shape[0]
    print(f"Cluster: {cluster_no:02}: {avg_score_for_cluster:8.3f} ({cluster_df.shape[0]:02})")

Cluster: 00:    0.877 (586)
Cluster: 01:    0.973 (548)
Cluster: 02:    0.765 (377)
Cluster: 03:    0.771 (375)
Cluster: 04:    0.804 (354)
Cluster: 05:    0.826 (317)
Cluster: 06:    0.989 (287)
Cluster: 07:    0.585 (275)
Cluster: 08:    0.846 (242)
Cluster: 09:    0.972 (200)
Cluster: 10:    0.920 (200)
Cluster: 11:    0.938 (184)
Cluster: 12:    0.403 (169)
Cluster: 13:    0.739 (155)
Cluster: 14:    0.946 (108)
Cluster: 15:    0.762 (99)
Cluster: 16:    0.517 (104)
Cluster: 17:    0.376 (62)
Cluster: 18:    0.300 (43)


In [33]:
BM = 0
VECTOR = 1
BOTH = 2
def calculate_summary_stats(clusters_df: pd.DataFrame, method:int) -> dict[int,np.array]:
    clusters = clusters_df.groupby('cluster')
    table = {k:[] for k in range(df_test.cluster.max())}
    for cluster in clusters:
        cluster_no = cluster[0]
        cluster_df = cluster[1]
        row_count = cluster_df.shape[0]
        values = np.zeros(row_count,dtype=float)
        for row in range(row_count):
            n_score = cluster_df.n_score.iloc[row][cluster_no]
            m_score = cluster_df.scores.iloc[row][cluster_no]
            if method == BM:
                values[row] = m_score
            elif method == VECTOR:
                values[row] = n_score
            else:
                if n_score > m_score:
                    values[row] = n_score
                else:
                    values[row] = m_score
        table[cluster_no] = values
    return table


In [34]:
from scipy import stats
table_vector = calculate_summary_stats(df_test, VECTOR)
table_match = calculate_summary_stats(df_test, BM)
table_both = calculate_summary_stats(df_test, BOTH)
for cluster in table_vector:
    print(f"{cluster:02} {table_vector[cluster].mean():8.3f} {stats.sem(table_vector[cluster]):8.3f} ({table_vector[cluster].size})", end='')
    print(f"{cluster:02} {table_match[cluster].mean():8.3f} {stats.sem(table_match[cluster]):8.3f} ({table_match[cluster].size})", end='')
    print(f"{cluster:02} {table_both[cluster].mean():8.3f} {stats.sem(table_both[cluster]):8.3f} ({table_both[cluster].size})")
print("--" * 6)
print("Ratio of both to BM25")
for cluster in table_vector:
    print(f"{cluster:02} {((table_both[cluster].mean()/table_match[cluster].mean())-1) * 100:5.1f}")


00    0.759    0.014 (586)00    0.996    0.001 (586)00    0.998    0.001 (586)
01    0.997    0.002 (548)01    0.950    0.004 (548)01    0.999    0.000 (548)
02    0.658    0.016 (377)02    0.873    0.007 (377)02    0.919    0.006 (377)
03    0.662    0.017 (375)03    0.879    0.008 (375)03    0.932    0.006 (375)
04    0.707    0.015 (354)04    0.901    0.007 (354)04    0.918    0.007 (354)
05    0.685    0.021 (317)05    0.967    0.004 (317)05    0.978    0.004 (317)
06    0.992    0.004 (287)06    0.986    0.003 (287)06    0.998    0.001 (287)
07    0.400    0.017 (275)07    0.770    0.010 (275)07    0.809    0.010 (275)
08    0.746    0.018 (242)08    0.946    0.008 (242)08    0.966    0.006 (242)
09    0.951    0.013 (200)09    0.993    0.004 (200)09    0.994    0.004 (200)
10    0.874    0.016 (200)10    0.966    0.006 (200)10    0.983    0.004 (200)
11    0.896    0.019 (184)11    0.979    0.007 (184)11    0.985    0.006 (184)
12    0.221    0.012 (169)12    0.585    0.014 (169)

## rankings

Need to determine the rankings of scores for individual cells by cluster

In [35]:
def rank_scores( scores: dict[int, float] ) -> dict[int, float]:
    """
    Givn a dictionary of floats, which represent scores, with integer keys, which represent clusters, 
    return a dictionary with the ranking of each float,
    such that the items of the orginal disctionary are ordered from highest to lowest ranking.
    """
    # print("received an dictionary of length", len(scores))
    x = sorted(scores, key=lambda x: scores[x], reverse=True)
    ranks = list(range(len(scores)))
    # print(f"Scores:{scores}")
    # print(f"x:{x}")
    # print(f"Ranks:{ranks}")
    # print(f"Dict:{dict(zip(ranks,x))}")
    return dict(zip(ranks, x))



In [36]:
t={0:1,1:3,2:9,3:-1}
r=rank_scores(t)
result={0:[],1:[],2:[],3:[]}
for k in result:
    result[k].append(r[k])
result

{0: [2], 1: [1], 2: [0], 3: [3]}

In [37]:
t={0:5,1:3,2:9,3:-1}
r=rank_scores(t)
for k in result:
    result[k].append(r[k])
result

{0: [2, 2], 1: [1, 0], 2: [0, 1], 3: [3, 3]}

In [71]:
def create_ranking_dict(a: np.array) -> dict[int, float]:
    d = {}
    for i in range(len(a)):
        d[i] = a[i]
    return d

def sort_categories_by_values(categories: dict[int,float])->dict[int,float]:
    return dict(sorted(categories.items(), key=lambda item: item[1], reverse=True))
    
scores_df = test_df[["cluster","scores"]].copy()
scores_df['rank_dict'] = scores_df.scores.apply(lambda x: create_ranking_dict(x))
scores_df['ranked_dict'] = scores_df.rank_dict.apply(lambda x: sort_categories_by_values(x))
scores_df

Unnamed: 0,cluster,scores,rank_dict,ranked_dict
0,0,"[1.0, 0.5899505384390359, 0.7178018238587968, ...","{0: 1.0, 1: 0.5899505384390359, 2: 0.717801823...","{0: 1.0, 2: 0.7178018238587968, 7: 0.660227676..."
3,0,"[1.0, 0.7138399779046296, 0.5878873131841577, ...","{0: 1.0, 1: 0.7138399779046296, 2: 0.587887313...","{0: 1.0, 3: 0.7325097113987431, 1: 0.713839977..."
7,0,"[1.0, 0.7702480898652903, 0.7541705810199278, ...","{0: 1.0, 1: 0.7702480898652903, 2: 0.754170581...","{0: 1.0, 1: 0.7702480898652903, 2: 0.754170581..."
9,0,"[1.0, 0.8398823881416582, 0.911057325366115, 0...","{0: 1.0, 1: 0.8398823881416582, 2: 0.911057325...","{0: 1.0, 2: 0.911057325366115, 1: 0.8398823881..."
13,0,"[1.0, 0.4570903754579496, 0.4813372996722293, ...","{0: 1.0, 1: 0.4570903754579496, 2: 0.481337299...","{0: 1.0, 3: 0.5502696432774594, 7: 0.541632742..."
...,...,...,...,...
9356,18,"[0.8107541827718205, 0.668534413005574, 0.8409...","{0: 0.8107541827718205, 1: 0.668534413005574, ...","{9: 1.0, 3: 0.9812789465880402, 12: 0.84136008..."
9358,18,"[0.4151818853067396, 0.4735370261830864, 0.463...","{0: 0.4151818853067396, 1: 0.4735370261830864,...","{9: 1.0, 7: 0.7027526327876555, 8: 0.530790532..."
9362,18,"[0.55319222833124, 0.9212308952643732, 0.74075...","{0: 0.55319222833124, 1: 0.9212308952643732, 2...","{9: 1.0, 1: 0.9212308952643732, 2: 0.740751188..."
9367,18,"[0.4406033896961032, 0.49905886144795075, 0.48...","{0: 0.4406033896961032, 1: 0.49905886144795075...","{9: 1.0, 18: 0.8114921894190859, 16: 0.5878831..."


In [72]:
clusters = scores_df.groupby('cluster')
ranks = {}
for cluster in clusters:
    cluster_no = cluster[0]
    cluster_df = cluster[1]
    cluster_rank_total = 0
    # if cluster_no > 0:
        # break
    cluster_df['cluster_rank'] = cluster_df.ranked_dict.apply(lambda x: x[cluster_no])
    ranks[cluster_no] = cluster_df.cluster_rank.mean()

ranks
    
    

{0: 0.9955418580639813,
 1: 0.9496018413612536,
 2: 0.8725218082845725,
 3: 0.878967482367515,
 4: 0.9010000744138762,
 5: 0.9665182960469804,
 6: 0.9863364147250021,
 7: 0.7698907649900156,
 8: 0.9455935808301014,
 9: 0.993417848087002,
 10: 0.9660563602155413,
 11: 0.9793013893429433,
 12: 0.5848580024494654,
 13: 0.802920115081347,
 14: 0.8979174590266658,
 15: 0.8103387498342408,
 16: 0.5851992604461563,
 17: 0.4729533071389919,
 18: 0.41168321206677216}

In [93]:
scores_df = test_df[["cluster","n_score","scores"]].copy()
scores_df['max_score'] = scores_df.apply(lambda row: np.maximum(row['n_score'], row['scores']), axis=1)
scores_df['rank_dict'] = scores_df.max_score.apply(lambda x: create_ranking_dict(x))
scores_df['ranked_dict'] = scores_df.rank_dict.apply(lambda x: sort_categories_by_values(x))
scores_df

Unnamed: 0,cluster,n_score,scores,max_score,rank_dict,ranked_dict
0,0,"[1.0, 0.02508330037299277, 0.644125315373835, ...","[1.0, 0.5899505384390359, 0.7178018238587968, ...","[1.0, 0.5899505384390359, 0.7178018238587968, ...","{0: 1.0, 1: 0.5899505384390359, 2: 0.717801823...","{0: 1.0, 2: 0.7178018238587968, 7: 0.660227676..."
3,0,"[0.0862969156860629, 1.0, 0.2855244271063499, ...","[1.0, 0.7138399779046296, 0.5878873131841577, ...","[1.0, 1.0, 0.5878873131841577, 0.7325097113987...","{0: 1.0, 1: 1.0, 2: 0.5878873131841577, 3: 0.7...","{0: 1.0, 1: 1.0, 3: 0.7325097113987431, 5: 0.7..."
7,0,"[1.0, 0.2390495012663944, 0.309139184138737, 0...","[1.0, 0.7702480898652903, 0.7541705810199278, ...","[1.0, 0.7702480898652903, 0.7541705810199278, ...","{0: 1.0, 1: 0.7702480898652903, 2: 0.754170581...","{0: 1.0, 1: 0.7702480898652903, 2: 0.754170581..."
9,0,"[1.0, 0.0, 0.41935515404995183, 0.608017202907...","[1.0, 0.8398823881416582, 0.911057325366115, 0...","[1.0, 0.8398823881416582, 0.911057325366115, 0...","{0: 1.0, 1: 0.8398823881416582, 2: 0.911057325...","{0: 1.0, 2: 0.911057325366115, 1: 0.8398823881..."
13,0,"[0.1530411151107344, 0.6924163409636979, 0.772...","[1.0, 0.4570903754579496, 0.4813372996722293, ...","[1.0, 0.6924163409636979, 0.772080968257213, 1...","{0: 1.0, 1: 0.6924163409636979, 2: 0.772080968...","{0: 1.0, 3: 1.0, 2: 0.772080968257213, 1: 0.69..."
...,...,...,...,...,...,...
9356,18,"[0.0, 1.0, 0.22872164816940121, 0.028232366790...","[0.8107541827718205, 0.668534413005574, 0.8409...","[0.8107541827718205, 1.0, 0.8409176911235959, ...","{0: 0.8107541827718205, 1: 1.0, 2: 0.840917691...","{1: 1.0, 9: 1.0, 3: 0.9812789465880402, 12: 0...."
9358,18,"[0.0, 0.45849926814538045, 0.0, 0.0, 0.5261163...","[0.4151818853067396, 0.4735370261830864, 0.463...","[0.4151818853067396, 0.4735370261830864, 0.463...","{0: 0.4151818853067396, 1: 0.4735370261830864,...","{9: 1.0, 7: 0.7027526327876555, 16: 0.56691893..."
9362,18,"[0.0, 0.8417271844277981, 0.0, 0.0, 0.40321649...","[0.55319222833124, 0.9212308952643732, 0.74075...","[0.55319222833124, 0.9212308952643732, 0.74075...","{0: 0.55319222833124, 1: 0.9212308952643732, 2...","{9: 1.0, 1: 0.9212308952643732, 2: 0.740751188..."
9367,18,"[0.0, 1.0, 0.09962162318847063, 0.050075796234...","[0.4406033896961032, 0.49905886144795075, 0.48...","[0.4406033896961032, 1.0, 0.48995277969192114,...","{0: 0.4406033896961032, 1: 1.0, 2: 0.489952779...","{1: 1.0, 9: 1.0, 18: 0.8114921894190859, 16: 0..."


In [99]:
clusters = scores_df.groupby('cluster')
ranks = {}
for cluster in clusters:
    cluster_no = cluster[0]
    cluster_df = cluster[1]
    cluster_rank_total = 0
    cluster_df['cluster_rank'] = cluster_df.ranked_dict.apply(lambda x: x[cluster_no])
    ranks[cluster_no] = (cluster_df.cluster_rank.mean(), cluster_df.cluster_rank.std(), cluster_df.cluster_rank.count())

for rank in ranks.values():
    print(f"{rank[2]}: {rank[0]:8.3f} {rank[1]:8.3f}")
    

586:    0.998    0.017
548:    0.999    0.009
377:    0.919    0.109
375:    0.932    0.121
354:    0.918    0.127
317:    0.978    0.070
287:    0.998    0.023
275:    0.809    0.159
242:    0.966    0.091
200:    0.994    0.050
200:    0.983    0.052
184:    0.985    0.080
169:    0.598    0.182
155:    0.857    0.194
108:    0.995    0.049
99:    0.875    0.207
104:    0.652    0.276
62:    0.496    0.180
43:    0.430    0.198
