# Fusion

Attempt to apply the approach used by [Nir Diamant](https://github.com/NirDiamant/RAG_Techniques/blob/main/all_rag_techniques/fusion_retrieval.ipynb) to the ragsc problem.

## Strategy

Consider each cluster as a "document". Using a random sample of the cluster data and associated embeddings, create a vector database
using FAISS or Chroma. At the same time, use Lucene to create a keyword index for the same "documents". Score each match on both semantic (vector) similarity and keyword (BM25) similarity, then combine the two scores to see whether this improves matching to clusters.

In [384]:
#
# import libraries
#
import pandas as pd
from pathlib import Path
from rank_bm25 import BM25Okapi
import numpy as np
from itertools import chain
from functools import partial
from typing import Union

In [385]:
#
# set constants
#
input_path = Path("../results")  # directory the input CSV is read from
output_path = Path("../results")  # same directory as input_path; not referenced in the cells shown here
training_fraction = 0.5  # fraction of rows sampled into the training set; the remaining rows form the test set

In [386]:
#
# read the master table (data plus embeddings) and split it into train/test
#
master_df = pd.read_csv(input_path / "ragsc_00_all_large.csv")
master_n_cells = len(master_df)

# random training sample; every row not drawn for training goes to the test set
train_df = master_df.sample(frac=training_fraction)
test_df = master_df.drop(index=train_df.index)
for message in (
    f"total rows: {master_df.shape[0]}",
    f"training set has {train_df.shape[0]} rows",
    f"test set has {test_df.shape[0]} rows",
):
    print(message)

total rows: 9370
training set has 4685 rows
test set has 4685 rows


In [387]:
# number of test rows in each cluster
for cluster_id, members in test_df.groupby('cluster'):
    print(cluster_id, len(members))

0 587
1 547
2 411
3 364
4 332
5 303
6 286
7 276
8 247
9 212
10 187
11 176
12 160
13 155
14 118
15 97
16 97
17 84
18 46


In [388]:
def get_gene_bags(df: pd.DataFrame, max_genes:int, sort_by_cluster_names=True) -> dict:
    """
    Builds one "bag of words" per cluster, for use as a document in BM25 analysis.

    Each signature in the 'signature' column is split on single spaces and
    only its first max_genes entries are kept; the entries of all signatures
    belonging to a cluster are concatenated in row order.

    Returns a dictionary mapping cluster name to its list of gene names.  Keys
    are in sorted order when sort_by_cluster_names is True, otherwise in
    order of first appearance in df.
    """
    bags = {}
    # groupby yields (cluster name, cluster dataframe) pairs
    for name, members in df.groupby("cluster", sort=False):
        # truncate every signature to max_genes names before pooling them
        truncated = (signature.split(" ")[:max_genes] for signature in members.signature)
        bags[name] = [gene for genes in truncated for gene in genes]
    if not sort_by_cluster_names:
        return bags
    return {name: bags[name] for name in sorted(bags)}


In [389]:
#
# create index from the cluster "documents" which are stored in word_dict
#
# bm25_index = BM25Okapi(word_dict.values())

In [390]:
#
# chunking
#
def chunk(s:Union[str,list], size:int, step=1) -> list[str]:
    """
    Takes a string or list of strings and creates a list of chunks of a given size.

    Args:
        s - a whitespace-separated string, or an already-split list of strings
        size - number of tokens joined (with a single space) into each chunk
        step - distance, in tokens, between the starts of consecutive chunks;
               the default of 1 produces a sliding window

    Returns one chunk per start position.  Chunks that start within `size`
    tokens of the end are shorter than `size` (they hold whatever is left).

    Raises ValueError if step is less than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    tokens = s.split() if isinstance(s, str) else s
    # slicing past the end simply returns the remaining tokens, so the
    # trailing short chunks need no special case
    return [" ".join(tokens[start:start + size]) for start in range(0, len(tokens), step)]
    
# chunks = chunk("this is a test of the splitter", 3)
# print(chunks)
# chunk with size fixed at 2: each chunk joins up to two consecutive tokens
chunk2 = partial(chunk, size=2)

In [391]:
# same binding as chunk2 (chunk with size fixed at 2); not referenced in the cells shown here
chunkn = partial(chunk, size=2)

In [392]:
# one bag of gene names per cluster, built from the training rows only
word_dict = get_gene_bags(train_df, max_genes=120)

#
# index the cluster "documents": each bag in word_dict is flattened to a
# single string and re-split by chunk2 before being handed to BM25
#
gene_bags = word_dict.values()
docs = [chunk2(" ".join(bag)) for bag in gene_bags]
bm25_index = BM25Okapi(docs)


In [393]:
def get_score(bm25, gene_list, max_genes=25, normalized=True) -> list[float]:
    """
    Returns the scores for a particular list of genes against a BM25 index.

    Args:
        bm25 - index object exposing get_scores(query)
        gene_list - the signature to score; it is split by chunk2 to form the query
        max_genes - maximum number of chunks kept for the query (the limit is
                    applied after chunking, so it counts chunks, not genes)
        normalized - when True, min-max rescale the scores to the range [0, 1]

    Returns the value of bm25.get_scores(query), rescaled when normalized is
    True (presumably a numpy array with one entry per indexed document -
    confirm against the index implementation).  When every score is identical
    the normalized result is all zeros rather than NaN.
    """
    query = chunk2(gene_list)[:max_genes]
    scores = bm25.get_scores(query)
    if normalized:
        lowest = np.min(scores)
        spread = np.max(scores) - lowest
        if spread == 0:
            # all documents scored the same (e.g. no query chunk matched):
            # dividing by the zero range would produce NaN for every entry
            return np.zeros_like(scores, dtype=float)
        scores = (scores - lowest) / spread
    return scores

In [394]:
def create_score_column(df:pd.DataFrame, bm25, max_genes) -> pd.DataFrame:
    """
    Score every signature in a dataframe against a BM25 index.

    The results are written to a 'scores' column on the dataframe that was
    passed in; no copy is made.

    Args:
        df - the dataframe whose 'signature' column will be scored
        bm25 - the index to use for comparison
        max_genes - forwarded to get_score as its max_genes limit

    Returns the same dataframe object, now carrying the 'scores' column
    """
    def score_signature(signature):
        return get_score(bm25, signature, max_genes)

    df['scores'] = df['signature'].apply(score_signature)
    return df

In [395]:
# adds a 'scores' column to test_df in place; df_test is the same object that was passed in
df_test = create_score_column(test_df, bm25_index, 120)

In [396]:
# spot-check one test row: its cluster, its score vector, and the score at its own cluster's position
n = 121
cluster = test_df["cluster"].iloc[n]
scores = test_df["scores"].iloc[n]
rating = scores[cluster]
print(cluster, scores, rating)

0 [1.         0.51558109 0.86709428 0.53305399 0.58166585 0.47954012
 0.23371676 0.31831665 0.36897762 0.46709019 0.28887357 0.
 0.29918718 0.24894921 0.05789937 0.22079566 0.22473557 0.00306533
 0.03085111] 1.0


In [397]:
row = 25
clusters = df_test.groupby('cluster')
for cluster_id, members in clusters:
    # comparison cluster: the id just below, or the id just above when cluster_id is not positive
    neighbour = cluster_id - 1 if cluster_id > 0 else cluster_id + 1
    # score vector of the row at position `row` within this cluster
    row_scores = members["scores"].iloc[row]
    print(f"{cluster_id:5} {row_scores[cluster_id]:8.2f} {row_scores[neighbour]:8.2f}")


    0     1.00     0.74
    1     0.91     0.74
    2     0.99     0.92
    3     1.00     0.65
    4     1.00     0.44
    5     1.00     0.19
    6     1.00     0.67
    7     1.00     0.47
    8     1.00     0.33
    9     1.00     0.21
   10     1.00     0.08
   11     1.00     0.37
   12     0.60     0.15
   13     1.00     0.17
   14     1.00     0.26
   15     0.92     0.13
   16     0.62     0.30
   17     0.17     0.39
   18     0.71     0.06


In [398]:
#
# explore the effect of sample size on mean score
#
print(test_df.shape[0])
print()
# NOTE: the former `sum=0` accumulator was never used and shadowed the builtin
# sum() for the rest of the session, so it has been dropped
n_clusters = 19
count=0
for cluster_no, cluster_df in test_df.groupby('cluster'):
    # score each row received at the position of its own cluster; kept as a
    # standalone Series because a column written onto a groupby sub-frame is
    # discarded and only triggers a chained-assignment warning
    predicted_score = cluster_df.scores.apply(lambda x: x[cluster_no])
    # assume score is normalized
    avg_score_for_cluster = predicted_score.sum() / cluster_df.shape[0]
    print(f"Cluster: {cluster_no:02}: {avg_score_for_cluster:8.3f} ({cluster_df.shape[0]:02})")

4685

Cluster: 00:    0.996 (587)
Cluster: 01:    0.956 (547)
Cluster: 02:    0.847 (411)
Cluster: 03:    0.880 (364)
Cluster: 04:    0.913 (332)
Cluster: 05:    0.967 (303)
Cluster: 06:    0.985 (286)
Cluster: 07:    0.755 (276)
Cluster: 08:    0.953 (247)
Cluster: 09:    0.993 (212)
Cluster: 10:    0.970 (187)
Cluster: 11:    0.994 (176)
Cluster: 12:    0.587 (160)
Cluster: 13:    0.796 (155)
Cluster: 14:    0.882 (118)
Cluster: 15:    0.840 (97)
Cluster: 16:    0.651 (97)
Cluster: 17:    0.382 (84)
Cluster: 18:    0.359 (46)


In [399]:
#
# training-set size of each cluster, to set against the per-cluster mean scores
#
for cluster_no, cluster_df in train_df.groupby('cluster'):
    n_rows = cluster_df.shape[0]
    print(f"Cluster: {cluster_no:02}:({n_rows:02})")

Cluster: 00:(645)
Cluster: 01:(527)
Cluster: 02:(380)
Cluster: 03:(363)
Cluster: 04:(363)
Cluster: 05:(312)
Cluster: 06:(285)
Cluster: 07:(265)
Cluster: 08:(244)
Cluster: 09:(194)
Cluster: 10:(211)
Cluster: 11:(192)
Cluster: 12:(154)
Cluster: 13:(138)
Cluster: 14:(102)
Cluster: 15:(104)
Cluster: 16:(102)
Cluster: 17:(64)
Cluster: 18:(40)
