# Fusion

Attempt to apply the approach used by [Nir Diamant](https://github.com/NirDiamant/RAG_Techniques/blob/main/all_rag_techniques/fusion_retrieval.ipynb) to the ragsc problem.

## Strategy

Treat each cluster as a "document". Using a random sample of the cluster data and its associated embeddings, create a vector database
using FAISS or Chroma. In parallel, build a keyword index over the same "documents" (the plan was Lucene; this notebook uses the `rank_bm25` package). Score each query on both semantic (vector) and keyword (BM25) similarity, then combine the two scores to see whether that improves matching to clusters.

In [122]:
#
# import libraries
#
import pandas as pd
from pathlib import Path
from rank_bm25 import BM25Okapi
import numpy as np
from itertools import chain
from functools import partial

In [2]:
#
# notebook-wide configuration
#
# results are read from and written to the same directory
input_path, output_path = Path("../results"), Path("../results")
# base sampling fraction: the load cell draws twice this fraction of the rows
# for training and this fraction of the remaining rows for testing
training_fraction = 0.2

In [3]:
#
# load the data along with embeddings
#
master_df = pd.read_csv(input_path / "ragsc_00_all_large.csv")
master_n_cells = master_df.shape[0]

#
# draw disjoint train/test samples: training takes 2*training_fraction of all
# rows, testing takes training_fraction of the rows that are left.  The seed is
# fixed so the split -- and every score computed from it further down -- is
# the same after a kernel restart.
#
split_seed = 42
train_df = master_df.sample(frac=2*training_fraction, random_state=split_seed)
test_df = master_df.drop(train_df.index).sample(frac=training_fraction, random_state=split_seed)
print(f"training set has {train_df.shape[0]} rows")
print(f"test set has {test_df.shape[0]} rows")

training set has 3748 rows
test set has 1124 rows


In [238]:
#
# build one "document" per cluster: for every training row in the cluster,
# split its signature on single spaces and keep the first 120 tokens, then
# concatenate those token lists
#
clusters = train_df.groupby("cluster", sort=False)

word_dict = {}
for cluster_id, members in clusters:
    tokens = []
    for signature in members.signature:
        tokens.extend(signature.split(" ")[:120])
    word_dict[cluster_id] = tokens
#
# re-key in ascending cluster order so iteration order is deterministic
#
word_dict = dict(sorted(word_dict.items(), key=lambda item: item[0]))
    

In [192]:
#
# index the cluster "documents": one BM25 entry per word list in word_dict,
# in word_dict's iteration order
#
bm25_index = BM25Okapi(list(word_dict.values()))

In [193]:
#
# let's start with a simple query
#
n=150
q_sig = test_df.signature.iloc[n].split(" ")
cluster = test_df.cluster.iloc[n]
#
# the index was built from word_dict.values(), so a cluster's score sits at
# that cluster's position in word_dict; look the position up instead of
# assuming the cluster label doubles as the array index (raises ValueError if
# the cluster never made it into the training sample)
#
cluster_pos = list(word_dict).index(cluster)

#
# let's get a score
#
bm25_scores = bm25_index.get_scores(q_sig[:100])
#
# min-max normalize to [0, 1]; skipped when all clusters score the same,
# because the zero range would otherwise divide by zero
#
score_range = np.max(bm25_scores) - np.min(bm25_scores)
if score_range > 0:
    bm25_scores = (bm25_scores - np.min(bm25_scores)) / score_range
print(bm25_scores)
print("-----\n",cluster, bm25_scores[cluster_pos])


[0.86725646 0.83802347 1.         0.87957711 0.64294957 0.37730684
 0.06037929 0.66324782 0.15628418 0.15227508 0.05380354 0.09892914
 0.44558922 0.13895247 0.         0.00835163 0.0319021  0.03352001
 0.11006756]
-----
 1 0.8380234748505551


In [194]:
#
# chunking
#
def chunk(s:str, size:int, step:int=1) -> list[str]:
    """Split a string into overlapping, space-joined windows of words.

    The string is split on whitespace and a window of ``size`` words is taken
    at every ``step``-th word position.  Windows near the end of the string
    are shorter than ``size`` because they are clipped at the last word, so
    with ``step=1`` the result has one entry per word.

    Args:
        s: text to split.
        size: number of words per window.
        step: distance, in words, between the starts of consecutive windows.

    Returns:
        The windows in order of their start position; empty if ``s`` contains
        no words.

    Raises:
        ValueError: if ``step`` is less than 1.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step}")
    words = s.split()
    # slicing clips at the end of the list, which yields the shorter tail windows
    return [" ".join(words[start:start + size]) for start in range(0, len(words), step)]
    
# quick visual check of the chunker with 3-word windows
chunks = chunk(s="this is a test of the splitter", size=3)
print(chunks)


['this is a', 'is a test', 'a test of', 'test of the', 'of the splitter', 'the splitter', 'splitter']


In [195]:
# chunker with the window size pre-set to 2 words; used below to build both
# the BM25 documents and the queries
chunkn = partial(chunk, size=2)

In [196]:
# toy corpus: three short sentences, each turned into its list of 2-word chunks
docs = [
    chunkn(sentence)
    for sentence in (
        "this is a test",
        "oddly I have no hamburgers",
        "wish I were here",
    )
]
print(docs)

# throwaway index over the toy corpus, with the chunks acting as the tokens
bm_crap = BM25Okapi(docs)

[['this is', 'is a', 'a test', 'test'], ['oddly I', 'I have', 'have no', 'no hamburgers', 'hamburgers'], ['wish I', 'I were', 'were here', 'here']]


In [197]:
# score a query against the toy index; the only 2-word chunk it shares with
# the corpus is "is a", which occurs in the first document
query = chunkn("bob is a big boy now")
scores = bm_crap.get_scores(query)
# min-max normalization is left disabled here so the raw BM25 scores are printed
# scores = (scores - np.min(scores))/(np.max(scores)-np.min(scores))
print(scores)

[0.52914208 0.         0.        ]


In [239]:
#
# rebuild the cluster index on 2-word chunks instead of single words: each
# cluster's word list from word_dict is joined back into one string and then
# re-chunked with chunkn
#
docs = [chunkn(joined) for joined in map(" ".join, word_dict.values())]
bm25_index = BM25Okapi(docs)


In [240]:
#
# score one test signature against the 2-word-chunk index
#
n=20
query = chunkn(test_df.signature.iloc[n])[:25]
cluster = test_df.cluster.iloc[n]
#
# index entries follow word_dict order, so translate the cluster label into
# its position there rather than using the label itself as the array index
# (raises ValueError if the cluster is absent from the training sample)
#
cluster_pos = list(word_dict).index(cluster)

scores = bm25_index.get_scores(query)
print(len(query))
#
# min-max normalize to [0, 1] unless every cluster scored the same, in which
# case the zero range would divide by zero
#
score_range = np.max(scores) - np.min(scores)
if score_range > 0:
    scores = (scores - np.min(scores)) / score_range
print(cluster,scores, scores[cluster_pos])

25
2 [0.6634944  0.66244606 0.51192153 1.         0.73419402 0.
 0.         0.         0.         0.         0.254082   0.
 0.26351492 0.         0.         0.         0.         0.
 0.        ] 0.5119215320488495


In [245]:
#
# evaluate on the whole test set: for each test row, query the index with the
# first 100 2-word chunks of its signature, min-max normalize the per-cluster
# scores, and accumulate the normalized score of the row's true cluster
#
print(test_df.shape[0])
# index entries follow word_dict order; map each cluster label to its position
# instead of using the label itself as the array index
cluster_position = {cluster_id: pos for pos, cluster_id in enumerate(word_dict)}
n_queries = test_df.shape[0]
# named score_total / n_queries so the builtins sum() and max() stay usable in
# later cells
score_total = 0
count = 0
for signature, cluster in zip(test_df.signature, test_df.cluster):
    query = chunkn(signature)[:100]
    scores = bm25_index.get_scores(query)
    low, high = np.min(scores), np.max(scores)
    # a row is only scored when the scores can be normalized (non-zero range)
    # and its cluster is present in the index; unscored rows contribute zero
    # and are reported under "Zeros"
    if high > low and cluster in cluster_position:
        scores = (scores - low) / (high - low)
        score_total += scores[cluster_position[cluster]]
        count += 1
# mean normalized score of the true cluster over all test rows, as a percentage
percent = score_total / n_queries * 100.0 if n_queries else 0.0
print(f"Sum:{score_total} Percent:{percent}, Count:{count}, Zeros:{n_queries-count}")

1124
Sum:1003.9583017866413 Percent:89.32013361091114, Count:1124, Zeros:0
