In [203]:
import pandas as pd
import json
from functools import reduce
from fixed_token_chunker import FixedTokenChunker
from typing import Callable

# Step 1

In [204]:
questions_df = pd.read_csv('../data/questions_df.csv')
chatlogs = open('../data/chatlogs.md', encoding='UTF-8').read()
# wikitexts = open('../data/wikitexts.md', encoding='UTF-8').read()
# state_of_the_union = open('../data/state_of_the_union.md', encoding='UTF-8').read()

In [205]:
questions_df.head()

Unnamed: 0,question,references,corpus_id
0,What significant regulatory changes and propos...,"[{""content"": ""My administration announced we\u...",state_of_the_union
1,What reasons did President Biden give for the ...,"[{""content"": ""But unfortunately, politics have...",state_of_the_union
2,How many people are no longer denied health in...,"[{""content"": ""Over 100 million of you can no l...",state_of_the_union
3,"Which country is Putin invading, causing chaos...","[{""content"": ""Overseas, Putin of Russia is on ...",state_of_the_union
4,When did the murder rate experience the sharpe...,"[{""content"": ""Last year, the murder rate saw t...",state_of_the_union


In [206]:
chatlogs_questions = questions_df[questions_df['corpus_id'] == 'chatlogs']
chatlogs_questions['references'] = chatlogs_questions['references'].apply(json.loads).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chatlogs_questions['references'] = chatlogs_questions['references'].apply(json.loads).copy()


In [207]:
chatlogs_questions['references'].iloc[0][0]

{'content': 'However, social media also has negative consequences that can impact personal relationships. One of the most significant concerns is the impact of social comparison, which can lead to feelings of envy, insecurity, and dissatisfaction',
 'start_index': 31798,
 'end_index': 32031}

# Step 2

In [208]:
chunker = FixedTokenChunker(chunk_size=100, chunk_overlap=0)
chunked_text = chunker.split_text(chatlogs)

In [209]:
# I took the function from the mentioned repo
def find_target_in_document(document, target):
    start_index = document.find(target)
    if start_index == -1:
        return None
    end_index = start_index + len(target)
    return start_index, end_index

In [210]:
chunks_location = []
for chunk in chunked_text:
   chunks_location.append(find_target_in_document(chatlogs, chunk))


# Step 3

In [211]:
def intersection(range1, range2):
    '''
    Returns the intersection of two ranges.
    
    :param range1: The first range (start, end).
    :type range1: tuple[int, int]
    :param range2: The second range (start, end).
    :type range2: tuple[int, int]
    :return: The intersection of the two ranges.
    :rtype: tuple[int, int] | None
    '''

    if range1 is None or range2 is None:
        return None

    start1, end1 = range1
    start2, end2 = range2
    
    return (min(max(start1, start2), end1), max(min(end1, end2), start1))

def left_diffrence(range1, range2):
    '''
    Returns the difference between two ranges.

    :param range1: The first range to diffrenciate (start, end).
    :type range1: tulpe[int, int]
    :param range2: The second range (start, end).
    :type range1: tuple[int, int]
    :return: The difference between the two ranges.
    :rtype: tuple[tulpe[int, int], tulpe[int, int]] | None
    '''

    if range1 is None or range2 is None:
        return [None]
    
    start1, end1 = range1
    start2, end2 = range2

    if end1 < start2 or start1 > end2:
        return [range1]
    
    if start2 < start1 and end2 > end1:
        return [None]
    
    if start1 <= start2 and end1 >= end2:
        return [(start1, start2), (end2, end1)]
    
    if end1 < end2:
        return [(start1, start2)]
    
    return [(end1, end2)]
    

def union(range1, range2):
    '''
    Returns the union of two ranges.

    :param range1: The first range (start, end).
    :type range1: tuple[int, int]
    :param range2: The second range (start, end).
    :type range2: tuple[int, int]
    :return: The union of the two ranges.
    :rtype: tuple[int, int] | None
    '''

    if range1 is None or range2 is None:
        return None

    start1, end1 = range1
    start2, end2 = range2
    
    return (min(start1, start2), max(end1, end2)) if start1 < end2 else None

def range_length(range):
    '''
    Returns the length of a range.

    :param range: The range (start, end).
    :type range: tuple[int, int] | list[tuple[int, int]] | None
    :return: The length of the range.
    :rtype: int
    '''
    return range[1] - range[0] if range else 0


def reduce_to_list(list1, list2):
    if list1 is None or list2 is None:
        return None
    
    return list1 + list2

In [216]:
metrics = {'precision': [], 'recall': [], 'f1': [], 'iou': []}

chunks_len = sum([range_length(r) for r in chunks_location])

for row in chatlogs_questions['references']:

    int_len = 0
    exc_len = 0
    unused_ranges = chunks_location.copy()

    # Chunks = intersection(chunks, reference) + diffrence(chunks, reference)
    # so: lenght(chunks) - lenght(unused_ranges) = lenght(intersection(chunks, reference))
    for reference in row:
        ref_range = reference['start_index'], reference['end_index']

        for chunk_range in chunks_location:
            
            intersection_range = intersection(chunk_range, ref_range)

            if intersection_range:
                unused_ranges = reduce(reduce_to_list, [left_diffrence(u_range, intersection_range) for u_range in unused_ranges])
                
                
        exc_len += range_length(ref_range)
    
    
    unused_len = sum([range_length(range) for range in unused_ranges])
    int_len = chunks_len - unused_len

    metrics['precision'].append(int_len / chunks_len)
    metrics['recall'].append(int_len / exc_len)
    metrics['iou'].append(int_len / (chunks_len + exc_len - int_len))
    metrics['f1'].append(2 * int_len / (chunks_len + exc_len))

In [241]:
mean_values = pd.DataFrame(metrics).mean()
std_values = pd.DataFrame(metrics).std()

formatted_stats = pd.DataFrame({
    metric: [f"{100*mean:.2f}±{100*std:.2f}"] 
    for metric, mean, std in zip(mean_values.index, mean_values, std_values)
})

formatted_stats

Unnamed: 0,precision,recall,f1,iou
0,2.68±2.00,280.94±173.54,5.29±3.89,2.76±2.10


# Step 4

In [213]:
# Code from HF

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


encoded_input = tokenizer(chunked_text, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

chunks_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

chunks_embeddings = F.normalize(chunks_embeddings, p=2, dim=1)




KeyboardInterrupt: 

In [None]:
chunks_embeddings 

tensor([[-0.0030,  0.0533,  0.0271,  ...,  0.0243, -0.0610,  0.0593],
        [-0.0168,  0.0698,  0.0162,  ...,  0.0198, -0.0452,  0.0696],
        [-0.0313,  0.0219,  0.0268,  ..., -0.0389, -0.0846,  0.0665],
        ...,
        [-0.0554, -0.0141,  0.0174,  ...,  0.0583, -0.0526,  0.0016],
        [ 0.0150,  0.0142,  0.0246,  ...,  0.1116, -0.0127,  0.0008],
        [ 0.0305,  0.0584,  0.0278,  ...,  0.0900, -0.0379, -0.0305]])

# Step 5

In [None]:
from dataclasses import dataclass
from typing import Union


@dataclass
class Chunker:
        split_text: Callable[[str], list]



@dataclass
class Embbeder:
    
    embed: Callable[[Union[str, list[str]]], torch.Tensor]

class HFEmbedder(Embbeder):
    def __init__(self):
        from transformers import AutoTokenizer, AutoModel

        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


    def embed(self, text):
        import torch.nn.functional as F
        import torch

        encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        chunks_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        return F.normalize(chunks_embeddings, p=2, dim=1)

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class RetriEval:
    

    def __init__(self, chunker: Chunker, embedding: Embbeder, retrived_chunks: int):
        self.chunker = chunker
        self.embedding = embedding
        self.retrived_chunks = retrived_chunks

    def _retrival(self, chunks, query):
        corpus_embeddings = self.embedding.embed(chunks)
        query_embeddings = self.embedding.embed(query)

        cos_similarities = torch.nn.functional.cosine_similarity(query_embeddings, corpus_embeddings, dim=-1)
        top_k_indices = torch.topk(cos_similarities, self.retrived_chunks).indices
        
        return top_k_indices.cpu().numpy()
    
    def find_target_in_document(self, document, target):
        start_index = document.find(target)
        if start_index == -1:
            return None
        end_index = start_index + len(target)
        return start_index, end_index

    def evaulate(self, corpus, questions_df):
        import json
        from operator import itemgetter

        questions_df['references'] = questions_df['references'].apply(json.loads)

        chunks = self.chunker.split_text(corpus)
        chunks_range = [self.find_target_in_document(corpus, chunk) for chunk in chunks]
        metrics = {'precision': [], 'recall': [], 'f1': [], 'iou': []}
        
        
        for row in questions_df.iterrows():
            query = row[1]['question']

            topk_indices = self._retrival(chunks, query)
            relevant_chunks_range = list(itemgetter(*topk_indices)(chunks_range))
            
            excerpts_ranges = [(excerpt['start_index'], excerpt['end_index']) for excerpt in row[1]['references']]

            int_len = 0
            exc_len = 0
            relevant_chunks_len = sum([self.range_length(r) for r in relevant_chunks_range])
            unused_ranges = relevant_chunks_range.copy()

            for excerpt_range in excerpts_ranges:

                for chunk_range in relevant_chunks_range:
                    
                    intersection_range = self.intersection(chunk_range, excerpt_range)

                    if intersection_range:
                        unused_ranges = reduce(self.reduce_to_list, [self.left_diffrence(u_range, intersection_range) for u_range in unused_ranges])
                        
                        
                exc_len += self.range_length(ref_range)
            
            unused_len = sum([self.range_length(range) for range in unused_ranges])
            int_len = relevant_chunks_len - unused_len

            metrics['precision'].append(int_len / relevant_chunks_len)
            metrics['recall'].append(int_len / exc_len)
            metrics['iou'].append(int_len / (relevant_chunks_len + exc_len - int_len))
            metrics['f1'].append(2 * int_len / (relevant_chunks_len + exc_len))
            
        metrics_df = pd.DataFrame(metrics).mean()
        metrics_df['corpus'] = questions_df['corpus_id'].iloc[0]
        metrics_df['retrived_chunks'] = self.retrived_chunks

        mean_values = pd.DataFrame(metrics).mean()
        std_values = pd.DataFrame(metrics).std()

        formatted_stats = pd.DataFrame({
            metric: [f"{mean:.2f}±{std:.2f}"] 
            for metric, mean, std in zip(mean_values.index, mean_values, std_values)
        })

        return formatted_stats
    
    def intersection(self, range1, range2):

        if range1 is None or range2 is None:
            return None

        start1, end1 = range1
        start2, end2 = range2
        
        return (min(max(start1, start2), end1), max(min(end1, end2), start1))

    def left_diffrence(self, range1, range2):

        if range1 is None or range2 is None:
            return [None]
        
        start1, end1 = range1
        start2, end2 = range2

        if end1 < start2 or start1 > end2:
            return [range1]
        
        if start2 < start1 and end2 > end1:
            return [None]
        
        if start1 <= start2 and end1 >= end2:
            return [(start1, start2), (end2, end1)]
        
        if end1 < end2:
            return [(start1, start2)]
        
        return [(end1, end2)]

    def range_length(self, range):

        return range[1] - range[0] if range else 0


    def reduce_to_list(self, list1, list2):
        if list1 is None or list2 is None:
            return None
        
        return list1 + list2
        


In [246]:
chunker = FixedTokenChunker(chunk_size=200, chunk_overlap=50)
embedder = HFEmbedder()
retrive_chunks = 5
eval = RetriEval(chunker, embedder, retrive_chunks)



In [247]:
chatlogs_questions = questions_df[questions_df['corpus_id'] == 'chatlogs']

metrics = eval.evaulate(chatlogs, chatlogs_questions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_df['references'] = questions_df['references'].apply(json.loads)


In [248]:
metrics

Unnamed: 0,precision,recall,f1,iou
0,29.77±20.26,341.43±289.90,53.48±35.82,44.86±35.53
