In [37]:
import inseq
from inseq.commands.attribute_context.attribute_context import attribute_context_with_model, AttributeContextArgs
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
import json
import numpy as np
import pandas as pd

##################################################################################

# Load the Phi-3-mini instruct checkpoint through inseq with the "saliency"
# attribution method. This module-level model is reused by invoke_pecore().
inseq_model = inseq.load_model(
    "microsoft/Phi-3-mini-4k-instruct",
    "saliency",
)

# Question/passage table (presumably HotpotQA "bridge" questions, judging by the
# file name -- TODO confirm provenance).
df = pd.read_csv("hotpot_bridge.csv")
# Keep only the rows at positions 120..449. The slice keeps the original row
# labels, which is why later cells address rows by label (e.g. 124).
df = df[120:450]

# Tokenizer of the same checkpoint; used by the helpers below to tokenize passages.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [38]:
# MISCELLANEOUS HELPER FUNCTIONS

def top_k_positions(input_context_scores, k):
    """Return the indices of the ``k`` highest scores, highest first.

    Args:
        input_context_scores: 1-D sequence (list, array, ...) of numeric scores.
        k: Number of positions to return. If ``k`` exceeds the number of
            scores, every index is returned; ``k <= 0`` yields an empty list.

    Returns:
        list[int]: Indices into ``input_context_scores`` ordered by descending
        score.
    """
    if k <= 0:
        # ``arr[-0:]`` is the whole array, so without this guard k == 0 would
        # return every index instead of none.
        return []

    scores = np.asarray(input_context_scores)

    # argsort is ascending: the last k entries are the k largest scores, and
    # reversing them puts the best position first.
    return np.argsort(scores)[-k:][::-1].tolist()

# -------------------------------------------------------------------------

def estrai_frasi_con_token(frasipositioni, posizioni_token):
    """Return the sentences whose token span contains a relevant token.

    Args:
        frasipositioni: Iterable of ``(sentence, start, end)`` triples, where
            ``start`` and ``end`` are inclusive token positions.
        posizioni_token: Iterable of relevant token positions.

    Returns:
        list: The sentences, in input order, for which at least one position
        in ``[start, end]`` appears in ``posizioni_token``.
    """
    # A set gives constant-time membership checks for the relevant positions.
    rilevanti = set(posizioni_token)

    return [
        frase
        for frase, inizio, fine in frasipositioni
        # The span is inclusive on both ends, hence ``fine + 1``.
        if not rilevanti.isdisjoint(range(inizio, fine + 1))
    ]

# -------------------------------------------------------------------------

def find_positions(vector, target="<0x0A>"):
    """Return every index at which ``target`` occurs in ``vector``.

    Args:
        vector: Sequence of items (the callers in this file pass tokenizer
            output).
        target: Item to look for; defaults to the ``"<0x0A>"`` token string.

    Returns:
        list[int]: Matching indices in ascending order (empty if none match).
    """
    return [index for index, item in enumerate(vector) if item == target]

# -------------------------------------------------------------------------

def map_relative_positions(abs_positions, text):
    """Pair each segment of ``text`` with the inclusive token range it covers.

    The i-th segment returned by ``split_text(text)`` is assumed to end at the
    i-th separator position in ``abs_positions``.

    Args:
        abs_positions: Ascending token indices of the separator tokens.
        text: The passage the tokens came from.

    Returns:
        list[list]: One ``[segment, start, end]`` entry per separator. The
        first segment starts at 0 and each later one starts right after the
        previous separator. Text after the last separator token gets no entry.

    Raises:
        IndexError: If there are more separator positions than text segments.
    """
    if len(abs_positions) == 0:
        return []

    # Split once up front instead of re-splitting the text on every iteration.
    segments = split_text(text)

    spans = []
    start = 0
    for i, end in enumerate(abs_positions):
        spans.append([segments[i], start, end])
        # The next segment begins on the token after this separator.
        start = end + 1

    return spans

# -------------------------------------------------------------------------

def select_passages(text_passage, question, p, out, tokens):
    """Return the passage sentences that contain the top-scoring context tokens.

    Args:
        text_passage: The context passage given to the model.
        question: Unused here; kept so the call signature stays the same.
        p: Number of top positions taken from each entry of ``out.cci_scores``.
            NOTE(review): callers pass the same ``p`` that ``find_top_p``
            treats as a percentage, but here it is an absolute count. Confirm
            that this is intended.
        out: Attribution output exposing ``cci_scores``, each entry of which
            has ``input_context_scores``.
        tokens: Tokenized ``text_passage``.

    Returns:
        str: The selected sentences joined with newlines.
    """
    # Union of the top-p context positions over every cci_scores entry.
    selected = set()
    for cci_entry in out.cci_scores:
        selected.update(top_k_positions(cci_entry.input_context_scores, p))
    positions = list(selected)

    # Locate the newline tokens, derive each sentence's token span, then keep
    # the sentences whose span contains a selected position.
    newline_positions = find_positions(tokens)
    sentence_spans = map_relative_positions(newline_positions, text_passage)
    return '\n'.join(estrai_frasi_con_token(sentence_spans, positions))
 
# -------------------------------------------------------------------------

def split_text(text):
    """Split ``text`` into segments on a newline followed by a single space.

    Returns:
        list[str]: The pieces between occurrences of ``"\\n "``. Text without
        that separator comes back as a one-element list.
    """
    separator = "\n "
    return text.split(separator)

# -------------------------------------------------------------------------

def invoke_pecore(text_passage, question, p):
    """Run inseq's context attribution for one question/passage pair.

    Builds an ``AttributeContextArgs`` configuration for the Phi-3-mini
    instruct checkpoint with saliency attribution and passes it, together with
    the module-level ``inseq_model``, to ``attribute_context_with_model``.

    Args:
        text_passage: Context passage (passed as ``input_context_text``).
        question: Question text (passed as ``input_current_text``).
        p: Percentage forwarded to ``find_top_p`` to derive
            ``context_sensitivity_topk``.

    Returns:
        Whatever ``attribute_context_with_model`` returns; ``select_passages``
        reads ``cci_scores`` from it.
    """
    pecore_args = AttributeContextArgs(
        # NOTE(review): the checkpoint name is repeated here independently of
        # the already-loaded `inseq_model`; keep the two in sync.
        model_name_or_path="microsoft/Phi-3-mini-4k-instruct",
        attribution_method="saliency",
        attributed_fn="contrast_prob_diff",
        context_sensitivity_metric="kl_divergence",
        context_sensitivity_std_threshold=1,
        # Derived from the passage length: p percent of its token count
        # (see find_top_p). Can be 0 for short passages -- TODO confirm that
        # inseq accepts that.
        context_sensitivity_topk = find_top_p(text_passage, p),
        attribution_std_threshold=None,
        attribution_topk=None,
        input_current_text=question, 
        input_context_text=text_passage,
        # The prompt templates below are runtime strings: their line breaks and
        # leading spaces are part of the text, so do not re-indent them.
        contextless_input_current_text="""<|system|>
    You are a helpful assistant that provide concise and accurate answers.<|end|>
    <|user|>
    {current}<|end|>
    <|assistant|>""",
        input_template="""<|system|>
    You are a helpful assistant that provide concise and accurate answers.<|end|>
    <|user|>
    {context}
    
    {current}<|end|>
    <|assistant|>""",
        contextless_output_current_text="""{current}""",
        output_template="{current}",
        special_tokens_to_keep=['<|system|>', '<|end|>', '<|assistant|>', '<|user|>'],
        # Separator literal is a newline followed by four spaces.
        decoder_input_output_separator="""
    """,
        # No files are written and no visualization is shown.
        save_path=None,
        viz_path=None,
        show_viz=False,
        generation_kwargs={'max_new_tokens': 50},
    )

    out = attribute_context_with_model(pecore_args, inseq_model) 
    return out

# -------------------------------------------------------------------------

def find_top_p(text_passage, p):
    """Convert a percentage of the passage's tokens into a token count.

    Args:
        text_passage: Text to tokenize with the module-level ``tokenizer``.
        p: Percentage (0-100) of the passage's tokens.

    Returns:
        int: ``p`` percent of the token count, truncated toward zero.
    """
    token_count = len(tokenizer.tokenize(text_passage))
    return int(p / 100 * token_count)

In [39]:
def run(question, passage, p):
    """Attribute the answer to ``question`` and return the relevant sentences.

    Tokenizes ``passage``, runs ``invoke_pecore`` on the pair, and hands both
    results to ``select_passages``.

    Args:
        question: Question text.
        passage: Context passage.
        p: Value forwarded unchanged to ``invoke_pecore`` and
            ``select_passages``.

    Returns:
        str: The selected sentences of ``passage`` joined with newlines.
    """
    passage_tokens = tokenizer.tokenize(passage)
    pecore_output = invoke_pecore(passage, question, p)
    return select_passages(
        text_passage=passage,
        question=question,
        p=p,
        out=pecore_output,
        tokens=passage_tokens,
    )


In [41]:
# Smoke test on the row labelled 124 (a label, not a position: df was sliced to rows 120..449) with p=5.
run(df['question'][124], df['selected_passages'][124], 5)

'The 12th Lumières Awards ceremony, presented by the Académie des Lumières, was held on 5 February 2007, at the Espace Pierre Cardin in Paris.\nThe ceremony was chaired by Isabelle Mergault. "\nTell No One" won the award for Best Film. Tell No One (French: "Ne le dis à personne" ) is a 2006 French thriller film directed by Guillaume Canet and based on the novel of the same name by Harlan Coben.'

In [1]:
import pandas as pd

# Load the bridge-question test set and preview it.
# NOTE(review): the `Unnamed: 0*` columns in the preview are presumably row
# indexes written by earlier `to_csv` calls -- confirm, then drop them.
df = pd.read_csv('ultramega-test-bridge.csv')
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,question,correct,alternative,type,level_y,selected_passages,new
0,0,0,0,The Oberoi family is part of a hotel company t...,Delhi,Mumbai,bridge,medium,The Oberoi family is an Indian family that is ...,The Oberoi family is an Indian family that is ...
1,1,1,1,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,President Abraham Lincoln,bridge,hard,"Allison Beth ""Allie"" Goertz (born March 2, 199...",She is co-host of Everything's Coming Up Podca...
2,2,2,2,What nationality was James Henry Miller's wife?,American,British,bridge,medium,James Henry Miller (25 January 1915 – 22 Octob...,James Henry Miller (25 January 1915 – 22 Octob...
3,3,3,3,Cadmium Chloride is slightly soluble in this c...,alcohol,Sodium Chloride,bridge,medium,Cadmium chloride is a white crystalline compou...,It is a hygroscopic solid that is highly solub...
4,4,4,4,Which genus of moth in the world's seventh-lar...,Crambidae,Lepidoptera,bridge,hard,"India, officially the Republic of India (""Bhār...",India's Andaman and Nicobar Islands share a ma...


In [2]:
# Drop every row with a missing value in any column. Surviving rows keep their
# original index labels, so the index may have gaps after this.
df = df.dropna()

In [3]:
# Preview the frame after dropping incomplete rows.
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,question,correct,alternative,type,level_y,selected_passages,new
0,0,0,0,The Oberoi family is part of a hotel company t...,Delhi,Mumbai,bridge,medium,The Oberoi family is an Indian family that is ...,The Oberoi family is an Indian family that is ...
1,1,1,1,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,President Abraham Lincoln,bridge,hard,"Allison Beth ""Allie"" Goertz (born March 2, 199...",She is co-host of Everything's Coming Up Podca...
2,2,2,2,What nationality was James Henry Miller's wife?,American,British,bridge,medium,James Henry Miller (25 January 1915 – 22 Octob...,James Henry Miller (25 January 1915 – 22 Octob...
3,3,3,3,Cadmium Chloride is slightly soluble in this c...,alcohol,Sodium Chloride,bridge,medium,Cadmium chloride is a white crystalline compou...,It is a hygroscopic solid that is highly solub...
4,4,4,4,Which genus of moth in the world's seventh-lar...,Crambidae,Lepidoptera,bridge,hard,"India, officially the Republic of India (""Bhār...",India's Andaman and Nicobar Islands share a ma...


In [6]:
# Pair each row's correct answer with its alternative as a two-item list.
# The columns are walked positionally with zip: after `dropna()` the index
# labels may have gaps, so looking rows up by 0..len(df)-1 as labels could
# raise KeyError or pair the wrong rows.
possibilities = [
    [correct, alternative]
    for correct, alternative in zip(df['correct'], df['alternative'])
]

df['options'] = possibilities

In [7]:
# Preview the frame with the new `options` column.
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,question,correct,alternative,type,level_y,selected_passages,new,options
0,0,0,0,The Oberoi family is part of a hotel company t...,Delhi,Mumbai,bridge,medium,The Oberoi family is an Indian family that is ...,The Oberoi family is an Indian family that is ...,"[Delhi, Mumbai ]"
1,1,1,1,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,President Abraham Lincoln,bridge,hard,"Allison Beth ""Allie"" Goertz (born March 2, 199...",She is co-host of Everything's Coming Up Podca...,"[President Richard Nixon, President Abraham L..."
2,2,2,2,What nationality was James Henry Miller's wife?,American,British,bridge,medium,James Henry Miller (25 January 1915 – 22 Octob...,James Henry Miller (25 January 1915 – 22 Octob...,"[American, British ]"
3,3,3,3,Cadmium Chloride is slightly soluble in this c...,alcohol,Sodium Chloride,bridge,medium,Cadmium chloride is a white crystalline compou...,It is a hygroscopic solid that is highly solub...,"[alcohol, Sodium Chloride ]"
4,4,4,4,Which genus of moth in the world's seventh-lar...,Crambidae,Lepidoptera,bridge,hard,"India, officially the Republic of India (""Bhār...",India's Andaman and Nicobar Islands share a ma...,"[Crambidae, Lepidoptera ]"


In [8]:
# Write the frame back over the file it was loaded from.
# index=False stops the row index from being saved as one more `Unnamed: 0*`
# column on every load/save round trip.
# Note: the list-valued `options` column is stored as its string form in CSV.
df.to_csv('ultramega-test-bridge.csv', index=False)

In [1]:
import pandas as pd

In [3]:
# Load the WikiHop test split and preview it. This rebinds `df`, replacing the
# bridge-question frame used in the cells above.
df = pd.read_csv('wikihop_dataset/test-wikihop.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,query_x,answer_x,options_x,sum_supports,supports
0,0,What event did Juan Rossell participate in?,1996 summer olympics,"['1996 summer olympics', 'olympic games', 'spo...","The 2004 Summer Olympic Games, held in Athens...","The 2004 Summer Olympic Games, officially know..."
1,1,What languages did John Osteen speak or write?,english,"['english', 'greek', 'koine greek', 'nahuatl',...",Christianity is a monotheistic religion based...,A Christian (or ) is a person who follows or a...
2,2,What is the parent taxon of Australosuchus?,crocodilia,"['animal', 'area', 'crocodile', 'crocodilia', ...",Mekosuchinae was a subfamily of crocodiles th...,Mekosuchinae was a subfamily of crocodiles fro...
3,3,What is the'record_label method man' associate...,loud records,"['1995', '1996', 'album', 'english', 'epic', '...","Wu-Tang Clan's debut album, ""Enter the Wu-Tan...",Enter the Wu-Tang (36 Chambers) is the debut s...
4,4,In which administrative territorial entity is ...,raipur district,"['bangladesh', 'bhopal', 'bhutan', 'canada', '...","Chhattisgarh, a state in central India, is ri...",A province is almost always an administrative ...


In [6]:
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a plain word count, not a count of model tokenizer tokens.
    """
    words = text.split()
    return len(words)

# Keep only rows whose `supports` text has fewer than 7500 whitespace-separated
# words (assumes every `supports` value is a string -- TODO confirm no NaNs).
filtered_df = df[df['supports'].apply(count_tokens) < 7500]

In [7]:
# Number of rows that passed the length filter.
len(filtered_df)

332

In [16]:
# NOTE(review): this saves `df`, not `filtered_df`. The execution counter jumps
# from 7 to 16, so `df` may have been rebound in cells not shown -- confirm
# which frame is meant to be written.
# NOTE(review): the row index is written too, which shows up as an extra
# `Unnamed: 0` column when the file is read back -- consider index=False.
df.to_csv('wikihop-pecore-gemma.csv')