In [1]:
import inseq
from inseq.commands.attribute_context.attribute_context import attribute_context_with_model, AttributeContextArgs

# Load the Phi-3-mini instruct checkpoint through inseq with the "saliency" method.
inseq_model = inseq.load_model("microsoft/Phi-3-mini-4k-instruct", "saliency")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
import datasets
from datasets import load_dataset

# Load the training split of the saracandu/hotpotQA_nli dataset.
ds = load_dataset("saracandu/hotpotQA_nli", split="train")

In [3]:
# First exploratory run of the context-attribution pipeline on example 0 of the
# dataset; the results are saved to 'test.json'.
pecore_args = AttributeContextArgs(
    model_name_or_path="microsoft/Phi-3-mini-4k-instruct",
    attribution_method="saliency",
    attributed_fn="contrast_prob_diff",
    context_sensitivity_metric="kl_divergence",
    # parameters to be tuned as appropriate
    context_sensitivity_std_threshold=0,
    # NOTE(review): a negative top-k looks like a leftover from experimentation —
    # confirm how inseq interprets it.
    context_sensitivity_topk = -5,
    attribution_std_threshold=0,
    attribution_topk=12,
    input_current_text=ds['question'][0], # the question
    input_context_text=ds['passages'][0], # the WHOLE passage, not split into sentences
    contextless_input_current_text="""<|system|>
You are a helpful assistant that provide concise and accurate answers.<|end|>
<|user|>
{current}<|end|>
<|assistant|>""",
    input_template="""<|system|>
You are a helpful assistant that provide concise and accurate answers.<|end|>
<|user|>
{context}

{current}<|end|>
<|assistant|>""",
    contextless_output_current_text="""{current}""",
    output_template="{current}",
    special_tokens_to_keep=['<|system|>', '<|end|>', '<|assistant|>', '<|user|>'],
    decoder_input_output_separator="""
""",
    save_path='test.json',
    viz_path=None,
    generation_kwargs={'max_new_tokens': 50},
)

out = attribute_context_with_model(pecore_args, inseq_model) 
# output holding all the CTI scores -> but we are not interested in them

In [4]:
# load the tokenizer of the same model that PECoRe uses, so that token indices
# computed here are based on the same vocabulary

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 

In [5]:
# split the text of the first passage into a list of tokens

tokens = tokenizer.tokenize(ds['passages'][0])

In [6]:
tokens

['▁Arthur',
 "'",
 's',
 '▁Magazine',
 '▁(',
 '1',
 '8',
 '4',
 '4',
 '–',
 '1',
 '8',
 '4',
 '6',
 ')',
 '▁was',
 '▁an',
 '▁American',
 '▁literary',
 '▁period',
 'ical',
 '▁published',
 '▁in',
 '▁Philadelphia',
 '▁in',
 '▁the',
 '▁',
 '1',
 '9',
 'th',
 '▁century',
 '.',
 '<0x0A>',
 '▁Ed',
 'ited',
 '▁by',
 '▁T',
 '.',
 'S',
 '.',
 '▁Arthur',
 ',',
 '▁it',
 '▁featured',
 '▁work',
 '▁by',
 '▁Ed',
 'gar',
 '▁A',
 '.',
 '▁Po',
 'e',
 ',',
 '▁J',
 '.',
 'H',
 '.',
 '▁In',
 'gra',
 'ham',
 ',',
 '▁Sarah',
 '▁Jose',
 'pha',
 '▁H',
 'ale',
 ',',
 '▁Thomas',
 '▁G',
 '.',
 '▁Spe',
 'ar',
 ',',
 '▁and',
 '▁others',
 '.',
 '<0x0A>',
 '▁In',
 '▁May',
 '▁',
 '1',
 '8',
 '4',
 '6',
 '▁it',
 '▁was',
 '▁merged',
 '▁into',
 '▁"',
 'G',
 'ode',
 'y',
 "'",
 's',
 '▁Lady',
 "'",
 's',
 '▁Book',
 '".',
 '▁First',
 '▁for',
 '▁Women',
 '▁is',
 '▁a',
 '▁woman',
 "'",
 's',
 '▁magazine',
 '▁published',
 '▁by',
 '▁B',
 'auer',
 '▁Media',
 '▁Group',
 '▁in',
 '▁the',
 '▁USA',
 '.',
 '<0x0A>',
 '▁The',
 '▁magazi

In [7]:
# find every position of "\n" (the tokenizer emits it as the byte token "<0x0A>")

def find_positions(vector, target="<0x0A>"):
    """Return the indices of every element of `vector` that equals `target`.

    :param vector: iterable of tokens to scan
    :param target: value to look for; defaults to the newline byte token
    :return: list of matching indices in ascending order (empty if none match)
    """
    return [idx for idx, item in enumerate(vector) if item == target]

find_positions(tokens)

[32, 76, 118, 130, 144]

In [8]:
def split_text(text, separator="\n "):
    """Split `text` into segments on `separator`.

    The default separator is a newline followed by a single space, which is
    what the previous hard-coded implementation used (the old comment
    incorrectly described it as a period followed by a space).

    :param text: the string to split
    :param separator: delimiter between segments
    :return: list of segments, without the separator
    """
    return text.split(separator)

In [9]:
def map_relative_positions(abs_positions, text):
    """Pair each text segment with the token span that ends at a boundary token.

    :param abs_positions: token indices of the boundary tokens, in ascending order
    :param text: the passage; it is cut into segments with `split_text`
    :return: list of [segment, start_index, end_index] entries, one per boundary.
        end_index is the boundary index itself (inclusive); start_index is 0 for
        the first entry and one past the previous boundary afterwards. Text that
        follows the last boundary gets no entry.
    :raises IndexError: if `text` yields fewer segments than there are boundaries
    """
    # Split once up front; re-splitting the whole text on every iteration is
    # loop-invariant work.
    segments = split_text(text)

    spans = []
    start = 0
    for i, end in enumerate(abs_positions):
        spans.append([segments[i], start, end])
        # The next span begins right after this boundary token.
        start = end + 1

    return spans

map_relative_positions(find_positions(tokens), ds['passages'][0])

[["Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.",
  0,
  32],
 ['Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.',
  33,
  76],
 ['In May 1846 it was merged into "Godey\'s Lady\'s Book". First for Women is a woman\'s magazine published by Bauer Media Group in the USA.',
  77,
  118],
 ['The magazine was started in 1989.', 119, 130],
 ['It is based in Englewood Cliffs, New Jersey.', 131, 144]]

In [10]:
# compute how many tokens make up the top p% of this passage

int(0.05 * len(tokens)) #0.05 = 5%

8

In [11]:
# pass it as a parameter to MIRAGE

pecore_args = AttributeContextArgs(
    model_name_or_path="microsoft/Phi-3-mini-4k-instruct",
    attribution_method="saliency",
    attributed_fn="contrast_prob_diff",
    context_sensitivity_metric="kl_divergence",
    # parameters to be tuned as appropriate
    context_sensitivity_std_threshold=1,
    # 5% of the passage's token count, as computed in the previous cell
    context_sensitivity_topk = int(0.05 * len(tokens)),
    attribution_std_threshold=0,
    attribution_topk=2,
    input_current_text=ds['question'][0], # the question
    input_context_text=ds['passages'][0], # the WHOLE passage, not split into sentences
    contextless_input_current_text="""<|system|>
You are a helpful assistant that provide concise and accurate answers.<|end|>
<|user|>
{current}<|end|>
<|assistant|>""",
    input_template="""<|system|>
You are a helpful assistant that provide concise and accurate answers.<|end|>
<|user|>
{context}

{current}<|end|>
<|assistant|>""",
    contextless_output_current_text="""{current}""",
    output_template="{current}",
    special_tokens_to_keep=['<|system|>', '<|end|>', '<|assistant|>', '<|user|>'],
    decoder_input_output_separator="""
""",
    save_path='test.json',
    viz_path=None,
    generation_kwargs={'max_new_tokens': 50},
)

out = attribute_context_with_model(pecore_args, inseq_model) 
# output holding all the CTI scores -> but we are not interested in them

In [12]:
import json

# Read back the results that the attribution run above saved to 'test.json'.
with open('test.json', 'r') as file:
    data = json.load(file)

In [13]:
# Number of entries stored under 'cci_scores' in the saved results.
num_of_sets = len(data['cci_scores'])

In [14]:
num_of_sets

6

In [15]:
import numpy as np

def top_8_positions(input_context_scores, k=8):
    """Return the indices of the `k` highest scores, best first.

    :param input_context_scores: sequence (list or array) of numeric scores
    :param k: how many indices to return; defaults to 8, the value that was
        previously hard-coded
    :return: list of int indices ordered by descending score. Contains fewer
        than `k` entries when the input is shorter than `k`, and is empty when
        `k` is zero or negative.
    """
    if k <= 0:
        # A slice of [-0:] would select the whole array rather than nothing.
        return []

    scores = np.asarray(input_context_scores)

    # argsort is ascending, so the k best indices are the last k entries;
    # reverse them to put the highest score first.
    ranked = np.argsort(scores)
    return ranked[-k:][::-1].tolist()

In [16]:
# Collect, across every entry under 'cci_scores', the indices of its 8
# highest input-context scores.
top_scores = []

for i in range(num_of_sets):
    top_scores.extend(top_8_positions(data['cci_scores'][i]['input_context_scores']))
    
# De-duplicate: the same index can rank in the top 8 of several entries.
positions = list(set(top_scores))

In [17]:
sorted(positions)

[0,
 1,
 2,
 3,
 4,
 5,
 7,
 8,
 9,
 13,
 86,
 90,
 91,
 100,
 111,
 112,
 115,
 118,
 122,
 127,
 128]

In [18]:
map_relative_positions(find_positions(tokens), ds['passages'][0])

[["Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.",
  0,
  32],
 ['Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.',
  33,
  76],
 ['In May 1846 it was merged into "Godey\'s Lady\'s Book". First for Women is a woman\'s magazine published by Bauer Media Group in the USA.',
  77,
  118],
 ['The magazine was started in 1989.', 119, 130],
 ['It is based in Englewood Cliffs, New Jersey.', 131, 144]]

In [19]:
def estrai_frasi_con_token(frasipositioni, posizioni_token):
    """
    Select the sentences whose index range contains at least one relevant token position.

    :param frasipositioni: list of [sentence, start_index, end_index] entries (end inclusive)
    :param posizioni_token: list of positions of the relevant tokens
    :return: list of the sentences holding at least one relevant token, in input order
    """
    # Keep the positions in a set for fast membership tests
    rilevanti = set(posizioni_token)

    # A sentence qualifies as soon as its [start, end] range shares an index
    # with the relevant positions
    return [
        frase
        for frase, inizio, fine in frasipositioni
        if not rilevanti.isdisjoint(range(inizio, fine + 1))
    ]

In [20]:
estrai_frasi_con_token(map_relative_positions(find_positions(tokens), ds['passages'][0]), positions)

["Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.",
 'In May 1846 it was merged into "Godey\'s Lady\'s Book". First for Women is a woman\'s magazine published by Bauer Media Group in the USA.',
 'The magazine was started in 1989.']

In [21]:
def find_positions(vector, target="<0x0A>"):
    """Return the indices of every element of `vector` that equals `target`.

    :param vector: iterable of tokens to scan
    :param target: value to look for; defaults to the newline byte token
    :return: list of matching indices in ascending order (empty if none match)
    """
    return [idx for idx, item in enumerate(vector) if item == target]

In [22]:
# Token indices of the "<0x0A>" (newline) tokens in the example passage.
frontier_pos_tokens = find_positions(tokens)

In [23]:
def split_text(text, separator="\n "):
    """Split `text` into segments on `separator`.

    The default separator is a newline followed by a single space, which is
    the value that was previously hard-coded.

    :param text: the string to split
    :param separator: delimiter between segments
    :return: list of segments, without the separator
    """
    return text.split(separator)

In [24]:
def map_relative_positions(abs_positions, text):
    """Pair each text segment with the token span that ends at a boundary token.

    :param abs_positions: token indices of the boundary tokens, in ascending order
    :param text: the passage; it is cut into segments with `split_text`
    :return: list of [segment, start_index, end_index] entries, one per boundary.
        end_index is the boundary index itself (inclusive); start_index is 0 for
        the first entry and one past the previous boundary afterwards. Text that
        follows the last boundary gets no entry.
    :raises IndexError: if `text` yields fewer segments than there are boundaries
    """
    # Split once up front; re-splitting the whole text on every iteration is
    # loop-invariant work.
    segments = split_text(text)

    spans = []
    start = 0
    for i, end in enumerate(abs_positions):
        spans.append([segments[i], start, end])
        # The next span begins right after this boundary token.
        start = end + 1

    return spans

In [25]:
def find_top_p(text_passage, p):
    """Return how many tokens correspond to `p` percent of `text_passage`.

    The passage is tokenized with the notebook-level `tokenizer`; the result
    is truncated to an integer with `int()`.

    :param text_passage: the passage whose tokens are counted
    :param p: percentage, e.g. 5 for 5%
    :return: integer number of tokens
    """
    token_count = len(tokenizer.tokenize(text_passage))
    return int(p / 100 * token_count)

In [26]:
def top_k_positions(input_context_scores, k):
    """Return the indices of the `k` highest scores, best first.

    :param input_context_scores: sequence (list or array) of numeric scores
    :param k: how many indices to return
    :return: list of int indices ordered by descending score. Contains fewer
        than `k` entries when the input is shorter than `k`, and is empty when
        `k` is zero or negative.
    """
    if k <= 0:
        # Without this guard k == 0 slices [-0:], which is the WHOLE array,
        # so asking for zero positions used to return every position.
        return []

    scores = np.asarray(input_context_scores)

    # argsort is ascending, so the k best indices are the last k entries;
    # reverse them to put the highest score first.
    ranked = np.argsort(scores)
    return ranked[-k:][::-1].tolist()

In [27]:
def estrai_frasi_con_token(frasipositioni, posizioni_token):
    """
    Select the sentences whose index range contains at least one relevant token position.

    :param frasipositioni: list of [sentence, start_index, end_index] entries (end inclusive)
    :param posizioni_token: list of positions of the relevant tokens
    :return: list of the sentences holding at least one relevant token, in input order
    """
    # Keep the positions in a set for fast membership tests
    rilevanti = set(posizioni_token)

    # A sentence qualifies as soon as its [start, end] range shares an index
    # with the relevant positions
    return [
        frase
        for frase, inizio, fine in frasipositioni
        if not rilevanti.isdisjoint(range(inizio, fine + 1))
    ]

In [28]:
def invoke_pecore(text_passage, question, p, filename):
    """Run the inseq context-attribution pipeline for one (passage, question) pair.

    The results are written to `filename` (passed as `save_path`); nothing is
    returned.

    :param text_passage: the full context passage, not split into sentences
    :param question: the question, used as the current input text
    :param p: percentage of the passage's tokens, converted by `find_top_p`
        into the value passed as `context_sensitivity_topk`
    :param filename: path of the file the results are saved to
    """
    # Build the prompt templates from explicit "\n"-joined pieces. Written as
    # triple-quoted literals inside this indented body, every continuation line
    # picked up four leading spaces (and the separator became "\n    "), so the
    # prompt differed from the one used by the top-level cells of the notebook.
    prompt_header = (
        "<|system|>\n"
        "You are a helpful assistant that provide concise and accurate answers.<|end|>\n"
        "<|user|>\n"
    )
    prompt_footer = "{current}<|end|>\n<|assistant|>"

    pecore_args = AttributeContextArgs(
        model_name_or_path="microsoft/Phi-3-mini-4k-instruct",
        attribution_method="saliency",
        attributed_fn="contrast_prob_diff",
        context_sensitivity_metric="kl_divergence",
        context_sensitivity_std_threshold=1,
        context_sensitivity_topk=find_top_p(text_passage, p),
        attribution_std_threshold=None,
        attribution_topk=None,
        input_current_text=question,
        input_context_text=text_passage,
        contextless_input_current_text=prompt_header + prompt_footer,
        # Context, a blank line, then the question.
        input_template=prompt_header + "{context}\n\n" + prompt_footer,
        contextless_output_current_text="{current}",
        output_template="{current}",
        special_tokens_to_keep=['<|system|>', '<|end|>', '<|assistant|>', '<|user|>'],
        decoder_input_output_separator="\n",
        save_path=filename,
        viz_path=None,
        generation_kwargs={'max_new_tokens': 50},
    )

    # Called for its side effect of writing `filename`; the return value is discarded.
    attribute_context_with_model(pecore_args, inseq_model)

In [29]:
invoke_pecore(ds['passages'][0], ds['question'][0], 5, 'test3.json')

In [30]:
def select_passages(text_passage, question, p, filename):
    """Return the sentences of `text_passage` that contain a top-scoring context token.

    Reads the results saved in `filename`, takes the top-`p`-percent token
    positions of every entry under 'cci_scores', and keeps each sentence whose
    token span contains at least one of them.

    :param text_passage: the passage the saved scores were computed on
    :param question: unused; kept so the signature mirrors `invoke_pecore`
    :param p: percentage of the passage's tokens to take per entry
    :param filename: path of the JSON file holding the saved results
    :return: the selected sentences joined by newlines
    """
    with open(filename, 'r') as file:
        data = json.load(file)

    # Tokenize the passage that was actually passed in. Relying on the
    # notebook-level `tokens` (computed for passage 0) gave wrong sentence
    # boundaries for every other passage.
    # NOTE(review): assumes the indices of 'input_context_scores' line up with
    # tokenizer.tokenize(text_passage) — confirm against the inseq output.
    passage_tokens = tokenizer.tokenize(text_passage)

    # Loop-invariant: compute the per-entry budget once rather than
    # re-tokenizing the passage on every iteration.
    k = find_top_p(text_passage, p)

    # Union of the top-k positions across all entries.
    positions = set()
    for cci_entry in data['cci_scores']:
        positions.update(top_k_positions(cci_entry['input_context_scores'], k))

    sentence_spans = map_relative_positions(find_positions(passage_tokens), text_passage)
    return '\n'.join(estrai_frasi_con_token(sentence_spans, positions))

In [31]:
select_passages(ds['passages'][0], ds['question'][0], 5, 'test3.json')

'Arthur\'s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.\nIn May 1846 it was merged into "Godey\'s Lady\'s Book". First for Women is a woman\'s magazine published by Bauer Media Group in the USA.\nThe magazine was started in 1989.'