In [1]:
import inseq
from inseq.commands.attribute_context.attribute_context import attribute_context_with_model, AttributeContextArgs
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
import numpy as np
import pandas as pd
import torch
from transformers import BitsAndBytesConfig

##################################################################################

# Load the Phi-3 mini instruct checkpoint through inseq, configured with the
# "saliency" attribution method. invoke_pecore reuses this object.
inseq_model = inseq.load_model(
    "microsoft/Phi-3-mini-4k-instruct",
    "saliency",
)

# Dataset used by the cells below, which read its 'query' and 'supports' columns.
df = pd.read_csv("wikihop_comparison.csv")

# Tokenizer of the same checkpoint; the helpers use it to tokenize passages.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# MISCELLANEOUS HELPER FUNCTIONS

def top_k_positions(input_context_scores, k):
    """Return the indices of the ``k`` largest scores, highest score first.

    Args:
        input_context_scores: One-dimensional sequence (list, tuple or array)
            of numeric scores.
        k: Number of positions to return. A value larger than the number of
            scores simply returns every index.

    Returns:
        A list of integer indices into ``input_context_scores`` ordered by
        decreasing score. Empty when ``k <= 0`` or when there are no scores.
    """
    input_context_scores = np.asarray(input_context_scores)

    # The slice ``[-0:]`` selects the WHOLE array, so a request for zero (or a
    # negative number of) positions must be answered before slicing.
    if k <= 0 or input_context_scores.size == 0:
        return []

    # argsort is ascending, so the last k entries are the k largest scores.
    top_k_indices = np.argsort(input_context_scores)[-k:]

    # Reverse so that the highest-scoring index comes first.
    return top_k_indices[::-1].tolist()

# -------------------------------------------------------------------------

def estrai_frasi_con_token(frasipositioni, posizioni_token):
    """Keep the sentences whose token span contains a relevant token.

    Args:
        frasipositioni: Iterable of ``(sentence, start, end)`` triples, where
            ``start`` and ``end`` are inclusive token indices.
        posizioni_token: Iterable of relevant token positions.

    Returns:
        The sentences, in input order, whose ``[start, end]`` range includes
        at least one position from ``posizioni_token``.
    """
    # A set gives constant-time membership tests.
    relevant = set(posizioni_token)

    return [
        sentence
        for sentence, first, last in frasipositioni
        # ``last + 1`` because the end index is inclusive.
        if not relevant.isdisjoint(range(first, last + 1))
    ]

# -------------------------------------------------------------------------

def find_positions(vector, target="."):
    """Return the indices of the elements of ``vector`` equal to ``target``.

    Args:
        vector: Iterable to scan.
        target: Value to look for; elements are compared with ``==``, so an
            element that merely contains ``target`` does not match.

    Returns:
        A list of zero-based indices, in increasing order.
    """
    return [index for index, item in enumerate(vector) if item == target]

# -------------------------------------------------------------------------

def map_relative_positions(abs_positions, text):
    """Pair each piece of ``text`` with an inclusive token-index span.

    The i-th piece returned by ``split_text(text)`` is paired with the i-th
    entry of ``abs_positions``. The pairing is purely positional: nothing
    checks that the two segmentations describe the same boundaries.

    Args:
        abs_positions: Sequence of token indices, each used as the (inclusive)
            end of one span.
        text: Text handed to ``split_text`` to obtain the pieces.

    Returns:
        A list of ``[piece, start, end]`` lists. The first span starts at 0;
        every later span starts one past the previous end index.

    Raises:
        IndexError: If ``split_text(text)`` yields fewer pieces than there are
            entries in ``abs_positions``.
    """
    if len(abs_positions) == 0:
        return []

    # Split once up front instead of re-splitting the text on every iteration.
    sentences = split_text(text)

    spans = []
    start = 0
    for i, end in enumerate(abs_positions):
        spans.append([sentences[i], start, end])
        # The next span begins right after this boundary token.
        start = end + 1

    return spans

# -------------------------------------------------------------------------

def select_passages(text_passage, question, p, out, tokens):
    """Return the pieces of ``text_passage`` that contain top-scoring tokens.

    Args:
        text_passage: Passage text, segmented via ``map_relative_positions``.
        question: Not used by this function.
        p: Number of top-scoring positions taken from each entry of
            ``out.cci_scores`` (passed as ``k`` to ``top_k_positions``).
        out: Object exposing ``cci_scores``, whose entries each provide
            ``input_context_scores``.
        tokens: Token sequence scanned by ``find_positions`` for boundaries.

    Returns:
        The list produced by ``estrai_frasi_con_token`` for the collected
        positions.
    """
    # Union of the top positions over every scored entry.
    selected = set()
    for entry_index in range(len(out.cci_scores)):
        entry_scores = out.cci_scores[entry_index].input_context_scores
        selected.update(top_k_positions(entry_scores, p))

    boundaries = find_positions(tokens)
    sentence_spans = map_relative_positions(boundaries, text_passage)
    return estrai_frasi_con_token(sentence_spans, list(selected))
 
# -------------------------------------------------------------------------

def split_text(text):
    """Break ``text`` into pieces at every occurrence of ``". "``.

    The separator itself is dropped from the pieces.

    Args:
        text: String to split.

    Returns:
        The list of pieces, in order.
    """
    separator = ". "
    pieces = text.split(separator)
    return pieces

# -------------------------------------------------------------------------

def invoke_pecore(text_passage, question, p):
    """Run context attribution for ``question`` given ``text_passage``.

    Builds an ``AttributeContextArgs`` with the fixed settings below and passes
    it, together with the module-level ``inseq_model``, to
    ``attribute_context_with_model``.

    Args:
        text_passage: Used as ``input_context_text`` and to size
            ``context_sensitivity_topk`` through ``find_top_p``.
        question: Used as ``input_current_text``.
        p: Forwarded to ``find_top_p`` together with ``text_passage``.

    Returns:
        Whatever ``attribute_context_with_model`` returns.
    """
    pecore_args = AttributeContextArgs(
        model_name_or_path="microsoft/Phi-3-mini-4k-instruct",
        attribution_method="saliency",
        attributed_fn="contrast_prob_diff",
        context_sensitivity_metric="kl_divergence",
        context_sensitivity_std_threshold=1,
        # Top-k budget derived from the passage length and p.
        context_sensitivity_topk = find_top_p(text_passage, p),
        attribution_std_threshold=None,
        attribution_topk=None,
        input_current_text=question, 
        input_context_text=text_passage,
        # Prompt variant without the context passage. The leading spaces on the
        # continuation lines are part of the string literal.
        contextless_input_current_text="""<|system|>
    You are a helpful assistant that provide concise and accurate answers.<|end|>
    <|user|>
    {current}<|end|>
    <|assistant|>""",
        # Prompt variant with the context placed before the current text.
        input_template="""<|system|>
    You are a helpful assistant that provide concise and accurate answers.<|end|>
    <|user|>
    {context}
    
    {current}<|end|>
    <|assistant|>""",
        contextless_output_current_text="""{current}""",
        output_template="{current}",
        special_tokens_to_keep=['<|system|>', '<|end|>', '<|assistant|>', '<|user|>'],
        # The separator is a newline followed by four spaces.
        decoder_input_output_separator="""
    """,
        # No files are written and no visualization is shown.
        save_path=None,
        viz_path=None,
        show_viz=False,
        generation_kwargs={'max_new_tokens': 50},
    )

    out = attribute_context_with_model(pecore_args, inseq_model) 
    return out

# -------------------------------------------------------------------------

def find_top_p(text_passage, p):
    """Convert a percentage of the passage's tokens into a token count.

    Args:
        text_passage: Text tokenized with the module-level ``tokenizer``.
        p: Percentage (on a 0-100 scale) of the passage tokens to keep.

    Returns:
        ``int(p / 100 * number_of_tokens)``, or 2 when that value truncates to
        zero (short passage or very small ``p``).
    """
    tokens = tokenizer.tokenize(text_passage)
    num_topk_tokens = int(p / 100 * len(tokens))

    # Fall back to a small non-zero budget. The assignment must target the
    # same variable that is returned, otherwise the fallback has no effect.
    if num_topk_tokens == 0:
        num_topk_tokens = 2

    return num_topk_tokens
    
##################################################################################

def run(question, passage, p):
    """Tokenize ``passage``, attribute it against ``question`` and select pieces.

    Args:
        question: Passed to ``invoke_pecore`` and ``select_passages``.
        passage: Text that is tokenized, attributed and segmented.
        p: Forwarded unchanged to both ``invoke_pecore`` and
            ``select_passages``.
            NOTE(review): ``find_top_p`` divides this value by 100 while
            ``top_k_positions`` uses it as a count; confirm that sharing one
            value for both is intended.

    Returns:
        The result of ``select_passages``.
    """
    passage_tokens = tokenizer.tokenize(passage)
    attribution = invoke_pecore(text_passage=passage, question=question, p=p)
    return select_passages(
        text_passage=passage,
        question=question,
        p=p,
        out=attribution,
        tokens=passage_tokens,
    )

In [3]:
# Run the whole pipeline on the first row of df with p=5.
run(df['query'][0], df['supports'][0], 5)

[]

In [5]:
# Keep the attribution output for the first row so the following cells can inspect it.
out = invoke_pecore(df['supports'][0], df['query'][0], 5)

In [7]:
# Manual replay of the position-collection step of select_passages, with k fixed at 5.
num_of_sets = len(out.cci_scores)
    
top_scores = []
for i in range(num_of_sets):
    # Top 5 context positions for the i-th entry of out.cci_scores.
    top_scores.extend(top_k_positions(out.cci_scores[i].input_context_scores, 5))
# Drop duplicates; the resulting order is not meaningful.
positions = list(set(top_scores))

In [8]:
# Collected top positions, duplicates across entries included.
print(top_scores)

[441, 438, 440, 170, 437, 149, 134, 143, 133, 135, 474, 72, 438, 441, 473, 441, 438, 149, 72, 170, 149, 143, 145, 134, 72]


In [9]:
# The same positions after de-duplication.
print(positions)

[133, 134, 135, 72, 170, 473, 143, 145, 437, 438, 149, 440, 441, 474]


In [18]:
# Tokenize the first passage and display the [piece, start, end] triples that
# map_relative_positions builds from the boundary tokens found in it.
tokens = tokenizer.tokenize(df['supports'][0])
map_relative_positions(find_positions(tokens), df['supports'][0])

[['A Christian (or ) is a person who follows or adheres to Christianity, an Abrahamic, monotheistic religion based on the life and teachings of Jesus Christ',
  0,
  36],
 ['"Christian" derives from the Koine Greek word "Christós" (), a translation of the Biblical Hebrew term "mashiach"',
  37,
  92],
 ['Lakewood Church is a nondenominational charismatic Christian megachurch located in Houston, Texas',
  93,
  121],
 ['It is the largest congregation in the United States, averaging about 52,000 attendees per week',
  122,
  165],
 ['The 16,800-seat Lakewood Church Central Campus, home to four English-language services and two Spanish-language services per week, is located at the former Compaq Center',
  166,
  194],
 ['Joel Osteen is the senior pastor of Lakewood Church with his wife, Victoria, who serves as co-pastor',
  195,
  208],
 ['Lakewood Church is a part of the Word of Faith movement', 209, 235],
 ['Mexico (, modern Nahuatl ), officially the United Mexican States, is a federal 

In [16]:
# Inspect the raw tokens. In the output some sentence-final periods are fused
# with other characters (e.g. '".'), which an exact comparison against "." in
# find_positions does not match.
tokens

['▁A',
 '▁Christian',
 '▁(',
 'or',
 '▁)',
 '▁is',
 '▁a',
 '▁person',
 '▁who',
 '▁follows',
 '▁or',
 '▁ad',
 'her',
 'es',
 '▁to',
 '▁Christian',
 'ity',
 ',',
 '▁an',
 '▁Abraham',
 'ic',
 ',',
 '▁mon',
 'othe',
 'istic',
 '▁religion',
 '▁based',
 '▁on',
 '▁the',
 '▁life',
 '▁and',
 '▁teach',
 'ings',
 '▁of',
 '▁Jesus',
 '▁Christ',
 '.',
 '▁"',
 'Christ',
 'ian',
 '"',
 '▁der',
 'ives',
 '▁from',
 '▁the',
 '▁Ko',
 'ine',
 '▁Greek',
 '▁word',
 '▁"',
 'Christ',
 'ós',
 '"',
 '▁(',
 '),',
 '▁a',
 '▁translation',
 '▁of',
 '▁the',
 '▁Bib',
 'lic',
 'al',
 '▁Heb',
 'rew',
 '▁term',
 '▁"',
 'm',
 'ash',
 'ia',
 'ch',
 '".',
 '▁Lake',
 'wood',
 '▁Church',
 '▁is',
 '▁a',
 '▁n',
 'onden',
 'omin',
 'ational',
 '▁char',
 'ism',
 'atic',
 '▁Christian',
 '▁meg',
 'ach',
 'urch',
 '▁located',
 '▁in',
 '▁Houston',
 ',',
 '▁Texas',
 '.',
 '▁It',
 '▁is',
 '▁the',
 '▁largest',
 '▁con',
 'greg',
 'ation',
 '▁in',
 '▁the',
 '▁United',
 '▁States',
 ',',
 '▁aver',
 'aging',
 '▁about',
 '▁',
 '5',
 '2',
 ','

In [None]:
# MISCELLANEOUS HELPER FUNCTIONS

def top_k_positions(input_context_scores, k):
    """Return the indices of the ``k`` largest scores, highest score first.

    Args:
        input_context_scores: One-dimensional sequence (list, tuple or array)
            of numeric scores.
        k: Number of positions to return. A value larger than the number of
            scores simply returns every index.

    Returns:
        A list of integer indices into ``input_context_scores`` ordered by
        decreasing score. Empty when ``k <= 0`` or when there are no scores.
    """
    input_context_scores = np.asarray(input_context_scores)

    # The slice ``[-0:]`` selects the WHOLE array, so a request for zero (or a
    # negative number of) positions must be answered before slicing.
    if k <= 0 or input_context_scores.size == 0:
        return []

    # argsort is ascending, so the last k entries are the k largest scores.
    top_k_indices = np.argsort(input_context_scores)[-k:]

    # Reverse so that the highest-scoring index comes first.
    return top_k_indices[::-1].tolist()

# -------------------------------------------------------------------------

def estrai_frasi_con_token(frasipositioni, posizioni_token):
    """Keep the sentences whose token span contains a relevant token.

    Args:
        frasipositioni: Iterable of ``(sentence, start, end)`` triples, where
            ``start`` and ``end`` are inclusive token indices.
        posizioni_token: Iterable of relevant token positions.

    Returns:
        The sentences, in input order, whose ``[start, end]`` range includes
        at least one position from ``posizioni_token``.
    """
    # A set gives constant-time membership tests.
    relevant = set(posizioni_token)

    return [
        sentence
        for sentence, first, last in frasipositioni
        # ``last + 1`` because the end index is inclusive.
        if not relevant.isdisjoint(range(first, last + 1))
    ]

# -------------------------------------------------------------------------

def find_positions(vector, target="<0x0A>"):
    """Return the indices of the elements of ``vector`` equal to ``target``.

    Args:
        vector: Iterable to scan.
        target: Value to look for; elements are compared with ``==``, so an
            element that merely contains ``target`` does not match.

    Returns:
        A list of zero-based indices, in increasing order.
    """
    return [index for index, item in enumerate(vector) if item == target]

# -------------------------------------------------------------------------

def map_relative_positions(abs_positions, text):
    """Pair each piece of ``text`` with an inclusive token-index span.

    The i-th piece returned by ``split_text(text)`` is paired with the i-th
    entry of ``abs_positions``. The pairing is purely positional: nothing
    checks that the two segmentations describe the same boundaries.

    Args:
        abs_positions: Sequence of token indices, each used as the (inclusive)
            end of one span.
        text: Text handed to ``split_text`` to obtain the pieces.

    Returns:
        A list of ``[piece, start, end]`` lists. The first span starts at 0;
        every later span starts one past the previous end index.

    Raises:
        IndexError: If ``split_text(text)`` yields fewer pieces than there are
            entries in ``abs_positions``.
    """
    if len(abs_positions) == 0:
        return []

    # Split once up front instead of re-splitting the text on every iteration.
    sentences = split_text(text)

    spans = []
    start = 0
    for i, end in enumerate(abs_positions):
        spans.append([sentences[i], start, end])
        # The next span begins right after this boundary token.
        start = end + 1

    return spans

# -------------------------------------------------------------------------

def select_passages(text_passage, question, p, out, tokens):
    """Return the pieces of ``text_passage`` that contain top-scoring tokens.

    Args:
        text_passage: Passage text, segmented via ``map_relative_positions``.
        question: Not used by this function.
        p: Number of top-scoring positions taken from each entry of
            ``out.cci_scores`` (passed as ``k`` to ``top_k_positions``).
        out: Object exposing ``cci_scores``, whose entries each provide
            ``input_context_scores``.
        tokens: Token sequence scanned by ``find_positions`` for boundaries.

    Returns:
        The list produced by ``estrai_frasi_con_token`` for the collected
        positions.
    """
    # Union of the top positions over every scored entry.
    selected = set()
    for entry_index in range(len(out.cci_scores)):
        entry_scores = out.cci_scores[entry_index].input_context_scores
        selected.update(top_k_positions(entry_scores, p))

    boundaries = find_positions(tokens)
    sentence_spans = map_relative_positions(boundaries, text_passage)
    return estrai_frasi_con_token(sentence_spans, list(selected))
 
# -------------------------------------------------------------------------

def split_text(text):
    """Break ``text`` into pieces at every occurrence of ``". "``.

    The separator itself is dropped from the pieces.

    Args:
        text: String to split.

    Returns:
        The list of pieces, in order.
    """
    separator = ". "
    pieces = text.split(separator)
    return pieces

# -------------------------------------------------------------------------

def invoke_pecore(text_passage, question, p):
    """Run context attribution for ``question`` given ``text_passage``.

    Builds an ``AttributeContextArgs`` with the fixed settings below and passes
    it, together with the module-level ``inseq_model``, to
    ``attribute_context_with_model``.

    Args:
        text_passage: Used as ``input_context_text`` and to size
            ``context_sensitivity_topk`` through ``find_top_p``.
        question: Used as ``input_current_text``.
        p: Forwarded to ``find_top_p`` together with ``text_passage``.

    Returns:
        Whatever ``attribute_context_with_model`` returns.
    """
    pecore_args = AttributeContextArgs(
        model_name_or_path="microsoft/Phi-3-mini-4k-instruct",
        attribution_method="saliency",
        attributed_fn="contrast_prob_diff",
        context_sensitivity_metric="kl_divergence",
        context_sensitivity_std_threshold=1,
        # Top-k budget derived from the passage length and p.
        context_sensitivity_topk = find_top_p(text_passage, p),
        attribution_std_threshold=None,
        attribution_topk=None,
        input_current_text=question, 
        input_context_text=text_passage,
        # Prompt variant without the context passage. The leading spaces on the
        # continuation lines are part of the string literal.
        contextless_input_current_text="""<|system|>
    You are a helpful assistant that provide concise and accurate answers.<|end|>
    <|user|>
    {current}<|end|>
    <|assistant|>""",
        # Prompt variant with the context placed before the current text.
        input_template="""<|system|>
    You are a helpful assistant that provide concise and accurate answers.<|end|>
    <|user|>
    {context}
    
    {current}<|end|>
    <|assistant|>""",
        contextless_output_current_text="""{current}""",
        output_template="{current}",
        special_tokens_to_keep=['<|system|>', '<|end|>', '<|assistant|>', '<|user|>'],
        # The separator is a newline followed by four spaces.
        decoder_input_output_separator="""
    """,
        # No files are written and no visualization is shown.
        save_path=None,
        viz_path=None,
        show_viz=False,
        generation_kwargs={'max_new_tokens': 50},
    )

    out = attribute_context_with_model(pecore_args, inseq_model) 
    return out

# -------------------------------------------------------------------------

def find_top_p(text_passage, p):
    """Convert a percentage of the passage's tokens into a token count.

    Args:
        text_passage: Text tokenized with the module-level ``tokenizer``.
        p: Percentage (on a 0-100 scale) of the passage tokens to keep.

    Returns:
        ``int(p / 100 * number_of_tokens)``; this is 0 when the product is
        below 1.
    """
    token_count = len(tokenizer.tokenize(text_passage))
    fraction = p / 100
    return int(fraction * token_count)
    

In [None]:
def run(question, passage, p):
    """Tokenize ``passage``, attribute it against ``question`` and select pieces.

    Args:
        question: Passed to ``invoke_pecore`` and ``select_passages``.
        passage: Text that is tokenized, attributed and segmented.
        p: Forwarded unchanged to both ``invoke_pecore`` and
            ``select_passages``.
            NOTE(review): ``find_top_p`` divides this value by 100 while
            ``top_k_positions`` uses it as a count; confirm that sharing one
            value for both is intended.

    Returns:
        The result of ``select_passages``.
    """
    passage_tokens = tokenizer.tokenize(passage)
    attribution = invoke_pecore(text_passage=passage, question=question, p=p)
    return select_passages(
        text_passage=passage,
        question=question,
        p=p,
        out=attribution,
        tokens=passage_tokens,
    )

In [9]:
# Display the raw attribution output for the first row (p=5).
invoke_pecore(df['supports'][0], df['query'][0], 5)

AttributeContextOutput({
    input_context: "A Christian (or ) is a person who follows or adheres to Christianity, an Abrahamic, monotheistic religion based on the life and teachings of Jesus Christ. "Christian" derives from the Koine Greek word "Christós" (), a translation of the Biblical Hebrew term "mashiach". Lakewood Church is a nondenominational charismatic Christian megachurch located in Houston, Texas. It is the largest congregation in the United States, averaging about 52,000 attendees per week. The 16,800-seat Lakewood Church Central Campus, home to four English-language services and two Spanish-language services per week, is located at the former Compaq Center. Joel Osteen is the senior pastor of Lakewood Church with his wife, Victoria, who serves as co-pastor. Lakewood Church is a part of the Word of Faith movement. Mexico (, modern Nahuatl ), officially the United Mexican States, is a federal republic in the southern half of North America. It is bordered to the north by th

In [13]:
# Check how split_text segments the fifth passage. The output shows that it
# also splits right after abbreviations such as 'E.C', not only at sentence ends.
split_text(df['supports'][4])

['Professional ice hockey has existed since the early 1900s (decade)',
 'The professional game originated in the United States in 1904 and became prominent in Canada during the early 20th century before expanding back into the United States and eventually to many other countries',
 'In addition to the United States and Canada, high-level professional hockey is present in Germany, Russia, Sweden, the Czech Republic, Finland and Switzerland; professional hockey is also played in many other countries, as diverse as Ukraine, the United Kingdom, Austria, Australia and Japan',
 'The Louisiana IceGators is a team of the Southern Professional Hockey League who began play in the 2009 - 10 season ',
 'Like the original IceGators which played from 1995 until folding in 2005 , they are based out of Lafayette , Louisiana ',
 'The IceGators were brought back and owned by local businessman Danny Smith ',
 'In August 2010 , Smith sold the team to two local businessmen , E.C',
 "`` Chuck '' Anselmo , J

In [14]:
# Show the raw text of the first supporting passage.
df['supports'][0]

'A Christian (or ) is a person who follows or adheres to Christianity, an Abrahamic, monotheistic religion based on the life and teachings of Jesus Christ. "Christian" derives from the Koine Greek word "Christós" (), a translation of the Biblical Hebrew term "mashiach". Lakewood Church is a nondenominational charismatic Christian megachurch located in Houston, Texas. It is the largest congregation in the United States, averaging about 52,000 attendees per week. The 16,800-seat Lakewood Church Central Campus, home to four English-language services and two Spanish-language services per week, is located at the former Compaq Center. Joel Osteen is the senior pastor of Lakewood Church with his wife, Victoria, who serves as co-pastor. Lakewood Church is a part of the Word of Faith movement. Mexico (, modern Nahuatl ), officially the United Mexican States, is a federal republic in the southern half of North America. It is bordered to the north by the United States; to the south and west by th