### Exercise 1

In [None]:
# Setup: import and config
import os, json
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


# Lab directories: input data and a local cache for downloaded model files
BASE_DIR = '../../lab1'
DATA_DIR, CACHE_DIR = (
    os.path.join(BASE_DIR, subdir) for subdir in ('data', 'models_cache')
)
os.makedirs(CACHE_DIR, exist_ok=True)

# BERT tokenizer and encoder, both loaded through the local cache directory
bert_model_id = 'bert-base-uncased'
tokenizer_bert, model_bert = (
    loader.from_pretrained(bert_model_id, cache_dir=CACHE_DIR)
    for loader in (AutoTokenizer, AutoModel)
)


# Exercise 1: embedding and similarity among 5 sentences
#
# Embeds every non-empty line of sentences.txt with BERT, then reports the most
# similar pair of sentences and the sentence most similar to all the others.
sentences_path = os.path.join(DATA_DIR, 'sentences.txt')

with open(sentences_path, 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]

# A pairwise comparison needs at least two sentences; with a single one the
# argmax below would silently pair the sentence with itself.
if len(sentences) < 2:
    raise ValueError(
        f'Expected at least 2 sentences in {sentences_path}, found {len(sentences)}'
    )

# Tokenize (shorter sentences are padded to the longest one in the batch)
enc = tokenizer_bert(sentences, return_tensors='pt', padding=True, truncation=True)

# Inference: each embedding is the mean of last_hidden_state over the real tokens.
# Padding positions are zeroed out through the attention mask so that they do
# not skew the embeddings of the shorter sentences.
with torch.no_grad():
    outputs = model_bert(**enc)
    hidden = outputs.last_hidden_state
    mask = enc['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guards the division below
    embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()  # shape: (N, hidden)

# Cosine similarity matrix; eps keeps the division safe for a zero-norm vector
eps = 1e-12
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms = np.maximum(norms, eps)
sim_matrix = (embeddings @ embeddings.T) / (norms @ norms.T)


# exclude self-similarity for argmax
np.fill_diagonal(sim_matrix, -np.inf)

# Plain ints keep the printed index tuple readable
best_i, best_j = (int(idx) for idx in np.unravel_index(np.argmax(sim_matrix), sim_matrix.shape))
print('Most similar pair — indices:', (best_i, best_j))
print('Sentence A:', sentences[best_i])
print('Sentence B:', sentences[best_j])
print('Cosine similarity:', float(sim_matrix[best_i, best_j]))

# overall similarity (exclude diagonal): the -inf diagonal is replaced by 0 so
# that each row sum only counts similarities to the other sentences
sim_no_diag = sim_matrix.copy()
np.fill_diagonal(sim_no_diag, 0.0)
overall_sim = sim_no_diag.sum(axis=1)
best_overall_idx = int(np.argmax(overall_sim))
print('\nMost similar overall — index:', best_overall_idx, 'overall similarity:', float(overall_sim[best_overall_idx]))
print('Sentence:', sentences[best_overall_idx])
print('Similarities to others:', sim_no_diag[best_overall_idx])

### Exercise 2

In [None]:
# Setup: import and config
import os, json
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

print('Transformers version:', __import__('transformers').__version__)

# Main paths (local cache for models)
BASE_DIR = os.path.join('../../lab1')
DATA_DIR = os.path.join(BASE_DIR, 'data')
CACHE_DIR = os.path.join(BASE_DIR, 'models_cache')
os.makedirs(CACHE_DIR, exist_ok=True)

# The unsloth copy is used to avoid having to accept the terms of service for
# the Gemma model (alternative checkpoint: 'google/gemma-3-270m-it').
model_path = 'unsloth/gemma-3-1B-it'

# Fall back to CPU when no GPU is visible instead of failing at load time;
# half precision is only requested on the GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_dtype = torch.float16 if device == 'cuda' else torch.float32
# NOTE(review): the upstream Gemma 3 examples load the model in bfloat16 —
# consider switching if float16 generations look degenerate; confirm on the GPU used.

tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=CACHE_DIR)
# Batched generation with a decoder-only model needs left padding so that the
# generated tokens directly follow each prompt.
tokenizer.padding_side = 'left'

# Tokenizer and model share the same project-local cache directory
# (previously the model used a hard-coded, user-specific absolute path).
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    cache_dir=CACHE_DIR,
    torch_dtype=model_dtype,
    device_map=device,
)

print('Gemma model:', model, '\n\n')
print('Special tokens:', tokenizer.special_tokens_map, '\n\n')

# One target text per line; blank lines are skipped so that they do not turn
# into empty prompts (same convention as the sentence loading in Exercise 1).
target_text_path = os.path.join(DATA_DIR, 'target_text.txt')
with open(target_text_path, 'r', encoding='utf-8') as f:
    target_text_list = [line.strip() for line in f if line.strip()]


def apply_chat_template_and_tokenize(messages_list: list, device=None):
    """Render conversations with the chat template and tokenize them as one batch.

    Args:
        messages_list: List of conversations; each conversation is a list of
            ``{'role': ..., 'content': ...}`` dicts.
        device: Optional device to move the tokenized batch to. When ``None``
            the batch is returned on the device the tokenizer produced it on.

    Returns:
        The padded, tokenized batch returned by the module-level ``tokenizer``
        (PyTorch tensors).
    """
    # Render each conversation to text, ending with the generation prompt so
    # that the model continues with its own turn.
    prompts = [
        tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
        for messages in messages_list
    ]
    # The rendered text already carries the special tokens emitted by the
    # template; add_special_tokens=False keeps the tokenizer from adding a
    # second beginning-of-sequence token on top of them.
    tokenized = tokenizer(
        prompts,
        return_tensors='pt',
        padding=True,
        add_special_tokens=False,
    )
    if device is not None:
        tokenized = tokenized.to(device)
    return tokenized
    
def _generate_and_print(label, messages_list, max_new_tokens=40):
    """Tokenize the conversations, generate, and print each continuation.

    Returns the tokenized batch and the raw ``generate`` output so that both
    stay available for inspection after the call.
    """
    # The inputs must live on the same device as the model before generating.
    tokenized = apply_chat_template_and_tokenize(messages_list).to(model.device)
    outputs = model.generate(**tokenized, max_new_tokens=max_new_tokens)
    # Every row starts with the (padded) prompt; only what follows is new text.
    prompt_len = tokenized["input_ids"].shape[-1]
    # One line per target text, not just the first row of the batch.
    for generated in outputs:
        print(label, tokenizer.decode(generated[prompt_len:]), '\n\n')
    return tokenized, outputs


###############
# 0-shot prompt
###############

messages_zero_shot_list = [
    [
        { 'role': 'user', 'content': f'Find the Persons in the following text:\n{target_text}'}
    ] for target_text in target_text_list
]
tokenized_0, outputs_0 = _generate_and_print('0-shot output:', messages_zero_shot_list)


###############
# with system prompt
###############

system_prompt = (
    'You are an assistant that extracts PERSON names from the given text. '
    'Respond with a JSON array of strings containing the person names found. '
    'If none are present, respond with an empty array.'
)
messages_zero_shot_list = [
    [
        { 'role': 'system', 'content': system_prompt },
        { 'role': 'user', 'content': f'{target_text}'}
    ] for target_text in target_text_list
]
tokenized_0, outputs_0 = _generate_and_print('0-shot output:', messages_zero_shot_list)

###############
# with system prompt and example provided as user message
###############

with open(os.path.join(DATA_DIR, 'few_shot_examples.json'), 'r', encoding='utf-8') as f:
    examples = json.load(f)

# assumes every example is a dict with string 'input' and 'output' values —
# TODO confirm against few_shot_examples.json
example_text = examples[0]['input']
example_answer = examples[0]['output']

# The Gemma chat template requires strictly alternating user/assistant turns
# and rejects two consecutive user messages, so the example and the target
# text are sent together as a single user turn.
messages_one_shot_list = [
    [
        { 'role': 'system', 'content': system_prompt },
        { 'role': 'user', 'content': example_text + example_answer + '\n\n' + f'{target_text}' }
    ] for target_text in target_text_list
]
tokenized_1, outputs_1 = _generate_and_print('1-shot output:', messages_one_shot_list)


###############
# with system prompt and 1 example provided as assistant message
###############

messages_one_shot_list = [
    [
        { 'role': 'system', 'content': system_prompt },
        { 'role': 'user', 'content': example_text },
        { 'role': 'assistant', 'content': example_answer },
        { 'role': 'user', 'content': f'{target_text}'}
    ] for target_text in target_text_list
]
tokenized_1, outputs_1 = _generate_and_print('1-shot output:', messages_one_shot_list)

###############
# with system prompt and all examples provided as assistant message
###############

# Shared conversation prefix: the system prompt followed by one
# user/assistant exchange per example.
messages_system_and_few_shot = [
    { 'role': 'system', 'content': system_prompt }
]

for example in examples:
    messages_system_and_few_shot.append({ 'role': 'user', 'content': example['input'] })
    messages_system_and_few_shot.append({ 'role': 'assistant', 'content': example['output'] })

messages_few_shot_list = [
    [
        *messages_system_and_few_shot,
        { 'role': 'user', 'content': f'{target_text}'}
    ] for target_text in target_text_list
]
tokenized_few_shot, outputs_few_shot = _generate_and_print('few-shot output:', messages_few_shot_list)