# Gerador de Arquivo de Submissão (run.tsv)

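Este notebook utiliza o modelo `SotirisLegkas/multi-head-xlm-xl-tokens-38` do Hugging Face para gerar predições para o conjunto de dados de teste localizado em `../../valueeval24/test-english/` e salva o resultado em `run/run.tsv`, com uma coluna de probabilidade para cada valor nas variantes *attained* e *constrained*.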

## 1. Instalação de Dependências

In [None]:
!pip install transformers torch pandas tqdm

## 2. Imports e Configuração Inicial

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.notebook import tqdm
import numpy as np
import os

## 3. Carregar Dados

In [None]:
# Read the test sentences.
# - sep uses an explicit "\t" escape instead of an invisible literal tab, which
#   is easily destroyed by editors that convert tabs to spaces.
# - keep_default_na=False stops pandas from turning sentences whose whole text
#   is e.g. "None", "NA" or "null" into NaN; a NaN (float) entry would later be
#   handed to the tokenizer as if it were text.
sentences_df = pd.read_csv(
    '../../valueeval24/test-english/sentences.tsv',
    sep='\t',
    keep_default_na=False,
)

# Fail early, with a clear message, if the columns used by the later cells
# ('Text' for inference, the two ID columns for the output file) are missing.
missing_columns = {'Text-ID', 'Sentence-ID', 'Text'} - set(sentences_df.columns)
if missing_columns:
    raise KeyError(f"sentences.tsv is missing required columns: {sorted(missing_columns)}")

print("Amostra dos dados de sentenças:")
print(sentences_df.head())

## 4. Carregar Modelo e Tokenizer

In [None]:
# Hugging Face Hub identifier of the checkpoint used for inference.
MODEL_NAME = 'SotirisLegkas/multi-head-xlm-xl-tokens-38'

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Usando dispositivo: {device}')

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): the checkpoint name suggests a custom multi-head architecture —
# confirm that AutoModelForSequenceClassification loads all of its weights
# (watch for "newly initialized" warnings when this cell runs).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Assign the result instead of leaving a bare `model.to(device)` as the last
# expression, which would echo the entire module repr as cell output.
# eval() makes the inference-only intent explicit (disables dropout etc.).
model = model.to(device).eval()

## 5. Gerar Predições

In [None]:
def predict_batch(texts):
    """Return sigmoid probabilities for one batch of texts.

    The texts are tokenized with padding and truncation (at most 512 tokens),
    moved to the notebook-level ``device``, and passed through the
    notebook-level ``model`` without gradient tracking.

    Args:
        texts: Batch of input texts, forwarded as-is to ``tokenizer``.

    Returns:
        NumPy array (on CPU) holding the element-wise sigmoid of the model's
        logits. Presumably one row per text and one column per label — confirm
        against the model configuration.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    # Every tensor produced by the tokenizer must live on the model's device.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Independent sigmoid per output (multi-label), then back to host memory.
    return torch.sigmoid(logits).cpu().numpy()

# Number of sentences sent through the model per forward pass.
batch_size = 16

# Plain Python list of the sentence texts, in file order.
texts = sentences_df['Text'].tolist()

# Re-created on every run of this cell so re-execution never appends to
# results left over from a previous run.
all_predictions = []

# Walk the texts in consecutive slices of `batch_size`; extend() adds one
# per-sentence probability vector for each row returned by predict_batch.
for start in tqdm(range(0, len(texts), batch_size)):
    all_predictions.extend(predict_batch(texts[start:start + batch_size]))

## 6. Formatar e Salvar o Arquivo run.tsv

In [None]:
# Value names and output columns, as defined in the evaluator script.
availableValues = ["Self-direction: thought", "Self-direction: action", "Stimulation", "Hedonism", "Achievement", "Power: dominance", "Power: resources", "Face", "Security: personal", "Security: societal", "Tradition", "Conformity: rules", "Conformity: interpersonal", "Humility", "Benevolence: caring", "Benevolence: dependability", "Universalism: concern", "Universalism: nature", "Universalism: tolerance"]
# Two columns per value, interleaved: "<value> attained", "<value> constrained".
# NOTE(review): this assumes the model's outputs follow exactly this order —
# confirm against model.config.id2label before submitting.
value_columns = [value + postfix for value in availableValues for postfix in [' attained', ' constrained']]

# Guard against stale or partial state (e.g. the inference cell was interrupted
# or run against a different sentences_df): a length mismatch would otherwise
# be silently padded with NaN by the column-wise concat below.
if len(all_predictions) != len(sentences_df):
    raise ValueError(
        f"Got {len(all_predictions)} predictions for {len(sentences_df)} sentences; "
        "re-run the prediction cell before saving."
    )

# Build the results DataFrame (one row per sentence, one column per label).
predictions_df = pd.DataFrame(all_predictions, columns=value_columns)

# Combine with the sentence IDs. concat(axis=1) aligns on the index, so the ID
# frame is reset to a fresh 0..n-1 index to match predictions_df positionally.
sentence_ids = sentences_df[['Text-ID', 'Sentence-ID']].reset_index(drop=True)
run_df = pd.concat([sentence_ids, predictions_df], axis=1)

# Make sure the output directory exists (no error if it already does).
output_dir = 'run'
os.makedirs(output_dir, exist_ok=True)

# Save run.tsv: tab-separated, no index column, probabilities with 4 decimals.
output_path = os.path.join(output_dir, 'run.tsv')
run_df.to_csv(output_path, sep='\t', index=False, float_format='%.4f')

print(f"Arquivo 'run.tsv' salvo em: {os.path.abspath(output_path)}")
print("Amostra do arquivo de saída:")
print(run_df.head())