In [None]:
import pandas as pd
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, create_optimizer

# Load the pretrained model (with a 5-class classification head) and its tokenizer
# from the same checkpoint so the vocabulary matches the weights.
CHECKPOINT = 'distilbert-base-uncased'
model = TFDistilBertForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=5,
)
tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)

# Build the tokenized dataset from a tab-separated file
def prepare_data(file_path):
    """Read question/text/answer rows and return them as a tokenized dataset.

    Args:
        file_path: Path to a tab-separated file with 'text', 'question' and
            'answer' columns. Quote characters are read literally
            (quoting=3 is csv.QUOTE_NONE).

    Returns:
        An unbatched tf.data.Dataset of (features, label) pairs, where
        features is the dict of tokenizer outputs for one (question, text)
        pair and label is the answer letter's offset from 'A'.
    """
    frame = pd.read_csv(file_path, delimiter='\t', quoting=3)
    passages = frame['text'].tolist()
    prompts = frame['question'].tolist()

    # Map answer letters to class ids: 'A' -> 0, 'B' -> 1, ...
    letter_base = ord('A')
    targets = [ord(letter) - letter_base for letter in frame['answer']]

    # The question is the first segment and the passage the second;
    # padding=True pads every row to the longest sequence in the file.
    features = tokenizer(
        prompts,
        passages,
        truncation=True,
        padding=True,
        return_tensors='tf',
    )
    return tf.data.Dataset.from_tensor_slices((dict(features), targets))

# Training hyperparameters, named once so the learning-rate schedule length
# and the fit() call cannot drift apart.
NUM_EPOCHS = 5
BATCH_SIZE = 8

# Prepare the training data
train_file = 'train.csv'
train_examples = prepare_data(train_file)
# A shuffle buffer smaller than the dataset only mixes rows within a sliding
# window; size it to the full example count so every epoch is shuffled across
# the whole file. max(..., 1) keeps the buffer size valid for an empty file.
train_dataset = train_examples.shuffle(max(len(train_examples), 1)).batch(BATCH_SIZE)

# Configure the optimizer; the schedule is sized from the total number of
# optimizer steps (batches per epoch times epochs).
num_train_steps = len(train_dataset) * NUM_EPOCHS
optimizer, lr_schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=num_train_steps)

# Compile the model with a loss that takes integer class ids and raw logits
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
model.fit(train_dataset, epochs=NUM_EPOCHS)

# Predict answer letters for every row of the validation file
def evaluate_model(dev_file, batch_size=32):
    """Predict an answer letter for each row of a tab-separated file.

    Rows are tokenized and scored in padded batches instead of one tokenizer
    call and one forward pass per row, which removes the per-row Python and
    model-call overhead.

    Args:
        dev_file: Path to a tab-separated file with 'text' and 'question'
            columns. Quote characters are read literally (quoting=3).
        batch_size: Number of rows scored per forward pass. Must be >= 1.

    Returns:
        A list with one predicted letter per row, in file order, where class
        id 0 maps to 'A', 1 to 'B', and so on.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    dev_df = pd.read_csv(dev_file, delimiter='\t', quoting=3)
    questions = dev_df['question'].tolist()
    contexts = dev_df['text'].tolist()

    predictions = []
    for start in range(0, len(questions), batch_size):
        stop = start + batch_size
        # padding=True pads each batch to its own longest sequence so the
        # rows can be stacked into a single tensor.
        inputs = tokenizer(
            questions[start:stop],
            contexts[start:stop],
            truncation=True,
            padding=True,
            return_tensors='tf',
        )
        logits = model(inputs).logits
        predicted_labels = tf.argmax(logits, axis=1).numpy()
        # Convert class ids back to letters: 0 -> 'A', 1 -> 'B', ...
        predictions.extend(chr(int(label) + ord('A')) for label in predicted_labels)

    return predictions

# Run the model on the validation set
dev_file = 'dev.csv'
predictions = evaluate_model(dev_file)

# Save the predictions to a .txt file, one answer per line
with open('predictions.txt', 'w') as f:
    f.writelines(f"{prediction}\n" for prediction in predictions)

print("Predicciones guardadas en predictions.txt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing

Epoch 1/5