In [1]:
def is_colab():
    """Return True when the `google.colab` package can be imported, else False."""
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        running_in_colab = False
    else:
        running_in_colab = True
    return running_in_colab

# Use the function to conditionally run magic commands
if is_colab():
    # Run Colab-specific magic commands
    print("Running in Colab, executing magic commands.")
    !rm -rf microproyecto3NLP/
    !git clone https://github.com/cjohana031/microproyecto3NLP
    !cp -R microproyecto3NLP/* .
    # Add any other Colab-specific setup
else:
    # Alternative setup for non-Colab environments
    print("Not running in Colab, nothing else is needed.")

Not running in Colab, nothing else is needed.


In [2]:
!pip install datasets sentencepiece




[notice] A new release of pip is available: 23.1.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from datasets import load_dataset
# Point each split at its local parquet file and load them together.
data_files = dict(
    train="data/train-00000-of-00001.parquet",
    test="data/test-00000-of-00001.parquet",
    validation="data/validation-00000-of-00001.parquet",
)
race = load_dataset("parquet", data_files=data_files)

In [4]:
# Display the loaded splits with their features and row counts.
race

DatasetDict({
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 87866
    })
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4934
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4887
    })
})

In [5]:
# Keep, in every split, only the examples whose article is shorter than
# MAX_ARTICLE_CHARS. The limit is measured with len() on the article string,
# i.e. in characters, not tokens.
MAX_ARTICLE_CHARS = 800
race = race.filter(lambda example: len(example["article"]) < MAX_ARTICLE_CHARS)

In [6]:
# Display the splits again to check the row counts left after filtering.
race

DatasetDict({
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 8930
    })
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 480
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 514
    })
})

In [7]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "meta-llama/Llama-3.2-1B"

# Read the Hugging Face token with getpass instead of input(): input() echoes
# what is typed into the cell output, which would store the secret in the
# saved notebook.
from getpass import getpass

token = getpass("Enter your Hugging Face token: ")
login(token=token)  # Authenticate against the Hugging Face Hub
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Configure the tokenizer: reuse the EOS token as the padding token and pad
# sequences on the right-hand side.
base_tokenizer.pad_token = base_tokenizer.eos_token
base_tokenizer.padding_side = "right"

In [None]:
from datasets import DatasetDict
def format_example(example):
    """Attach a multiple-choice prompt and its expected completion to an example.

    Reads the ``article``, ``question``, ``options`` and ``answer`` fields of
    ``example``. ``answer`` must be a single character; it is converted to an
    offset from "A" and back to a letter.

    Two fields are written into ``example`` in place:
      * ``instruction`` -- the prompt text, ending with "The correct answer is:"
      * ``completion``  -- a space followed by the answer letter

    Returns the same (mutated) ``example`` mapping.
    """
    passage = example["article"]
    query = example["question"]
    choices = example["options"]
    answer_offset = ord(example["answer"]) - ord("A")

    # One "<letter>. <text>" line per option, lettered from A in list order;
    # every line (including the last) ends with a newline.
    option_block = "".join(
        f"{chr(ord('A') + position)}. {choice}\n"
        for position, choice in enumerate(choices)
    )

    # Prompt layout used for fine-tuning. Because option_block already ends
    # with a newline, two blank lines precede the final line.
    prompt = (
        "Read the following passage and answer the question by choosing the correct option.\n"
        "\n"
        "Passage:\n"
        f"{passage}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Options:\n"
        f"{option_block}\n"
        "\n"
        "The correct answer is:"
    )

    # Text the model is expected to generate after the prompt.
    target = f" {chr(ord('A') + answer_offset)}"

    example["instruction"] = prompt
    example["completion"] = target
    return example

def prepare_datasets(datasetdict: DatasetDict, tokenizer: AutoTokenizer) -> DatasetDict:
    """
    Add the prompt and completion text fields to every split of a dataset.

    Each example is passed through ``format_example``, which adds the
    ``instruction`` and ``completion`` fields. No tokenization happens here:
    the examples are only formatted as text.

    Args:
        datasetdict: The splits to process. Every example must carry the
            fields that ``format_example`` reads.
        tokenizer: Currently unused. Kept in the signature so existing calls
            that pass a tokenizer keep working.

    Returns:
        The result of mapping ``format_example`` over ``datasetdict``, one
        example at a time.
    """
    # batched=False: format_example expects a single example, not a batch.
    return datasetdict.map(format_example, batched=False)

# Replace `race` with the version returned by prepare_datasets.
race = prepare_datasets(race,tokenizer=base_tokenizer)

Map:   0%|          | 0/8930 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/514 [00:00<?, ? examples/s]

In [10]:
# Look at the first training example to check the result of the preparation step.
race['train'][0]

{'example_id': 'high2778.txt',
 'article': 'The air hostess   was in a small kitchen at the back of the plane, preparing the plates for lunch, when a little old lady came and spoke to her, "Could you please tell me," she asked, "where is the ladies\' lavatory   in the plane?"\n"Yes, madam," said the air hostess and smiled. "It is right at the other end of the plane---at the front."\nThe little lady went too far. She walked all the way to the front of the plane, opened the door in front of her, and saw the captain of the plane and the other officers. They were all busy with their work and did not see her. She went out again, shut the door and returned to the air hostess.\n"Oh, didn\'t you find it, madam?" the girl asked her. "Yes, I did," said the little lady. "But there are four men in the ladies\' lavatory watching television."',
 'answer': 'C',
 'question': 'The story happened  _  .',
 'options': ['in the evening',
  'in the afternoon',
  'in the morning',
  'at midnight'],
 'instruc