In [1]:
def is_colab():
    """Report whether this notebook is executing inside Google Colab.

    Returns:
        bool: True when the ``google.colab`` package can be imported,
        False when the import raises ImportError.
    """
    try:
        # Imported purely as an availability probe; the module itself is unused.
        import google.colab
    except ImportError:
        return False
    else:
        return True

# Fetch the project files only when running in Colab; outside Colab no extra
# setup is performed.
if is_colab():
    # IPython shell escapes (`!`): remove any previous checkout, clone the
    # repository again, then copy its contents into the working directory.
    print("Running in Colab, executing magic commands.")
    !rm -rf microproyecto3NLP/
    !git clone https://github.com/cjohana031/microproyecto3NLP
    !cp -R microproyecto3NLP/* .
    # Add any other Colab-specific setup
else:
    # Alternative setup for non-Colab environments
    print("Not running in Colab, nothing else is needed.")

Not running in Colab, nothing else is needed.


In [2]:
!pip install datasets sentencepiece



In [1]:
import getpass
import os

import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

model_name = "meta-llama/Llama-3.2-1B"

# Check HF_TOKEN environment variable
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    # No explicit login here: the Hugging Face libraries are expected to pick
    # up HF_TOKEN from the environment on their own.
    print("Hugging Face token found in environment variable.")
else:
    # Prompt without echo so the secret is never rendered in (or saved with)
    # the notebook output, unlike input().
    token = getpass.getpass("Enter your Hugging Face token: ")
    login(token=token)  # Authenticate this session with the Hugging Face Hub

base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Configure the tokenizer: reuse the end-of-sequence token for padding and
# pad on the right-hand side.
base_tokenizer.pad_token = base_tokenizer.eos_token
base_tokenizer.padding_side = "right"

  from .autonotebook import tqdm as notebook_tqdm


Hugging Face token found in environment variable.


In [2]:
from datasets import load_dataset
from datasets import DatasetDict
import os
def format_example(example):
    """Attach a fine-tuning prompt and its target completion to one record.

    Args:
        example: Mapping with the keys ``article``, ``question``, ``options``
            (an iterable of option texts) and ``answer`` (a single letter,
            with ``"A"`` denoting the first option).

    Returns:
        The same ``example`` object, mutated in place with two extra keys:
        ``instruction`` (passage, question and lettered options followed by
        "The correct answer is:") and ``completion`` (a space plus the
        answer letter).
    """
    article = example["article"]
    question = example["question"]
    choices = example["options"]

    # Zero-based position of the gold answer ("A" -> 0, "B" -> 1, ...).
    gold_index = ord(example["answer"]) - ord("A")

    # One "<letter>. <text>" line per option, each ending in a newline.
    formatted_options = "".join(
        f"{chr(65 + position)}. {choice}\n"
        for position, choice in enumerate(choices)
    )

    # Prompt shown to the model; the completion below is what it should emit.
    example["instruction"] = f"""Read the following passage and answer the question by choosing the correct option.

Passage:
{article}

Question: {question}

Options:
{formatted_options}

The correct answer is:"""
    example["completion"] = f" {chr(65 + gold_index)}"
    return example

def prepare_datasets(datasetdict: DatasetDict, tokenizer: AutoTokenizer) -> DatasetDict:
    """
    Tokenize every example for fine-tuning with loss on the completion only.

    Each example is passed through ``format_example`` to obtain its
    ``instruction`` (prompt) and ``completion`` (answer) strings. The two are
    concatenated and tokenized as a single sequence. ``labels`` is a copy of
    ``input_ids`` in which every prompt position (and every position whose
    attention mask is 0) is replaced by -100 so it does not contribute to
    the loss.

    Args:
        datasetdict: Dataset splits whose rows carry the fields read by
            ``format_example``.
        tokenizer: Tokenizer called on a single string; expected to return
            ``input_ids`` and ``attention_mask`` as plain lists.

    Returns:
        The result of ``datasetdict.map``: the same splits with the added
        columns ``instruction``, ``completion``, ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    ignore_index = -100

    def tokenize_function(example):
        # Format the example to get instruction and completion
        example = format_example(example)
        instruction = example["instruction"]
        completion = example["completion"]

        # Token count of the prompt alone, including whatever special tokens
        # the tokenizer prepends. Assumes the tokenizer does not append
        # trailing special tokens to a single sequence -- TODO confirm if the
        # tokenizer is ever swapped.
        instruction_length = len(tokenizer(instruction)["input_ids"])

        # Tokenize prompt and answer as ONE string. Passing the completion as
        # a second positional argument would encode it as a `text_pair`, and
        # the tokenizer's pair template may insert special tokens between the
        # two segments that would then become training targets.
        full_tokens = tokenizer(instruction + completion)
        input_ids = full_tokens["input_ids"]
        attention_mask = full_tokens["attention_mask"]

        # Keep the token id as a label only for attended completion positions;
        # prompt and padding positions get the ignore index.
        labels = [
            ignore_index if (position < instruction_length or mask == 0) else token_id
            for position, (token_id, mask) in enumerate(zip(input_ids, attention_mask))
        ]

        # Store all in the example
        example["input_ids"] = input_ids
        example["attention_mask"] = attention_mask
        example["labels"] = labels
        return example

    # Apply the tokenization function to each split of the dataset
    tokenized_datasets = datasetdict.map(tokenize_function, batched=False)
    return tokenized_datasets

def load_dataset_from_parquet(tokenizer: AutoTokenizer) -> DatasetDict:
    """Build the tokenized dataset from the local parquet files and cache it.

    Loads the train/test/validation parquet files under ``data/``, keeps only
    rows whose ``article`` has a length below 800, tokenizes them with
    ``prepare_datasets`` and writes the result to ``data/datasets``.

    Args:
        tokenizer: Tokenizer forwarded to ``prepare_datasets``.

    Returns:
        The filtered, tokenized DatasetDict that was saved to disk.
    """
    parquet_by_split = {
        "train": "data/train-00000-of-00001.parquet",
        "test": "data/test-00000-of-00001.parquet",
        "validation": "data/validation-00000-of-00001.parquet",
    }
    raw_splits = load_dataset("parquet", data_files=parquet_by_split)

    def has_short_article(row):
        # Drop long passages to keep sequences small.
        return len(row['article']) < 800

    short_splits = raw_splits.filter(has_short_article)
    tokenized_splits: DatasetDict = prepare_datasets(short_splits, tokenizer=tokenizer)

    # Persist so later runs can load from disk instead of re-tokenizing.
    tokenized_splits.save_to_disk('data/datasets', max_shard_size="100MB")
    return tokenized_splits

# Reuse the tokenized dataset cached on disk when it exists; otherwise build
# it from the parquet files (which also writes the cache).
if os.path.exists('data/datasets'):
    race = DatasetDict.load_from_disk('data/datasets')
else:
    race = load_dataset_from_parquet(tokenizer=base_tokenizer)
race

Filter: 100%|██████████| 87866/87866 [00:00<00:00, 112294.12 examples/s]
Filter: 100%|██████████| 4934/4934 [00:00<00:00, 102464.71 examples/s]
Filter: 100%|██████████| 4887/4887 [00:00<00:00, 104409.47 examples/s]
Map: 100%|██████████| 8930/8930 [00:06<00:00, 1286.68 examples/s]
Map: 100%|██████████| 480/480 [00:00<00:00, 1287.40 examples/s]
Map: 100%|██████████| 514/514 [00:00<00:00, 1309.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8930/8930 [00:00<00:00, 421151.79 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 480/480 [00:00<00:00, 119368.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 514/514 [00:00<00:00, 84164.44 examples/s]


DatasetDict({
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options', 'instruction', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8930
    })
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options', 'instruction', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 480
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options', 'instruction', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 514
    })
})