# Cell 1: Install Dependencies (Colab)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the install log short.
# NOTE(review): versions are unpinned - pin them once a known-good pair is confirmed.
%pip install -q transformers datasets

# Cell 2: Upload the Apple QA Dataset (raw JSON, or a .zip of the already tokenized dataset)


In [None]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; the later cells do not read it and
# instead open the files by name ("apple_10k_qa.json" / "sec_apple_10k.zip"), so the
# uploaded file must carry one of those names.
uploaded = files.upload()  # Upload either JSON or .zip of tokenized dataset


# Cell 3: If zipped, unzip into a directory


In [None]:
# Target folder for the pre-tokenized dataset (Option 2 loads from inside it).
!mkdir -p data/tokenized
# Only unzip when the archive was actually uploaded (the JSON route has no zip), and
# pass -o so re-running the cell overwrites files instead of stopping at unzip's
# interactive "replace?" prompt.
!if [ -f sec_apple_10k.zip ]; then unzip -o sec_apple_10k.zip -d data/tokenized; else echo "sec_apple_10k.zip not found - skipping unzip"; fi

# Cell 4: Load Dataset


In [None]:
from datasets import load_dataset, load_from_disk


# OPTION 1: Load the raw QA pairs from JSON and tokenize them


In [None]:
# Read the uploaded JSON file and expose it as the "train" split; the mapping cell
# below indexes the result with dataset["train"].
# NOTE(review): `preprocess` expects "question", "context" and "answers" columns -
# confirm the JSON file has that layout.
dataset = load_dataset("json", data_files={"train": "apple_10k_qa.json"})


In [None]:
def preprocess(example):
    """Tokenize a batch of QA examples and label every feature with its answer span.

    Meant to be used with ``dataset.map(..., batched=True)``: ``example`` is a
    batch (dict of lists) with "question", "context" and "answers" entries, where
    each ``answers`` item carries parallel "text" / "answer_start" lists. Only the
    first answer of each example is used.

    Contexts that do not fit into 384 tokens are split into overlapping windows
    (128-token stride), so one example can yield several features. The function
    reads the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with two extra lists, "start_positions" and
        "end_positions" (token indices, one per feature). A feature is labelled
        with the CLS token index for both when the example has no answer or the
        answer is not fully contained in that feature's context window.
    """
    questions = [q.strip() for q in example["question"]]
    inputs = tokenizer(
        questions,
        example["context"],
        truncation="only_second",  # never cut the question, only the context
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) pairs; only needed here to build labels.
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        answers = example["answers"][sample_mapping[i]]

        # Token positions that belong to the context (sequence id 1), as opposed to
        # the question, special tokens and padding.
        sequence_ids = inputs.sequence_ids(i)
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]

        # Examples without an answer (or features without context tokens) cannot be
        # given a span; label them with CLS instead of failing on the [0] lookup.
        if not answers or not answers["answer_start"] or not answers["text"] or not context_tokens:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        context_start = context_tokens[0]
        context_end = context_tokens[-1]

        # Answer not fully inside this window -> CLS label.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Last context token that starts at or before the answer start. The scan is
        # limited to the context span: tokens after it can carry (0, 0) offsets,
        # which would otherwise let the scan run on to the end of the sequence
        # when the answer begins in the final context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # First context token (scanning backwards) that ends at or after the answer end.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# `preprocess` reads the notebook-level `tokenizer`, but the cell that creates it sits
# further down. Load it here when it is missing so a top-to-bottom run
# (Restart & Run All) does not stop with a NameError.
if "tokenizer" not in globals():
    from transformers import AutoTokenizer

    # Same checkpoint name as the model cell uses.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# batched=True hands `preprocess` dict-of-lists batches. The raw columns are removed
# because one example can expand into several tokenized features.
custom_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)


# OPTION 2: Load an already tokenized dataset (uploaded as a .zip and unzipped in Cell 3)


In [None]:
# custom_dataset = load_from_disk("data/tokenized/sec_apple_10k")

# print(custom_dataset.column_names)  # Inspect columns

# Cell 5: Load Model and Tokenizer


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Base checkpoint to fine-tune. Tokenizer and model are loaded from the same name so
# their vocabularies match.
model_checkpoint = "bert-base-uncased"
# `tokenizer` is also the global that `preprocess` (Option 1 cell) and the inference
# cell read.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): presumably the question-answering head on top of this base checkpoint
# starts untrained, so a weight-initialisation warning here is expected - confirm.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Cell 6: Define TrainingArguments + Trainer


In [None]:
from transformers import TrainingArguments, Trainer, DefaultDataCollator

# Hyper-parameters for the fine-tuning run.
args = TrainingArguments(
    # Checkpoints written during training go here; the save cell reuses the same folder.
    output_dir="./apple_qa_finetuned",
    # No evaluation split is handed to the Trainer, so the evaluation strategy stays at
    # its default ("no"). The explicit `evaluation_strategy` keyword is left out on
    # purpose: newer transformers releases renamed it to `eval_strategy`, and the
    # install cell does not pin a version.
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,  # keep at most one intermediate checkpoint on disk
)

import inspect

# Option 1 (`dataset.map` on the loaded JSON) leaves `custom_dataset` as a dict of
# splits, while Trainer needs one split. DatasetDict is a `dict` subclass, so this
# check needs no extra import; a single Dataset passes through untouched.
train_split = custom_dataset["train"] if isinstance(custom_dataset, dict) else custom_dataset

# Newer transformers releases take the tokenizer as `processing_class`, older ones as
# `tokenizer`; use whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,  # ✅ already tokenized
    data_collator=DefaultDataCollator(),
    **{tokenizer_kwarg: tokenizer},
)

# Cell 7: Train the Model


In [None]:
# Run fine-tuning with the Trainer configured in the previous cell. This is the
# long-running step of the notebook.
trainer.train()

# Cell 8: Save Fine-Tuned Model


In [None]:
# Write the fine-tuned model and the tokenizer files into the same folder, so the
# directory holds both pieces needed to reload the model later.
trainer.save_model("./apple_qa_finetuned")
tokenizer.save_pretrained("./apple_qa_finetuned")

# Cell 9: Try Inference

In [None]:
import torch

context = "Apple Inc. reported a 10% increase in services revenue during the fiscal year."
question = "How much did Apple's services revenue increase?"

# After training the model is presumably still in training mode; switch to eval mode
# so dropout is off and the prediction is deterministic.
model.eval()

# Truncate only the context, with the same length limit as in preprocessing, so a
# longer context pasted here cannot exceed the model's input size.
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    truncation="only_second",
    max_length=384,
).to(model.device)

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

# Most likely start token and (exclusive) end token of the answer span.
start = int(torch.argmax(outputs.start_logits))
end = int(torch.argmax(outputs.end_logits)) + 1

# An empty slice (end <= start) simply yields an empty answer string.
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start:end])
)

print("📊 Finance Answer:", answer)


In [None]:
# Pack the saved model for download. The folder is also the Trainer's output_dir, so
# any intermediate checkpoint-* directories written during training are excluded to
# keep the archive down to the final model and tokenizer files.
!zip -r apple_qa_finetuned.zip apple_qa_finetuned -x "apple_qa_finetuned/checkpoint-*"


In [None]:
# Re-imported so this cell also works when run on its own (Colab-only module).
from google.colab import files
# Send the archive created by the zip cell to the browser as a download.
files.download("apple_qa_finetuned.zip")
