In [1]:
# ==============================================================================
# STEP 1: INSTALLING NECESSARY LIBRARIES
# ==============================================================================
# We start by installing all the required libraries.
# - transformers: For loading models and tokenizers from Hugging Face.
# - peft: The Parameter-Efficient Fine-Tuning library, for our QLoRA configuration.
# - bitsandbytes: For loading the model in 4-bit (quantization).
# - accelerate: A Hugging Face library to help with training on different hardware.
# - datasets: For loading and processing our .jsonl file.
# - trl: A library from Hugging Face for simplified supervised fine-tuning (SFT).

!pip install -q transformers peft bitsandbytes accelerate datasets trl

print("--- Step 1: All necessary libraries installed successfully! ---")

# ==============================================================================
# IMPORTANT: RESTART THE RUNTIME
# ==============================================================================
# After running the installation cell above, you MUST restart the Colab runtime
# for the new library versions to be loaded correctly.
#
# Go to the menu and click:
# Runtime -> Restart session
# (Note: This may also be called "Restart runtime" in older Colab versions).
#
# After the runtime restarts, you can proceed to run the cells from Step 2 onwards.
# ==============================================================================


--- Step 1: All necessary libraries installed successfully! ---


In [1]:
# ==============================================================================
# STEP 2: AUTHENTICATION AND DEPENDENCIES
# ==============================================================================
# We need to log in to Hugging Face to download the Mistral model.
# You will need to create a Hugging Face account and get an access token
# with "write" permissions.

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from huggingface_hub import notebook_login

# This will prompt you to enter your Hugging Face token.
print("--- Step 2: Logging in to Hugging Face... ---")
notebook_login()

--- Step 2: Logging in to Hugging Face... ---


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# ==============================================================================
# STEP 3: LOADING AND PREPARING THE DATASET
# ==============================================================================
# Now, we'll load our pre-split training and validation datasets.
# IMPORTANT: Before running this cell, upload your `train.jsonl` and
# `validation.jsonl` files to the Colab environment. You can do this by
# clicking the "Files" icon on the left sidebar and then clicking the "Upload" button.

# Load the training and validation sets directly from their files.
print("--- Step 3: Loading pre-split training and validation datasets... ---")
train_dataset = load_dataset("json", data_files="train.jsonl", split="train")
validation_dataset = load_dataset("json", data_files="validation.jsonl", split="train")

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(validation_dataset)}")

# Define a function to format the prompts correctly for the model.
# The model needs to see the data in a consistent instruction-response format.
def format_prompt(example):
    # This format is based on how instruction-tuned models like Mistral are trained.
    return f"""### Instruction:
{example['instruction']}

### Context:
{example['context']}

### Response:
{example['output']}"""

print("Dataset prepared successfully!")

--- Step 3: Loading pre-split training and validation datasets... ---
Training set size: 408
Validation set size: 51
Dataset prepared successfully!


In [3]:
# ==============================================================================
# STEP 4: LOADING THE MODEL AND TOKENIZER
# ==============================================================================
# Here, we load the base Mistral model. We'll use QLoRA to load it in 4-bit
# precision, which drastically reduces the memory required.

# The specific Mistral model we'll fine-tune
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# QLoRA configuration using bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                # Load the model in 4-bit precision
    bnb_4bit_quant_type="nf4",        # Use a specific quantization type for better performance
    bnb_4bit_compute_dtype=torch.bfloat16, # Use this compute dtype for training
    bnb_4bit_use_double_quant=True,   # Use a nested quantization for more memory savings
)

print(f"--- Step 4: Loading base model '{model_name}'... ---")
# Load the model with our quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto", # Automatically map the model to the available GPU
)
# Enable gradient checkpointing to save even more memory
model.config.use_cache = False

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Mistral models don't have a default padding token, so we set one.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Model and tokenizer loaded successfully!")


--- Step 4: Loading base model 'mistralai/Mistral-7B-Instruct-v0.3'... ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!


In [4]:
# ==============================================================================
# STEP 5: CONFIGURING THE LoRA/PEFT SETTINGS
# ==============================================================================
# Now we configure the PEFT (Parameter-Efficient Fine-Tuning) settings.
# This tells the trainer which parts of the model to train.

print("--- Step 5: Configuring LoRA... ---")
peft_config = LoraConfig(
    r=16,                     # The "rank" of the LoRA matrices. Higher rank means more parameters to train.
    lora_alpha=32,            # A scaling factor for the LoRA matrices.
    lora_dropout=0.05,        # Dropout to prevent overfitting.
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] # Target the attention layers
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

print("LoRA configured successfully!")

--- Step 5: Configuring LoRA... ---
LoRA configured successfully!


In [9]:
# ==============================================================================
# STEP 6: SETTING UP THE TRAINER
# ==============================================================================
# We define the training arguments and set up the SFTTrainer.

output_dir = "ethno-colleague-finetune"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,       # Batch size for training
    gradient_accumulation_steps=2,       # Simulate a larger batch size
    optim="paged_adamw_32bit",           # A memory-efficient optimizer
    learning_rate=2e-4,                  # The learning rate
    lr_scheduler_type="cosine",          # Learning rate scheduler
    save_strategy="epoch",               # Save a checkpoint at the end of each epoch
    logging_steps=25,                    # Log progress every 25 steps
    num_train_epochs=3,                  # Number of training epochs
    max_steps=-1,                        # -1 means it will be determined by num_train_epochs
    fp16=True,                           # Use 16-bit precision for training
    do_eval=True,                        # Set to True to enable evaluation
)

print("--- Step 6: Setting up the SFTTrainer... ---")

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=peft_config,
    args=training_arguments,
    formatting_func=format_prompt, # This function handles our data formatting.
)

print("Trainer setup complete!")

--- Step 6: Setting up the SFTTrainer... ---


Applying formatting function to train dataset:   0%|          | 0/408 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/408 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/408 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/408 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/408 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/51 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainer setup complete!


In [10]:
# ==============================================================================
# STEP 7: STARTING THE FINE-TUNING
# ==============================================================================
# This is where the magic happens. We call `trainer.train()` to start the process.
# This will take some time, depending on the GPU. You'll see the training and
# validation loss decrease, which means the model is learning!

print("--- Step 7: Starting fine-tuning... ---")
trainer.train()
print("--- Fine-tuning complete! ---")




--- Step 7: Starting fine-tuning... ---


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgurhan-camgoz[0m ([33mgurhan-camgoz-ku-leuven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
25,1.7745
50,1.425
75,1.1486
100,1.073
125,0.8463
150,0.8379


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

--- Fine-tuning complete! ---


In [11]:
# ==============================================================================
# STEP 8: SAVING THE TRAINED MODEL
# ==============================================================================
# We save the trained LoRA adapter. This file is very small and contains all
# the new knowledge your model has learned.

final_model_path = "ethno-colleague-final"
print(f"--- Step 8: Saving the final LoRA adapter to '{final_model_path}'... ---")
trainer.save_model(final_model_path)
print("Adapter saved successfully!")


--- Step 8: Saving the final LoRA adapter to 'ethno-colleague-final'... ---
Adapter saved successfully!


In [16]:
# ==============================================================================
# STEP 9: LOAD FINAL MODEL FOR INFERENCE
# ==============================================================================
# *** EDITED FOR COMPATIBILITY ***
# We explicitly clear the memory used by the training process before loading
# the model for inference. This prevents out-of-memory errors in Colab.

from peft import PeftModel
import gc # Garbage Collector interface

# Clean up memory
del model
del trainer
gc.collect()
torch.cuda.empty_cache()

# Load the base model again for inference
base_model_for_inference = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
# And the tokenizer
inference_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inference_tokenizer.pad_token = inference_tokenizer.eos_token
inference_tokenizer.padding_side = "right"

# Load the LoRA adapter onto the base model
inference_model = PeftModel.from_pretrained(base_model_for_inference, final_model_path)
# Set the model to evaluation mode
inference_model.eval()

print("--- Step 9: Final model loaded and ready for inference! ---")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--- Step 9: Final model loaded and ready for inference! ---


In [17]:
# ==============================================================================
# STEP 10: AUTOMATED INFERENCE (TESTING ON THE TEST SET)
# ==============================================================================
# This step performs a qualitative evaluation on a random sample of your unseen test data.
# It now uses the `inference_model` and `inference_tokenizer` loaded in the previous step.

import random

# IMPORTANT: Upload your `test.jsonl` file to the Colab environment now.
test_dataset = load_dataset("json", data_files="test.jsonl", split="train")

print("\n--- Step 10: Running automated inference with the fine-tuned model... ---")

# Let's test a few random examples from our "unseen" test set.
for _ in range(5): # Test 5 random examples
    rand_idx = random.randint(0, len(test_dataset) - 1)
    test_sample = test_dataset[rand_idx]

    prompt = f"""### Instruction:
{test_sample['instruction']}

### Context:
{test_sample['context']}

### Response:
"""

    model_inputs = inference_tokenizer(prompt, return_tensors="pt")
    model_inputs = {k: v.to("cuda") for k, v in model_inputs.items()}

    print("\n" + "="*50)
    print(">>> PROMPT:")
    print(f"  Instruction: {test_sample['instruction']}")
    print(f"  Context: {test_sample['context']}")
    print("="*50)

    # Generate a response
    with torch.no_grad():
        outputs = inference_model.generate(
            **model_inputs,
            max_new_tokens=150,
            do_sample=True,
            top_p=0.9,
            temperature=0.6,
            pad_token_id=inference_tokenizer.eos_token_id
        )

    response_ids = outputs[0][len(model_inputs['input_ids'][0]):]
    generated_text = inference_tokenizer.decode(response_ids, skip_special_tokens=True)

    print("\n<<< GENERATED RESPONSE:")
    print(generated_text)
    print("="*50 + "\n")


--- Step 10: Running automated inference with the fine-tuned model... ---

>>> PROMPT:
  Instruction: Investigate the long-term effects of disaster housing projects on the social fabric and economic development of the area.
  Context: The initial plan was to direct new investments towards the disaster housing projects, but the city center's redevelopment seemed more lucrative. The rich acquired the destroyed areas cheaply, while the neighborhood's social life declined.

<<< GENERATED RESPONSE:
What long-term effects did the disaster housing projects have on the social fabric and economic development of the area?


>>> PROMPT:
  Instruction: Explain the significance of studying multiple societies in the career of an anthropologist according to Evans-Pritchard.
  Context: Evans-Pritchard believed that studying a second society was desirable to avoid thinking in terms of a particular type of society for the rest of one's life.

<<< GENERATED RESPONSE:
What are the benefits of studying mu

In [20]:
# ==============================================================================
# STEP 11: INTERACTIVE INFERENCE (DEVELOPER TEST)
# ==============================================================================
# This step allows you, the developer, to test the model with specific,
# complex instructions to probe its capabilities.

print("\n--- Step 11: Interactive Chat (Developer Mode) ---")
print("You can now enter a custom instruction and context.")
print("Enter 'exit' or 'quit' to move to the next step.")

# This cell uses the `inference_model` and `inference_tokenizer` loaded in Step 9.
while True:
    custom_instruction = input("Enter your instruction (or 'exit'): ")
    if custom_instruction.lower() in ['exit', 'quit']:
        break
    custom_context = input("Enter your context: ")
    if custom_context.lower() in ['exit', 'quit']:
        break

    prompt = f"""### Instruction:
{custom_instruction}

### Context:
{custom_context}

### Response:
"""

    model_inputs = inference_tokenizer(prompt, return_tensors="pt")
    model_inputs = {k: v.to("cuda") for k, v in model_inputs.items()}

    with torch.no_grad():
        outputs = inference_model.generate(**model_inputs, max_new_tokens=150, do_sample=True, top_p=0.9, temperature=0.6, pad_token_id=inference_tokenizer.eos_token_id)

    response_ids = outputs[0][len(model_inputs['input_ids'][0]):]
    generated_text = inference_tokenizer.decode(response_ids, skip_special_tokens=True)

    print("\n--- MODEL RESPONSE ---")
    print(generated_text)
    print("----------------------\n")



--- Step 11: Interactive Chat (Developer Mode) ---
You can now enter a custom instruction and context.
Enter 'exit' or 'quit' to move to the next step.
Enter your instruction (or 'exit'): Formulate a structural question for the researcher to make connections with their fieldwork experience in participating in tge remote worker’s routines with the underlying social factors that structure the transformation of social life.
Enter your context: As I was spending time with the research participant, she has done video call in the morning with the software engineer about the design choices of the new product interface, and then she left the house to do groceries in the market because she needed to cook. 

--- MODEL RESPONSE ---
What are the daily routines of the remote workers in this community, and how do these routines reflect the transformation of social life due to remote work?
----------------------

Enter your instruction (or 'exit'): exit


In [21]:
# ==============================================================================
# STEP 12: INTERACTIVE INFERENCE (USER TEST)
# ==============================================================================
# *** NEW STEP ***
# This step simulates the real-world use case. The user only provides a
# context (a fieldnote, a thought), and the model must generate a helpful
# question without being given an explicit instruction.

print("\n--- Step 12: Interactive Chat (User Mode) ---")
print("Simulates a user (e.g., an anthropology student) interacting with the model.")
print("You only need to provide a context. The model will generate a question.")
print("Enter 'exit' or 'quit' when you are finished.")

# A generic, powerful instruction that we give to the model behind the scenes.
# This guides the model to act as a helpful colleague.
fixed_instruction = "Read the following context from an ethnographer's notes. Ask a single, insightful, open-ended question that would help them think more deeply about the situation, connect it to broader themes, or consider a new perspective."

while True:
    # Get user input for context only
    user_context = input("Enter your fieldnote or thought (or 'exit'): ")
    if user_context.lower() in ['exit', 'quit']:
        break

    # Format the prompt with the fixed instruction and user-provided context
    prompt = f"""### Instruction:
{fixed_instruction}

### Context:
{user_context}

### Response:
"""

    # Tokenize and generate
    model_inputs = inference_tokenizer(prompt, return_tensors="pt")
    model_inputs = {k: v.to("cuda") for k, v in model_inputs.items()}

    with torch.no_grad():
        outputs = inference_model.generate(**model_inputs, max_new_tokens=150, do_sample=True, top_p=0.9, temperature=0.7, pad_token_id=inference_tokenizer.eos_token_id)

    response_ids = outputs[0][len(model_inputs['input_ids'][0]):]
    generated_text = inference_tokenizer.decode(response_ids, skip_special_tokens=True)

    print("\n--- DIGITAL COLLEAGUE'S QUESTION ---")
    print(generated_text)
    print("------------------------------------\n")


--- Step 12: Interactive Chat (User Mode) ---
Simulates a user (e.g., an anthropology student) interacting with the model.
You only need to provide a context. The model will generate a question.
Enter 'exit' or 'quit' when you are finished.
Enter your fieldnote or thought (or 'exit'): I was looking at this couple who were remote-working together from their home, sharing their unrelated work experiences with each other, and it was kind of an opaque mirror they had established among themselves. 

--- DIGITAL COLLEAGUE'S QUESTION ---
How does the co-presence of remote work and intimate relationships influence the boundaries and dynamics of each domain?
------------------------------------

Enter your fieldnote or thought (or 'exit'): exit
