In [9]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install transformers datasets accelerate

In [None]:
import pandas as pd
df = pd.read_csv('../preprocess/combined_finetuning_df.csv')
df = df[df["Buffett_statement"] == 1]

df = df.dropna(subset=["Question", "Answer"])  # drop any incomplete rows
print(f"Filtered dataset size: {len(df)}")

Filtered dataset size: 439


In [11]:
from datasets import Dataset

# Convert pandas DataFrame to HF Dataset
dataset = Dataset.from_pandas(df)

# Optionally, split into train/validation
# For a small dataset, you might do e.g. 90% train, 10% val
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

model_name = "AdaptLLM/finance-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
# ^ Use torch_dtype if model is half-precision or you have GPU memory constraints


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def preprocess_function(examples):
    # Build prompt text
    texts = [
        f"Question: {q}\nAnswer: {a}"
        for q, a in zip(examples["Question"], examples["Answer"])
    ]
    
    # Tokenize in a causal manner
    # Note: We do not separate 'input' vs 'label' because in CLM
    # the model learns to predict every next token in the sequence.
    # We'll rely on the standard LM masking in the collator.
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=512,  # adjust as needed
        return_special_tokens_mask=True
    )
    return tokenized

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)


In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False  # Because this is Causal LM, not Masked LM
)


In [None]:
training_args = TrainingArguments(
    output_dir="adaptllm-finance-buffett",
    per_device_train_batch_size=1,  # adjust to fit GPU
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # accumulate grads to simulate bigger batch
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    fp16=True,  # if your GPU supports it
    report_to="none",  # or "tensorboard"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


In [None]:
# Let's load the trained model from the output dir if needed
# model = AutoModelForCausalLM.from_pretrained("adaptllm-finance-buffett")

# Directly use model.generate():
def buffett_answer(question, max_new_tokens=100):
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.8
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Try it
test_q = "How should I think about investing during a recession?"
response = buffett_answer(test_q)
print(response)
# "Question: How should I think about investing during a recession? 
#  Answer: I focus on companies with strong fundamentals..."
