## Dataset Preparation

In [1]:
from datasets import load_dataset

# Fetch the "default" configuration of the chat dataset and keep its "train" split.
dataset_dict = load_dataset("Chan-Y/Stefan-Zweig-Chat", "default")
ds = dataset_dict["train"]

In [8]:
def format_dataset(dataset):
	"""Render every record as one Granite chat-template string.

	Each example is mapped to a dict with a single "text" key holding the
	system, user and assistant turns. The assistant turn wraps the completion
	in <stefan_zweig>...</stefan_zweig> tags.

	Args:
		dataset: object exposing ``map(fn)``; each record must provide the
			keys 'system_prompt', 'prompt' and 'completion'.

	Returns:
		Whatever ``dataset.map`` returns for the per-record formatter.
	"""
	def to_chat_text(record):
		# Pieces are joined with no separator; the newlines between turns
		# are part of the literals themselves.
		pieces = [
			"<|start_of_role|>system<|end_of_role|>",
			f"{record['system_prompt']}<|end_of_text|>\n",
			"<|start_of_role|>user<|end_of_role|>",
			f"{record['prompt']}<|end_of_text|>\n",
			"<|start_of_role|>assistant<|end_of_role|><stefan_zweig>",
			f"{record['completion']}</stefan_zweig><|end_of_text|>",
		]
		return {"text": "".join(pieces)}

	mapped = dataset.map(to_chat_text)
	return mapped

# Hold out 10% of the examples for evaluation; the fixed seed keeps the split stable across runs.
train_test_split = ds.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = (
	format_dataset(train_test_split[split_name]) for split_name in ("train", "test")
)

## Model Loading and Fine-tune Parameters

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "ibm-granite/granite-3.1-2b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# Register the persona tags emitted by format_dataset as special tokens.
special_tokens_dict = {
    'additional_special_tokens': ['<stefan_zweig>', '</stefan_zweig>']
}
tokenizer.add_special_tokens(special_tokens_dict)

model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
# The tokenizer may have grown by the tokens added above, so resize the
# embedding matrix to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
import numpy as np

# Token counts per example: the rendered prompt (system + user turns plus the
# opening assistant tag) and the bare completion text.
prompt_lengths = []
lengths = []

for item in ds:
	prompt = "".join([
		"<|start_of_role|>system<|end_of_role|>",
		f"{item['system_prompt']}<|end_of_text|>\n",
		"<|start_of_role|>user<|end_of_role|>",
		f"{item['prompt']}<|end_of_text|>\n",
		"<|start_of_role|>assistant<|end_of_role|><stefan_zweig>",
	])
	prompt_lengths.append(len(tokenizer.encode(prompt)))
	lengths.append(len(tokenizer.encode(item['completion'])))

# Reduce the collected counts; an empty dataset yields maxima of 0.
max_prompt_length = max(prompt_lengths, default=0)
max_completion_length = max(lengths, default=0)
avg_completion_length = sum(lengths) / len(lengths)
p95_completion_length = np.percentile(lengths, 95)

print(f"Max prompt length: {max_prompt_length}")
print(f"Max completion length: {max_completion_length}")
print(f"Average completion length: {avg_completion_length:.2f}")
print(f"95th percentile completion length: {p95_completion_length:.2f}")

In [11]:
# Supervised fine-tuning hyperparameters consumed by TrainingArguments.
SFT_TRAIN_EPOCH = 3
SFT_BATCH_SIZE = 4
SFT_LR = 5e-6
# Generation budget: the 95th-percentile completion token count from `lengths`
# plus 20% headroom, truncated to an int.
MAX_NEW_TOKEN = int(np.percentile(lengths, 95) * 1.2)
# Reference passages in the target style; the style callback fits its TF-IDF
# vocabulary on these and scores generations against each one.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated into a single reference.
REFERENCES = [
	"""My dear friend, I can sense the weight of the past bearing down upon you, much like it did upon me during those fateful years. As I reflect on the rise of Nazi Germany, I am struck by the inexorable march of ideological fanaticism and the devastating consequences that followed. It was a time when the ravages of history demonstrated, as never before, the dangers of unchecked nationalism, the deification of a particular creed, and the erosion of intellect in the face of blind dogma. As an Austrian, I was firsthand witness to the insidious creep of National Socialism, which began as a seemingly innocuous'movement' but gradually swept across Europe like a tempest, devouring everything in its path. The memories of those years still haunt me: the mustachioed orators, the cynical manipulation of the masses, the distorted echoes of a ravaged culture. My own life, once marked by the convolutions of the intellect and the pursuit of art, was caught in the maelstrom of history. As the Nazi dynasty took power, I found myself increasingly bound by the strictures of censorship and the weight of self-censorship. My relationship with my homeland, once so deeply ingrained, was sundered by the apparatus of totalitarian""",
	"""My dear friend, it is indeed a question that weighs heavily on the heart, particularly in the aftermath of the tumultuous 20th century. As I reflect on the masterworks of our time, I am reminded of the power of literature to be a beacon of hope, even amidst the shadows of conflict and devastation. The likes of James Joyce, Virginia Woolf, and Marcel Proust, to name but a few, offered us a mirror to the human condition, a reflection of our deepest fears and desires, and our capacity for both cruelty and compassion. Their works probed the depths of the human experience, illuminating the complexities of the human spirit, and in doing so, provided a glimmer of hope in the face of the darkness that surrounded us. The concept of the "Lost Generation" that emerged after the Great War serves as a powerful example of how literature became a refuge for the wounded souls of a generation. The works of Ernest Hemingway, F. Scott Fitzgerald, and T.S. Eliot, among others, validated the struggles of those who had experienced the horror of conflict and the disillusionment that followed. Their writing serves as a testament to the human spirit's capacity to persevere, to find meaning in the midst of chaos, and to""",
	"""Dear friend, it is a pleasure to converse with you, to explore the labyrinthine corridors of my mind, and revisit the tales that I have spun. My work, particularly "Beware of Pity" and "The Post Office Girl," is an attempt to grapple with the complexities of human nature, the intricacies of the human condition. These novels are not so much about grand, sweeping narratives as they are about the delicate, almost imperceptible threads that bind us to one another. In "Beware of Pity," I sought to explore the destructive nature of human emotions, the manner in which our deepest vulnerabilities can become the catalyst for both our downfall and our salvation. Baron von Tol Nay, the protagonist, is a self-absorbed, entitled aristocrat, while the civil servant Felix Krull is a tragic figure, forever trapped in his own vanity. Theirs is a collision course, a dance of destruction that exposes the fragility of human relationships and the devastating consequences of our own pitiless, unfeeling nature. In "The Post Office Girl," I attempted to capture the dissonance between the external realities of our lives and our inner, subjective experiences. Hildegard is a young woman trapped in a narrow, suffocating existence,""",
	"""Dear friend, it is a sobering and eternal truth, is it not, that the human condition is a fragile and tumultuous sea, subject to the whims of fate and the vanity of human endeavor? As I sit here, reflecting on my work, I am reminded of the countless instances where the most seemingly unbreakable bonds were reduced to dust and ashes, leaving naught but the bitter taste of disillusionment and loss. You see, I have always been drawn to the intricacies of human relationships, with all their attendant complexities and contradictions. It is in the fragile balance of human emotions, in the delicate dance between love and deceit, that I find the most profound reflections of our shared humanity. The tumultuous lives of my characters, like Marie Antoinette or Fouché, may seem like worlds apart, yet they share a common thread - the fragility of human relationships that ultimately betrays us all. My work is, in a way, a lamentation of the transience of life and the impossibility of true human connection. We strive for transcendence, for beauty, for the sublime, and yet, it is in the moments of vulnerability, of fragility, that we are reminded of our fundamental isolation. Ah, but it is here, in""",
]

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter configuration for causal-LM fine-tuning.
lora_config = LoraConfig(
	r=16,   # rank of the low-rank update matrices
	lora_alpha=32,  # LoRA scaling parameter
	# Adapters are attached to the attention and MLP projection layers only.
	# NOTE(review): the tokenizer cell adds two new special tokens and resizes
	# the embeddings, but no embedding/output layer is listed here - confirm
	# whether the new token rows need to be trainable (e.g. modules_to_save).
	target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
	lora_dropout=0.05,
	bias="none",
	task_type="CAUSAL_LM"
)

# Prepare model for LoRA
model.gradient_checkpointing_enable() 
# NOTE(review): the model was loaded without any quantization config in the
# loading cell; confirm that the k-bit preparation step is intended here.
model = prepare_model_for_kbit_training(model)
# Wrap the model with the LoRA adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer


import inspect

# SFT training configuration: cosine schedule with 10% warmup, evaluation and
# checkpointing once per epoch, best checkpoint reloaded at the end.
training_args = TrainingArguments(
	output_dir="./zweig_granite_model_2301",
	num_train_epochs=SFT_TRAIN_EPOCH,
	per_device_train_batch_size=SFT_BATCH_SIZE,
	per_device_eval_batch_size=SFT_BATCH_SIZE,
	learning_rate=SFT_LR,
	lr_scheduler_type="cosine",
	warmup_ratio=0.1,
	logging_steps=10,
	save_strategy="epoch",
	evaluation_strategy="epoch",
	load_best_model_at_end=True,
	max_grad_norm=1.0,
	weight_decay=0.01,
	fp16=True,
	gradient_checkpointing=True,
	remove_unused_columns=True,
	dataloader_pin_memory=True
)

# Hand the customized tokenizer to the trainer explicitly. When none is given,
# SFTTrainer falls back to loading one from the model's name/path, which would
# carry neither the pad-token setting nor the added <stefan_zweig> tokens that
# the embedding matrix was resized for.
# The keyword was renamed across TRL releases (`tokenizer` -> `processing_class`),
# so pick whichever name the installed SFTTrainer accepts.
_tokenizer_kwarg = (
	"processing_class"
	if "processing_class" in inspect.signature(SFTTrainer.__init__).parameters
	else "tokenizer"
)

trainer = SFTTrainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=eval_dataset,
	**{_tokenizer_kwarg: tokenizer},
)

In [14]:
from transformers import TrainerCallback
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import os
import json
from datetime import datetime
import numpy as np

class ZweigStyleCallback(TrainerCallback):
    """Trainer callback that scores sampled generations against REFERENCES.

    After each evaluation the callback generates one answer per fixed test
    prompt, measures TF-IDF cosine similarity between every answer and every
    reference passage, adds summary statistics to the evaluation metrics and
    appends one JSON line to a log file.

    Args:
        tokenizer: tokenizer used to encode prompts and decode generations.
        model: model whose ``generate`` method produces the sample answers.
    """

    def __init__(self, tokenizer, model):
        self.best_style_score = float('-inf')
        self.tokenizer = tokenizer
        self.eval_model = model
        self.save_dir = "./style_checkpoints"

        # TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms,
        # fitted on the reference passages only.
        self.vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=5000
        )
        self.vectorizer.fit(REFERENCES)
        # The reference vectors never change, so compute them once here
        # instead of re-transforming every reference for every prompt.
        self.reference_vectors = self.vectorizer.transform(REFERENCES)

        # One log file per callback instance, named by creation timestamp.
        self.log_dir = "style_evaluation_logs"
        os.makedirs(self.log_dir, exist_ok=True)
        self.log_file = os.path.join(self.log_dir, f"style_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")

    def on_evaluate(self, args, state, control, **kwargs):
        """Compute style metrics after an evaluation pass and log them.

        Adds 'style_score', 'style_score_min' and 'style_score_max' (and
        'best_style_score' on improvement) to ``kwargs['metrics']`` when that
        dict is provided. Any exception is reported with ``print`` and
        swallowed so that a scoring failure does not abort training.

        Returns:
            The ``control`` object, unchanged.
        """
        try:
            current_step = state.global_step

            # Plain floats keep the JSON log free of numpy scalar types.
            style_scores = [float(score) for score in self.compute_lightweight_style_metrics()]

            metrics = kwargs.get('metrics', {})
            metrics['style_score'] = float(np.mean(style_scores))
            metrics['style_score_min'] = float(np.min(style_scores))
            metrics['style_score_max'] = float(np.max(style_scores))

            # Update the running best BEFORE logging. Otherwise the first log
            # line would carry the -inf sentinel (serialized as the non-standard
            # JSON token -Infinity) and every line would lag one step behind.
            is_new_best = metrics['style_score'] > self.best_style_score
            if is_new_best:
                self.best_style_score = metrics['style_score']
                metrics['best_style_score'] = self.best_style_score

            eval_log = {
                'step': current_step,
                'style_scores': style_scores,
                'average_style_score': metrics['style_score'],
                'best_style_score': self.best_style_score,
                # Key name kept for log compatibility; the value is the eval loss.
                'training_loss': metrics.get('eval_loss')
            }

            # One JSON object per line, appended across evaluations.
            with open(self.log_file, 'a', encoding='utf-8') as f:
                f.write(json.dumps(eval_log) + '\n')

            if is_new_best:
                # NOTE: only the target directory is created and reported here;
                # no model weights are written by this callback.
                os.makedirs(self.save_dir, exist_ok=True)
                best_model_path = os.path.join(self.save_dir, "best_style_model")
                print(f"New best style score {self.best_style_score:.3f}, path: {best_model_path}")

        except Exception as e:
            # Deliberately best-effort: report and continue training.
            print(f"Style evaluation error: {e}")

        return control

    def compute_lightweight_style_metrics(self):
        """Generate an answer per test prompt and score it against REFERENCES.

        Returns:
            A list with one entry per test prompt: the mean TF-IDF cosine
            similarity between the generated answer and all reference passages.
        """
        test_prompts = [
            "What is your idea of the perfect day?",
            "Stefan Zweig, it's an honor to converse with you. Your work spans many genres, but you're best known for your biographical and historical fiction novels, which often delve into the lives of famous figures such as Marie Antoinette, Mary Queen of Scots, and Balzac. What drew you to these subjects and time periods?",
            "I have been inspired by your short stories, particularly 'Chess Story', which explores the intricate connections between people's pasts and present. What do you think triggers these connections between seemingly unrelated lives?",
            "Why did you choose to write outside of Austria, especially when your family had a long history in Austria?"
        ]

        style_scores = []
        for prompt in test_prompts:
            generated_text = self._generate_text_for_prompt(prompt)
            generated_vector = self.vectorizer.transform([generated_text])
            # Shape (n_references, 1): similarity of the answer to each reference.
            similarities = cosine_similarity(self.reference_vectors, generated_vector)
            style_scores.append(float(np.mean(similarities)))

        return style_scores

    def _generate_text_for_prompt(self, prompt):
        """Sample one assistant answer for ``prompt`` and return only that answer.

        The prompt is wrapped in the same chat template used for training,
        ending with the opening <stefan_zweig> tag so the model continues the
        assistant turn. Only the newly generated tokens are decoded, so the
        system and user text do not leak into the style score.
        """
        formatted_input = (
            "<|start_of_role|>system<|end_of_role|>"
            "You are Stefan Zweig, writing about the cultural atmosphere of Europe.<|end_of_text|>\n"
            "<|start_of_role|>user<|end_of_role|>"
            f"{prompt}<|end_of_text|>\n"
            "<|start_of_role|>assistant<|end_of_role|><stefan_zweig>"
        )

        inputs = self.tokenizer(formatted_input, return_tensors="pt").to(self.eval_model.device)
        prompt_token_count = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = self.eval_model.generate(
                # Keyword expansion passes the attention mask along with the ids.
                **inputs,
                # MAX_NEW_TOKEN is a budget for generated tokens; max_length
                # would also count the prompt tokens against it.
                max_new_tokens=MAX_NEW_TOKEN,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)[0]
            )
        generated_ids = outputs[0][prompt_token_count:]
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True)
		

In [15]:
from transformers import EarlyStoppingCallback

# Style-scoring callback bound to the tokenizer and the model being trained.
callback = ZweigStyleCallback(model=model, tokenizer=tokenizer)

# Early stopping with a patience of 3 evaluations and an improvement threshold of 0.01.
early_stopping_callback = EarlyStoppingCallback(
	early_stopping_threshold=0.01,
	early_stopping_patience=3,
)

# Register early stopping first, then the style callback.
for extra_callback in (early_stopping_callback, callback):
	trainer.add_callback(extra_callback)

## SFT Fine-tuning

In [None]:
# Run supervised fine-tuning with the configured trainer and its registered callbacks.
trainer.train()

In [None]:
from huggingface_hub import notebook_login
notebook_login() # Hugging Face login prompt; presumably a token with write access is needed for the push below - confirm

In [None]:
# Upload the trainer's model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

In [None]:
# Switch the in-memory fine-tuned model to inference mode (nothing is loaded
# from disk here) and make sure it sits on the selected device.
model.eval()
model = model.to(device)

input_text = "You're surprised that your friends are emigrating in large numbers. The rise of the Nazi party is worrying you, and you are struck by the disparity between the ideals of the German people and the brutal actions of its government. As the Austrian writer Stefan Zweig, how do you see the future for your friends and family back home? I can almost hear the frantic tone in your voice when I ask this question."
inputs = tokenizer(input_text, return_tensors="pt").to(device)

with torch.no_grad():
	outputs = model.generate(
		**inputs,
		# MAX_NEW_TOKEN is a budget for generated tokens. Passing it as
		# max_length would count the prompt tokens against it and shorten
		# (or entirely suppress) the generated answer.
		max_new_tokens=MAX_NEW_TOKEN,
		num_return_sequences=1,
		do_sample=True,
		temperature=0.7,
		top_p=0.9,
	)

# Decode the generated text (prompt followed by the sampled continuation)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Rebuild the tokenizer exactly as it was configured for training: same
# instruct checkpoint, same pad-token setup, same added special tokens.
# Loading the "-base" tokenizer instead would leave it without the
# <stefan_zweig> tokens that the model's embeddings were resized for.
tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-2b-instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.add_special_tokens(
	{'additional_special_tokens': ['<stefan_zweig>', '</stefan_zweig>']}
)

# Load the fine-tuned weights from the saved training checkpoint.
model = AutoModelForCausalLM.from_pretrained("zweig_granite_model_2301/checkpoint-675", device_map=device)

tokenizer_config.json:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Inference mode for the reloaded checkpoint.
model.eval()

input_text = "You're surprised that your friends are emigrating in large numbers. The rise of the Nazi party is worrying you, and you are struck by the disparity between the ideals of the German people and the brutal actions of its government. As the Austrian writer Stefan Zweig, how do you see the future for your friends and family back home? I can almost hear the frantic tone in your voice when I ask this question."
inputs = tokenizer(input_text, return_tensors="pt").to(device)
# Number of prompt tokens, used below to separate the prompt from the answer.
prompt_token_count = inputs["input_ids"].shape[1]

with torch.no_grad():
	outputs = model.generate(
		**inputs,
		max_length=512,
		num_return_sequences=1,
		do_sample=True,
		temperature=0.7,
		top_p=0.9,
	)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Print only the continuation. Slicing by token count is robust where splitting
# the decoded string on input_text is not: the split shows the prompt again if
# decoding does not reproduce it character-for-character, and drops text if the
# model repeats the prompt.
continuation_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
print(continuation_text)



Stefan Zweig: Ah, my dear friend, it is a heavy burden to bear, this knowledge of the impending disaster that threatens our beloved Austria. The Nazi party, with its dark ideology, has taken root in the hearts of many Germans, and it seems that reason and compassion have been cast aside in favor of hate and violence. The once-vibrant cultural landscape of our country is now being replaced by an oppressive regime that seeks to erase every trace of individuality and humanity.

My friends and family, once proud of their Austrian identity, now find themselves caught in the crosshairs of this brutal machine. They are being forced to choose between their loyalty to the German state and their love for their homeland. It is a choice that no one should ever have to make, yet here we are, faced with the stark reality of a divided world.

As I watch my friends pack their bags and embark on their journeys to safer shores, I am filled with a sense of despair and helplessness. I see the hope in th