# Initializing and loading the Phi-4 language model

In this step, we install all necessary libraries.

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install tensorflow
!pip install optuna
!pip install triton --index-url https://download.pytorch.org/whl/cu124
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# unsloth for more efficient fine tuning
!pip install --force-reinstall --no-cache-dir --no-deps xformers --index-url https://download.pytorch.org/whl/cu124

!pip install --force-reinstall --no-cache-dir --no-deps "unsloth[cu124-torch260] @ git+https://github.com/unslothai/unsloth.git"
!pip install ansible-lint

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting xformers
  Downloading https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Downloading https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m332.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers
Successfully installed xformers-0.0.29.post3
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[cu124-torch260]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-q1_vd0yo/unsloth_733dc96ffed34ef98f1a86016ad3a458
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-q1_vd0yo/unsloth_733dc96ffed34ef98f1a86016ad3a458
  Resolved https://github.com/un

Initialize and load the Phi-4 model from Unsloth (or from our Drive, when we want to load our fine-tuned model).

In [None]:
from unsloth import FastLanguageModel
import torch
# Sequence-length limit forwarded to the model loader (and later to the trainer).
max_seq_length = 7000
# Forwarded as `load_in_4bit` to FastLanguageModel.from_pretrained below.
load_in_4bit = True
# Checkpoint to load. Swap in one of the commented Drive paths below to load a
# previously fine-tuned model instead of the base model.
STRING_MODEL = "unsloth/Phi-4"
#STRING_MODEL = "/content/drive/MyDrive/finetuned_phi4_second_iteration"
#STRING_MODEL = "/content/drive/MyDrive/finetuned_phi4"



# Load the model together with its tokenizer; both become notebook-wide globals.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = STRING_MODEL,
    load_in_4bit = load_in_4bit,
    max_seq_length = max_seq_length
)

def model_init():
    """Return the module-level `model` object loaded above."""
    return model

# Report whether a CUDA device is visible and, if so, which one.
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU Only")

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unsloth: Will load /content/drive/MyDrive/finetuned_phi4_second_iteration as a legacy tokenizer.


CUDA Available: True
CUDA Device: NVIDIA L4


LoRA adapters for more efficient fine-tuning --> standard values are used

In [None]:
# Attach LoRA adapters to the loaded model through Unsloth's get_peft_model.
# `model` is rebound to the returned object, so later cells use the adapted model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    # Names of the sub-modules that receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Already have LoRA adapters! We shall skip this step.


# Prepare the dataset

Define a helper function that splits the dataset. The ratio is 70-15-15 (train / validation / test).

In [None]:
!pip install scikit-learn

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

def split_dataset(dataset, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    """Split a dataset into train, validation, and test subsets.

    The split is performed with two consecutive ``train_test_split`` calls:
    the first separates the training portion from the rest, the second divides
    the rest between validation and test.

    Args:
        dataset: Collection of samples forwarded to ``train_test_split``.
        train_ratio: Fraction of the samples assigned to the training subset.
        val_ratio: Fraction of the samples assigned to the validation subset.
        test_ratio: Fraction of the samples assigned to the test subset.
        seed: Value passed as ``random_state`` to both split calls.

    Returns:
        A tuple ``(train_data, val_data, test_data)``.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within a small
            floating-point tolerance).
    """
    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    ratio_sum = train_ratio + val_ratio + test_ratio
    assert abs(ratio_sum - 1.0) <= 1e-9, "Ratios must sum to 1."

    # Split into train and temp (validation + test)
    train_data, temp_data = train_test_split(dataset, test_size=1 - train_ratio, random_state=seed)

    # Split temp into validation and test; val_size is the validation share
    # *within* the held-out remainder, not within the full dataset.
    val_size = val_ratio / (val_ratio + test_ratio)
    val_data, test_data = train_test_split(temp_data, test_size=1 - val_size, random_state=seed)

    return train_data, val_data, test_data



Loading and Preprocessing the dataset

In [None]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset

def preprocess_function(entry):
    """Tokenize one dataset row and attach training labels.

    The row's ``input`` and ``output`` fields are joined into a single text
    (``"\\n<input>\\r\\n<output>"``) and tokenized with the notebook-level
    ``tokenizer`` using ``truncation=True``.

    The returned encoding gains a ``labels`` entry: a copy of ``input_ids`` in
    which every id equal to ``tokenizer.pad_token_id`` is replaced by ``-100``.
    No shifting of the labels is done here.
    """
    prompt_part, completion_part = entry["input"], entry["output"]
    encoded = tokenizer(f"\n{prompt_part}\r\n{completion_part}", truncation=True)

    # Labels mirror the inputs, except that padding positions become -100.
    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [-100 if tok == pad_id else tok for tok in encoded["input_ids"]]
    return encoded

# Fetch the complete dataset (requested as its "train" split).
dataset = load_dataset("FurkanGuerbuez/ansible_training", split="train")

# Two-stage split: hold out 30%, then cut the hold-out in half -> 70% / 15% / 15%.
dataset_split = dataset.train_test_split(test_size=0.3, seed=42)
validation_test_split = dataset_split["test"].train_test_split(test_size=0.5, seed=42)

# Gather the three partitions in a plain dict.
# NOTE: this rebinds the name `split_dataset`, which shadows the helper function
# of the same name defined in an earlier cell.
split_dataset = dict(
    train=dataset_split["train"],
    validation=validation_test_split["train"],
    test=validation_test_split["test"],
)

# Original columns of the dataset; all of them are dropped from the mapped results.
_raw_columns = ['data_source_description', 'input', 'license', 'module', 'output', 'path', 'repo_name', 'repo_url']

# Tokenize the train and validation partitions with preprocess_function.
train_data_token = split_dataset['train'].map(preprocess_function, remove_columns=list(_raw_columns))
val_data_token = split_dataset['validation'].map(preprocess_function, remove_columns=list(_raw_columns))

# The test partition is mapped without a function; only the column removal is requested.
test_dataset = split_dataset['test'].map(remove_columns=list(_raw_columns))


ftdata_total.jsonl:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5687 [00:00<?, ? examples/s]

Map:   0%|          | 0/3980 [00:00<?, ? examples/s]

Map:   0%|          | 0/853 [00:00<?, ? examples/s]

Map:   0%|          | 0/854 [00:00<?, ? examples/s]

# Setting up the Trainer with huggingface SFTTrainer

Configure the training arguments and the trainer, using the validation split as the evaluation dataset.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq, AutoModelForSequenceClassification
from unsloth import is_bfloat16_supported

# Hyper-parameters for the fine-tuning run.
# Effective batch size per optimizer step is 2 (per device) x 4 (accumulation) = 8.
trainingargs = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 100,
        num_train_epochs = 3, #3 epochs full training run.
        logging_steps = 100,
        # Evaluate on the eval dataset every 100 steps.
        eval_steps=100,
        eval_strategy="steps",
        learning_rate = 8e-5,
        # Exactly one of fp16 / bf16 must be enabled. The helper has to be
        # *called*: passing the bare function object is always truthy and would
        # request bf16 even on hardware where fp16 was selected.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit",
        #weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        overwrite_output_dir=True,
        report_to = "none"
    )

# Build the trainer from the adapted model, the pre-tokenized train/validation
# sets and the training arguments defined above.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    eval_dataset=val_data_token,
    train_dataset = train_data_token,
    max_seq_length = max_seq_length,
    # Batches are assembled by a DataCollatorForSeq2Seq built from the same tokenizer.
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    args = trainingargs,
    #compute_metrics=compute_metrics,
)

Verifying if masking is done correctly

In [None]:
# Decode one tokenized training example back to text to inspect what the model is fed.
tokenizer.decode(trainer.train_dataset[10]["input_ids"])

In [None]:
# Token id of a single space, used as a stand-in for ignored label positions.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
# -100 is not a valid token id, so swap it for the space id before decoding the labels.
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

**Here the test dataset is prepared for evaluation. The outputs generated by the model are collected and later saved in the JSON file "generated_outputs_2.json".**

In [None]:
import torch
import random
import json
from transformers import GenerationConfig, TextStreamer
# Prepare the model for inference using Unsloth
from unsloth import FastLanguageModel
FastLanguageModel.for_inference(model)

input_texts = split_dataset['test']['input']

# Wrap every test input as a chat-style message; only the "content" field is used below.
prompts = [{"role": "user", "content": input_text} for input_text in input_texts]

# Restrict the test dataset, because of runtime duration.
prompt_text = prompts[0]["content"]
num_random_prompts = 500
sample_seed = 42

# Never request more samples than exist (random.sample raises ValueError in
# that case), and draw from a dedicated seeded generator so the same subset of
# prompts is selected on every run.
sample_size = min(num_random_prompts, len(prompts))
random_indices = random.Random(sample_seed).sample(range(len(prompts)), sample_size)

decoded_texts = []

# Show the first test prompt as a quick sanity check.
print(prompt_text)

# The generation settings are identical for every prompt, so build them once
# instead of on every loop iteration.
generation_config = GenerationConfig(
#    temperature=0.9,       # More controlled output
#    top_p=0.9,             # Nucleus sampling for variety
#    top_k=40,              # Limits randomness
    max_new_tokens=1800,    # Ensures proper output length
    do_sample=True,        # Enables sampling instead of greedy decoding
#    pad_token_id=tokenizer.pad_token_id,
#    eos_token_id=tokenizer.eos_token_id,  # Stops generation properly
)

for index in random_indices:
    # Convert text to tokens
    prompt_text = prompts[index]["content"]
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    # Text streamer for better readability; a fresh one is used per prompt.
    text_streamer = TextStreamer(tokenizer)
    outputs = model.generate(**inputs, streamer = text_streamer, generation_config=generation_config)

    # Decode the full returned sequence (special tokens stripped).
    decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Collect decoded texts in a list; order matches random_indices.
    decoded_texts.append(decoded_text)


---
- name: Install ldap-client
  apt: name={{item}} state=present
  with_items:
   - libnss-ldap
   - libpam-ldap
   - nscd

- name: Create SSL directory for ldap
  file: path=/etc/ldap/ssl/certs state=directory

- name: Copy cert.pem
  copy: src={{sensu_ssl_crt_file}} dest=/etc/ldap/ssl/certs/ssl_crt.pem

- name: Copy key.pem
  copy: src={{sensu_ssl_key_file}} dest=/etc/ldap/ssl/certs/ssl_key.pem

- name: Configure ldap
  template: src=ldap.j2 dest=/etc/ldap/ldap.conf

- name: Update nsswitch.conf
  template: src=nsswitch.j2 dest=/etc/nsswitch.conf

- name: Update common-sessions
  template: src=common_sessions.j2 dest=/etc/pam.d/common-session

- name: Restart nscd
---

- name: SAP Install Media Detect - Organize all files - Copy files to {{ sap_install_media_detect_target_directory }}
  ansible.builtin.copy:
    src: "{{ sap_install_media_detect_source_directory }}/{{ line_item.file }}"
    dest: "{{ sap_install_media_detect_target_directory }}/{{ line_item.file }}"
    remote_src:

Saving the generated outputs as a file!

In [None]:
# Persist all generated texts as one JSON array (4-space indentation for readability).
with open("generated_outputs_2.json", "w") as f:
    f.write(json.dumps(decoded_texts, indent=4))

Run ansible-lint on the generated outputs and extract the YAML code sections. This step checks whether the generated sections are syntactically correct.

In [None]:
import re
import tempfile
import numpy as np

def extract_yaml_sections(input_text):
    """Return the contents of every ```yaml fenced block in ``input_text``.

    A block starts after a line consisting of "```yaml" and runs until the next
    closing "```" fence. If the last block is never closed (for example because
    generation was cut off), it runs to the end of the string.

    Args:
        input_text: Text that may contain fenced YAML blocks.

    Returns:
        A list with the body of each block, in order of appearance. The list is
        empty when no block is found.
    """
    # Lazy match up to the closing fence; \Z is the fallback for an unterminated
    # final block. DOTALL lets "." span line breaks.
    yaml_pattern = re.compile(r"```yaml\n(.*?)(?:\n```|\Z)", re.DOTALL)

    # findall always returns a list (never None), so no extra emptiness check is needed.
    return yaml_pattern.findall(input_text)

# The generated texts are linted as they are; extract_yaml_sections() is not applied to them here.
yaml_sections = decoded_texts

def calculate_ansible_lint_scores(yaml_sections, lint_command=("ansible-lint",)):
    """Lint each text with ansible-lint and score it 1 (clean) or 0 (failures).

    Every section is written to its own temporary ``.yml`` file, the linter is
    run on that file, and the "<N> failure(s)" summary in its output decides the
    score. The temporary file is removed afterwards.

    Args:
        yaml_sections: Iterable of strings, each linted separately.
        lint_command: Command (sequence of arguments) to run; the path of the
            temporary file is appended as the last argument.

    Returns:
        A list of ints, one per section: 1 if zero failures were reported,
        otherwise 0.

    Raises:
        FileNotFoundError: If the lint executable cannot be found.
    """
    import os
    import subprocess

    scores = []

    for section in yaml_sections:
        # Leave the `with` block before linting so the content is flushed and
        # the file is closed; otherwise the linter may read an empty file.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmpfile:
            tmpfile.write(section)
            tmpfile_path = tmpfile.name

        try:
            # Run the linter and capture both output streams.
            completed = subprocess.run(
                [*lint_command, tmpfile_path],
                capture_output=True,
                text=True,
                errors="replace",
                check=False,
            )
        finally:
            # delete=False means nobody else cleans the file up.
            os.unlink(tmpfile_path)

        output = completed.stdout + completed.stderr

        # Look for "X failure(s)" in the output and read X.
        match = re.search(r"(\d+)\s+failure\(s\)", output)
        if match:
            failures = int(match.group(1))
        else:
            # No summary line: fall back to the exit status instead of silently
            # treating a crashed or misconfigured linter as a pass.
            failures = 0 if completed.returncode == 0 else 1

        # Scoring: score is 1 if 0 failures, score is 0 otherwise
        score = 1 if failures == 0 else 0

        print("SCORE: ", score)
        scores.append(score)

    return scores

# Calculate scores for all yaml_sections
scores_array = calculate_ansible_lint_scores(yaml_sections)

# Evaluate the overall average score; since each score is 0 or 1, the mean is
# the fraction of sections that were scored 1.
overall_score = np.mean(scores_array)

print(f"Ansible Lint Scores: {scores_array}")
print(f"Overall Ansible Lint Score: {overall_score}")


Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 3
Failed checks: 13
SCORE:  0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 3
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 3
Failed checks: 13
SCORE:  0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 3
Failed checks: 13
SCORE:  0
Total checks: 3
Failed checks: 13
SCORE:  0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 0
Failed checks: 2
SCORE:  1.0
Total checks: 3
Failed checks: 13
SCORE:  0
Total checks: 2

Evaluation metrics

In [None]:
!pip install sacrebleu
!pip install evaluate
!pip install rouge_score
!pip install meteor
!pip install chrf

from evaluate import load

# Load the metric implementations by name through `evaluate.load`.
rouge = load("rouge")
#bleu = load("bleu")
meteor = load("meteor")
chrf = load("chrf")

# Starts out empty; it is reassigned with the truncated predictions further down.
modified_predictions = []

def calculate_rouge_with_length_manipulation(decoded_predictions, decoded_references):
    """Truncate each prediction to the character length of its reference.

    Predictions and references are paired positionally; pairing stops at the
    shorter of the two sequences. A prediction that is already no longer than
    its reference is returned unchanged.

    Args:
        decoded_predictions: Sequence of predicted strings.
        decoded_references: Sequence of reference strings.

    Returns:
        A new list with one (possibly truncated) prediction per pair.
    """
    # Build a fresh list on every call. Appending to a shared module-level list
    # made repeated calls accumulate results from earlier invocations.
    return [
        pred[:len(ref)]
        for pred, ref in zip(decoded_predictions, decoded_references)
    ]


# These are the outputs generated by the model with the test dataset
decoded_predictions = decoded_texts

# Only the rows at random_indices were generated, so build the references for
# exactly those rows, in the same order. Each reference is input + output.
decoded_references = [split_dataset['test'][i]['input'] + split_dataset['test'][i]['output'] for i in random_indices]

# Cut every prediction down to the character length of its reference.
modified_predictions = calculate_rouge_with_length_manipulation(decoded_predictions, decoded_references)

print(decoded_predictions)
print(modified_predictions)

# Calculate ROUGE score
rouge_results = rouge.compute(predictions=modified_predictions,
                               references=decoded_references,
                               use_aggregator=True)

# Calculate BLEU score
#bleu_results = bleu.compute(predictions=modified_predictions,
#                             references=decoded_references)

# Calculate METEOR score
meteor_results = meteor.compute(predictions=modified_predictions,
                                 references=decoded_references)

# Calculate chrF score
chrf_results = chrf.compute(predictions=modified_predictions,
                              references=decoded_references)


# Report the metric results.
print(f'Model - ROUGE: \n{rouge_results}\n')
#print(f'Model - BLEU: \n{bleu_results}\n')
print(f'Model - METEOR: \n{meteor_results}\n')
print(f'Model - chrF: \n{chrf_results}\n')

Now we train the model with the configuration defined above.

In [None]:
# Run the fine-tuning loop and keep the value returned by train().
# NOTE(review): an earlier cell calls FastLanguageModel.for_inference(model);
# confirm the model is back in training mode before this cell runs.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,980 | Num Epochs = 3 | Total steps = 1,491
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 65,536,000/4,000,000,000 (1.64% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
100,1.3416,0.672711
200,0.5042,0.31467
300,0.3611,0.172971
400,0.2943,0.115192
500,0.2065,0.07076
600,0.1448,0.051613
700,0.1374,0.049662
800,0.1084,0.047524
900,0.099,0.044999
1000,0.0838,0.042229


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


# Post training steps: Output the training statistics and save the model

In [None]:
# Save the trained model to Drive; this is one of the paths STRING_MODEL can be pointed at for reloading.
trainer.save_model("/content/drive/MyDrive/finetuned_phi4_second_iteration")

In [None]:
import pandas as pd
# Collect the trainer's logged history into a DataFrame.
df = pd.DataFrame(trainer.state.log_history)

In [None]:
# Export the table to CSV without the index column.
# NOTE(review): the file name mentions predictions, but `df` holds trainer.state.log_history.
df.to_csv('ansible_playbook_predictions.csv', index=False)

In [None]:
# Display the log history as a table (built from the same source as `df`).
pd.DataFrame(trainer.state.log_history)

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,1.3416,0.20829,8e-05,0.201005,100,,,,,,,,,
1,,,,0.201005,100,0.672711,8.0082,1.249,0.624,,,,,
2,0.5042,0.284414,7.4e-05,0.40201,200,,,,,,,,,
3,,,,0.40201,200,0.31467,6.7721,1.477,0.738,,,,,
4,0.3611,0.368941,6.8e-05,0.603015,300,,,,,,,,,
5,,,,0.603015,300,0.172971,6.7802,1.475,0.737,,,,,
6,0.2943,0.360233,6.3e-05,0.80402,400,,,,,,,,,
7,,,,0.80402,400,0.115192,6.7657,1.478,0.739,,,,,
8,0.2065,0.344204,5.7e-05,1.00402,500,,,,,,,,,
9,,,,1.00402,500,0.07076,6.7772,1.476,0.738,,,,,
