In [None]:
# Install PyTorch with CUDA support
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install transformers and related libraries
pip install transformers>=4.36.0
pip install datasets
pip install peft>=0.7.0
pip install trl
pip install bitsandbytes
pip install accelerate
pip install flash-attn --no-build-isolation

# Optional: For monitoring
pip install wandb
pip install pynvml

# For TensorRT optimization (optional)
pip install tensorrt

In [None]:
#!/usr/bin/env python3
"""
Optimized Fine-tuning Pipeline for Meta Llama 3.2 3B
Using NVIDIA AI Software Stack (CUDA, cuDNN, TensorRT, PyTorch)
"""

import os
import gc
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import wandb
from trl import SFTTrainer
import tensorrt as trt
import pynvml

In [None]:
from huggingface_hub import login
import os

# Read the Hugging Face access token from the HUGGINGFACE_TOKEN environment
# variable and authenticate with the Hub.
# NOTE(review): this reads the process environment, not Kaggle Secrets itself
# (a later cell uses kaggle_secrets.UserSecretsClient for that) — confirm the
# variable is actually exported before this cell runs.
# os.environ.get() yields None when the variable is unset.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")  # or use os.environ["HUGGINGFACE_TOKEN"]
login(token=hf_token)


In [None]:
# Runtime tuning for NVIDIA GPUs: CUDA allocator configuration and tokenizer
# parallelism are set through environment variables, cuDNN through torch flags.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
        "TOKENIZERS_PARALLELISM": "false",
    }
)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# Base checkpoint to fine-tune, and the name used for the fine-tuned output.
output_model = "CodeLlama-7B-Instruct-COBOL-to-Python"
model_name = "codellama/CodeLlama-7B-Instruct-hf"

In [3]:
%%capture
# Upgrade (-U) the fine-tuning / quantization stack quietly (-q). --no-deps
# installs only the packages listed here, not their dependencies; %%capture
# hides the installer output.
# NOTE(review): this install cell sits after cells that already import some of
# these packages — on a fresh kernel it should run before them.
%pip install -U -q --no-deps xformers trl transformers datasets peft accelerate huggingface_hub bitsandbytes optimum auto-gptq gekko

In [None]:
class OptimizedLlamaFineTuner:
    """QLoRA-style fine-tuning helper for a causal language model on NVIDIA GPUs.

    The instance is driven by a plain ``config`` dict. Every key is optional;
    the keys read by the methods are: ``model_name``, ``use_flash_attention``,
    ``lora_r``, ``lora_alpha``, ``lora_dropout``, ``dataset_name``,
    ``max_length``, ``max_samples``, ``output_dir``, ``num_epochs``,
    ``batch_size``, ``gradient_accumulation_steps``, ``learning_rate``,
    ``use_wandb``, ``wandb_project`` and ``experiment_name``.

    Expected call order: ``load_model_and_tokenizer`` -> ``setup_lora`` ->
    ``load_and_prepare_dataset`` -> ``train``.
    """

    def __init__(self, config):
        """Store ``config`` and validate/report the GPU environment.

        Args:
            config: dict of optional settings (see the class docstring).

        Raises:
            RuntimeError: if CUDA is not available.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.setup_nvidia_environment()
        self.tokenizer = None
        self.model = None
        self.dataset = None

    @staticmethod
    def _decode_gpu_name(raw_name):
        """Return the GPU name as ``str`` whether NVML handed back bytes or str."""
        return raw_name.decode() if isinstance(raw_name, bytes) else raw_name

    @staticmethod
    def _ampere_or_newer():
        """Return True when the current CUDA device has compute capability >= 8."""
        return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

    def _report_targets(self):
        """Return the ``report_to`` value: ``"wandb"`` if enabled, else ``"none"``."""
        # The literal string "none" is what disables every logging integration;
        # passing None leaves the choice to the library default.
        return "wandb" if self.config.get("use_wandb", False) else "none"

    @staticmethod
    def _format_alpaca_batch(examples):
        """Turn a batch of Alpaca-style columns into a single ``text`` column.

        Args:
            examples: mapping with ``instruction`` and ``output`` lists and an
                optional ``input`` list of the same length.

        Returns:
            ``{"text": [...]}`` with one formatted prompt per example.
        """
        instructions = examples["instruction"]
        inputs = examples.get("input", [""] * len(instructions))
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, examples["output"]):
            if input_text:
                prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
            texts.append(prompt)
        return {"text": texts}

    def setup_nvidia_environment(self):
        """Check that CUDA is usable, print GPU details and enable TF32 if possible.

        Raises:
            RuntimeError: if CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available!")

        # Initialize NVML for GPU monitoring
        pynvml.nvmlInit()
        gpu_count = pynvml.nvmlDeviceGetCount()

        print(f"🚀 NVIDIA Setup:")
        print(f"   - CUDA Version: {torch.version.cuda}")
        print(f"   - cuDNN Version: {torch.backends.cudnn.version()}")
        print(f"   - Available GPUs: {gpu_count}")

        for i in range(gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            # Depending on the pynvml release the name is bytes or str.
            name = self._decode_gpu_name(pynvml.nvmlDeviceGetName(handle))
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            print(f"   - GPU {i}: {name} ({memory.total // 1024**3} GB)")

        # Enable TensorFloat-32 for A100/RTX 30xx series
        if self._ampere_or_newer():
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
            print("   - TensorFloat-32 enabled for Ampere+ GPUs")

    def load_model_and_tokenizer(self):
        """Load the tokenizer and the 4-bit quantized base model.

        The checkpoint id comes from ``config["model_name"]`` and falls back to
        the notebook-level ``model_name`` variable. bfloat16 and
        FlashAttention-2 are only requested on compute capability >= 8;
        otherwise float16 and eager attention are used.
        """
        base_model = self.config.get("model_name", model_name)
        ampere = self._ampere_or_newer()
        compute_dtype = torch.bfloat16 if ampere else torch.float16

        wants_flash = self.config.get("use_flash_attention", True)
        if wants_flash and not ampere:
            print("   - FlashAttention-2 disabled: GPU compute capability is below 8.0")
        attn_implementation = "flash_attention_2" if (wants_flash and ampere) else "eager"

        # BitsAndBytesConfig for 4-bit quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )

        print("📥 Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_model,
            trust_remote_code=True,
            padding_side="right"
        )

        # Fall back to EOS as the padding token when none is defined.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("📥 Loading model with 4-bit quantization...")
        self.model = AutoModelForCausalLM.from_pretrained(
            base_model,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=compute_dtype,
            attn_implementation=attn_implementation
        )

        # Prepare model for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)

        print(f"✅ Model loaded on: {self.model.device}")

    def setup_lora(self):
        """Wrap ``self.model`` with LoRA adapters and print parameter counts.

        Rank, alpha and dropout come from ``config`` (``lora_r``,
        ``lora_alpha``, ``lora_dropout``); the attention and MLP projection
        modules are targeted.
        """
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=self.config.get("lora_r", 16),
            lora_alpha=self.config.get("lora_alpha", 32),
            lora_dropout=self.config.get("lora_dropout", 0.1),
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias="none",
        )

        self.model = get_peft_model(self.model, lora_config)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in self.model.parameters())

        print(f"🎯 LoRA Configuration:")
        print(f"   - Trainable parameters: {trainable_params:,}")
        print(f"   - Total parameters: {total_params:,}")
        print(f"   - Trainable %: {100 * trainable_params / total_params:.2f}%")

    def load_and_prepare_dataset(self):
        """Load the dataset, normalise it to a ``text`` column and length-filter it.

        ``config["dataset_name"]`` may be a Hub dataset id (its ``train`` split
        is loaded) or an already-loaded dataset object. Alpaca-style columns
        are rendered into prompts; otherwise a ``text`` column is required.

        Raises:
            RuntimeError: if the tokenizer has not been loaded yet.
            ValueError: if the dataset has neither Alpaca columns nor ``text``.
        """
        # The length filter below tokenizes every sample, so fail early and
        # clearly instead of with an AttributeError on None.
        if self.tokenizer is None:
            raise RuntimeError(
                "Tokenizer is not loaded; call load_model_and_tokenizer() "
                "before load_and_prepare_dataset()."
            )

        dataset_name = self.config.get("dataset_name", "tatsu-lab/alpaca")

        print(f"📊 Loading dataset: {dataset_name}")

        # Load dataset
        if isinstance(dataset_name, str):
            dataset = load_dataset(dataset_name, split="train")
        else:
            # Handle custom dataset loading
            dataset = dataset_name

        # Prepare dataset based on format
        if "instruction" in dataset.column_names and "output" in dataset.column_names:
            # Alpaca format
            dataset = dataset.map(
                self._format_alpaca_batch,
                batched=True,
                remove_columns=dataset.column_names,
            )

        elif "text" not in dataset.column_names:
            raise ValueError("Dataset must have 'text' column or Alpaca format (instruction/output)")

        # Filter by length to avoid OOM
        max_length = self.config.get("max_length", 2048)
        dataset = dataset.filter(lambda x: len(self.tokenizer.encode(x["text"])) <= max_length)

        # Take subset if specified
        if self.config.get("max_samples"):
            dataset = dataset.select(range(min(len(dataset), self.config["max_samples"])))

        self.dataset = dataset
        print(f"✅ Dataset prepared: {len(dataset)} samples")

    def setup_training_arguments(self):
        """Build the ``TrainingArguments`` used by :meth:`train`.

        Mixed precision follows the hardware: bf16 + TF32 on compute
        capability >= 8, fp16 otherwise.

        Returns:
            A ``TrainingArguments`` instance.
        """
        ampere = self._ampere_or_newer()
        return TrainingArguments(
            output_dir=self.config.get("output_dir", "./" + output_model),

            # Training hyperparameters
            num_train_epochs=self.config.get("num_epochs", 3),
            per_device_train_batch_size=self.config.get("batch_size", 4),
            gradient_accumulation_steps=self.config.get("gradient_accumulation_steps", 4),
            learning_rate=self.config.get("learning_rate", 2e-4),
            lr_scheduler_type="cosine",
            warmup_ratio=0.1,

            # NVIDIA optimizations
            bf16=ampere,  # Use bfloat16 for Ampere+ GPUs
            fp16=not ampere,  # float16 mixed precision on older GPUs
            tf32=ampere,  # Enable TensorFloat-32 only where the hardware has it
            dataloader_pin_memory=True,
            dataloader_num_workers=4,

            # Memory optimizations
            gradient_checkpointing=True,
            optim="adamw_torch_fused",  # Fused optimizer for NVIDIA GPUs

            # Logging and saving
            logging_steps=10,
            save_steps=500,
            save_total_limit=3,
            # Evaluation stays disabled: "no" is already the default strategy,
            # so the keyword (whose name differs between releases) is omitted.

            # Additional optimizations
            remove_unused_columns=False,
            report_to=self._report_targets(),
            run_name=f"llama-3.2-3b-finetune-{self.config.get('experiment_name', 'default')}",

            # DDP settings (if using multiple GPUs)
            ddp_find_unused_parameters=False,
        )

    def train(self):
        """Run supervised fine-tuning and save the model and tokenizer.

        Uses ``self.model``, ``self.tokenizer`` and ``self.dataset``; the
        result is written to the ``output_dir`` of the training arguments.
        """
        print("🚀 Starting fine-tuning process...")

        # Initialize wandb if enabled
        if self.config.get("use_wandb", False):
            wandb.init(
                project=self.config.get("wandb_project", "llama-finetune"),
                name=f"llama-3.2-3b-{self.config.get('experiment_name', 'default')}"
            )

        # Setup training arguments
        training_args = self.setup_training_arguments()

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # Initialize SFTTrainer (optimized for instruction tuning)
        # NOTE(review): tokenizer= / dataset_text_field= / max_seq_length= are
        # constructor keywords of the TRL release this was written against —
        # confirm they are still accepted by the installed TRL version.
        trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.dataset,
            data_collator=data_collator,
            args=training_args,
            tokenizer=self.tokenizer,
            dataset_text_field="text",
            max_seq_length=self.config.get("max_length", 2048),
            packing=False,  # Disable packing to avoid issues with instruction format
        )

        # Clear cache before training
        torch.cuda.empty_cache()
        gc.collect()

        # Start training
        print("🎯 Training started...")
        trainer.train()

        # Save the final model
        print("💾 Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)

        print("✅ Fine-tuning completed!")

    def optimize_for_inference(self, model_path):
        """Export the fine-tuned model as a TorchScript trace (optional).

        Despite the TensorRT mention in the notebook, this method only traces
        the model with ``torch.jit.trace`` and saves it to
        ``<model_path>_optimized/traced_model.pt``.

        Args:
            model_path: directory of the fine-tuned model to load.
        """
        print("⚡ Optimizing model for inference...")

        # Load the fine-tuned model. torchscript=True makes the model return
        # tuples instead of dict-like outputs, which tracing requires.
        model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, torchscript=True
        )
        model.eval()

        # Convert to TorchScript (dummy batch of 1 x 512 token ids)
        example_ids = torch.randint(0, 1000, (1, 512))
        with torch.no_grad():
            traced_model = torch.jit.trace(model, example_inputs=(example_ids,))

        # Save optimized model; the target directory does not exist yet.
        optimized_path = f"{model_path}_optimized"
        os.makedirs(optimized_path, exist_ok=True)
        traced_model.save(f"{optimized_path}/traced_model.pt")

        print(f"✅ Optimized model saved to: {optimized_path}")

    def monitor_gpu_usage(self):
        """Print allocated and reserved CUDA memory (in GB) for every visible GPU."""
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                memory_allocated = torch.cuda.memory_allocated(i) / 1024**3
                memory_reserved = torch.cuda.memory_reserved(i) / 1024**3
                print(f"GPU {i}: {memory_allocated:.2f}GB allocated, {memory_reserved:.2f}GB reserved")


In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftConfig, get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging,
                          Trainer,
                          DataCollatorForLanguageModeling)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

2025-06-13 05:35:53.835404: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749792954.026516      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749792954.077577      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Fetch two secrets from the Kaggle secret store through UserSecretsClient.
# NOTE(review): neither value is passed to huggingface_hub.login() in this
# cell, and secret_value_0 is reassigned by a later cell that repeats this
# lookup with a different secret name.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGING_FACE_WRITE_API_KEY")
secret_value_1 = user_secrets.get_secret("HUGGINGFACE_TOKEN")


In [6]:
# from huggingface_hub import snapshot_download

# # Model details
# model_id = "codellama/CodeLlama-7B-Instruct-hf"
# local_dir = "/kaggle/working/codellama"

# # Download full snapshot locally
# model_path = snapshot_download(
#     repo_id=model_id,
#     local_dir=local_dir,
#     local_dir_use_symlinks=False
# )

In [7]:
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch

# Pre-quantized GPTQ checkpoint used for the inference demo in the next cells.
# NOTE(review): this rebinds the global `model_name` that the configuration
# cell set to "codellama/CodeLlama-7B-Instruct-hf" and that
# OptimizedLlamaFineTuner.load_model_and_tokenizer reads when it is called —
# calling that method after this cell would pick up this checkpoint instead.
model_name = "TheBloke/CodeLlama-7B-Instruct-GPTQ"

# Resolve the target device to a plain string: first CUDA device if present, else CPU.
device_target = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; the recorded output shows configuration_llama.py
# and modeling_llama.py being downloaded — consider pinning a revision.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoGPTQForCausalLM.from_quantized(
    model_name,
    use_safetensors=True,
    trust_remote_code=True,
    device=device_target,  # 💡 explicitly resolved string
    use_triton=False
)

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

configuration_llama.py:   0%|          | 0.00/8.56k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ:
- configuration_llama.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.


quantize_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

modeling_llama.py:   0%|          | 0.00/45.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ:
- modeling_llama.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
INFO - The layer lm_head is not quantized.


In [8]:
# torch.cuda.empty_cache()

# # running on kaggle
# # Load tokenizer and model from local downloaded path
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     return_dict=True,
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     trust_remote_code=True,
# )

In [9]:
# # if not running on kaggle
# model_id = "codellama/CodeLlama-7b-Instruct-hf"

# tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     load_in_4bit=True,
#     device_map="auto",
#     torch_dtype="auto",
#     trust_remote_code=True
# )

In [10]:
# Create inference pipeline
# Wraps the already-loaded GPTQ `model` and its `tokenizer` in a
# text-generation pipeline.
# NOTE(review): the recorded output warns that 'LlamaGPTQForCausalLM' is not in
# the list of supported text-generation classes, although the following cell
# still produces text with it.
# NOTE(review): torch_dtype / device_map are passed next to an instantiated
# model; presumably they only take effect when `model` is a checkpoint name —
# confirm.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

Device set to use cuda:0
The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMH

In [12]:
# do_sample=True draws tokens from torch's random number generator; seed it so
# that re-running this cell on the same setup can reproduce the completion.
torch.manual_seed(42)

# Baseline check of the un-tuned model on a plain-text question.
prompt = "How to convert COBOL code into Python?"
outputs = pipe(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])

How to convert COBOL code into Python?

I have a COBOL code which I want to convert into Python. Can someone please help me understand the conversion process?

Comment: Is the COBOL code in a file? If so, have you attempted anything yet?

Comment: Yes, it is in a file. I have not attempted anything yet. I am a newbie to python and COBOL.

Comment: No, not really. I am looking for a step by step conversion guide.

Comment: I've put together a step-by-step guide.  It


## Combine datasets
| Dataset Name                    | HF ID                                            | Contains                  | Use                                |
| ------------------------------- | ------------------------------------------------ | ------------------------- | ---------------------------------- |
| **MainframeBench**              | `Fsoft-AIC/MainframeBench`                       | COBOL code + descriptions | Base COBOL understanding           |
| **The Stack**                   | `bigcode/the-stack`                              | COBOL + other languages   | Language variety + COBOL samples   |
| **Python Code Dataset**         | `jtatman/python-code-dataset-500k`               | Python code               | Target code corpus                 |
| **SantaCoder Fine-tuned COBOL** | `muhtasham/santacoder-finetuned-the-stack-cobol` | Pretrained model          | Base model for COBOL understanding |
| **General Code**                | `codeparrot/github-code`                         | Multi-language            | Extra fine-tuning                  |


In [13]:
# from datasets import load_dataset

# dataset = load_dataset("Fsoft-AIC/MainframeBench", split="train[:100]")  # sample subset

# # Dummy Python generation for demo (replace with real aligned translations)
# dataset = dataset.map(lambda x: {"tgt": "# Python translation of: " + x["code"]})


In [14]:
# Fetch the Hugging Face token from the Kaggle secret store.
# NOTE(review): this repeats an earlier cell and rebinds secret_value_0, which
# that cell had set from "HUGGING_FACE_WRITE_API_KEY"; the value is not used
# within this cell.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGINGFACE_TOKEN")

In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets

# Load relevant subsets
# MainframeBench: the "COBOL_code_summarization" configuration, train split.
mainframe = load_dataset("Fsoft-AIC/MainframeBench", "COBOL_code_summarization", split="train")
# mainframe_small = mainframe.shuffle(seed=42).select(range(1000)) # for kaggle

# full dataset (3TB of data)
# ds = load_dataset("bigcode/the-stack", split="train")

# Language-specific subsets of The Stack (COBOL and Python), sliced to the
# first 20% of each train split.
# NOTE(review): the recorded output shows 206 parquet shards (roughly 380 MB
# each) being fetched, apparently for the Python subset, so the "[:20%]" slice
# does not seem to reduce the download — confirm disk/bandwidth before running.
stack_cobol = load_dataset("bigcode/the-stack", data_dir="data/cobol", split="train[:20%]")
stack_python = load_dataset("bigcode/the-stack", data_dir="data/python", split="train[:20%]")

python_set = load_dataset("jtatman/python-code-dataset-500k", split="train")

# Combine relevant Python corpora
# NOTE(review): assumes stack_python and python_set have compatible columns —
# TODO confirm their schemas before concatenating.
python_combined = concatenate_datasets([stack_python, python_set])

# Now build translation dataset
# TODO: `mainframe` and `stack_cobol` are loaded above but not yet used in this cell.


README.md:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

summary.csv:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2523 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.13M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Resolving data files:   0%|          | 0/206 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/206 [00:00<?, ?files/s]

train-00000-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00001-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00002-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00003-of-00206.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00004-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00005-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00006-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00007-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00008-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00009-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00010-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00011-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00012-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00013-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00014-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00015-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00016-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00017-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00018-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00019-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00020-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00021-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00022-of-00206.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00023-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00024-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00025-of-00206.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

train-00026-of-00206.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00027-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00028-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00029-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00030-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00031-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00032-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00033-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00034-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00035-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00036-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00037-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00038-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00039-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00040-of-00206.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

train-00041-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00042-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00043-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00044-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00045-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00046-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00047-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00048-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00049-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00050-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00051-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00052-of-00206.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00053-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00054-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00055-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00056-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00057-of-00206.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00058-of-00206.parquet:   0%|          | 0.00/389M [00:00<?, ?B/s]

train-00059-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00060-of-00206.parquet:   0%|          | 0.00/390M [00:00<?, ?B/s]

train-00061-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00062-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00063-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00064-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00065-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00066-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00067-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00068-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00069-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00070-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00071-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00072-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00073-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00074-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00075-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00076-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00077-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00078-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00079-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00080-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00081-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00082-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00083-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00084-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00085-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00086-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00087-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00088-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00089-of-00206.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00090-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00091-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00092-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00093-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00094-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00095-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00096-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00097-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00098-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00099-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00100-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00101-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00102-of-00206.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

train-00103-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00104-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00105-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00106-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00107-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00108-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00109-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00110-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00111-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00112-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00113-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00114-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00115-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00116-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00117-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00118-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00119-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00120-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00121-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00122-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00123-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00124-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00125-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00126-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00127-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00128-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00129-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00130-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00131-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00132-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00133-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00134-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00135-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00136-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00137-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00138-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00139-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00140-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00141-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00142-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00143-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00144-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00145-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00146-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00147-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00148-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00149-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00150-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00151-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00152-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00153-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00154-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00155-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00156-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00157-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00158-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00159-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00160-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00161-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00162-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00163-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00164-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00165-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00166-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00167-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00168-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00169-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00170-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00171-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00172-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00173-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00174-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00175-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00176-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00177-of-00206.parquet:   0%|          | 0.00/388M [00:00<?, ?B/s]

train-00178-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00179-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00180-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00181-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00182-of-00206.parquet:   0%|          | 0.00/390M [00:00<?, ?B/s]

train-00183-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00184-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00185-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00186-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00187-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00188-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00189-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00190-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00191-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00192-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00193-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00194-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00195-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00196-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00197-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00198-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00199-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00200-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00201-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00202-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00203-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00204-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00205-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Format MainframeBench COBOL data
def format_mainframe(example):
    """Map one MainframeBench row onto the shared ``input``/``output`` schema.

    Args:
        example: Mapping with a ``"cobol_code"`` entry.

    Returns:
        dict: ``"input"`` holds the row's COBOL source; ``"output"`` holds a fixed
        placeholder comment, because no Python translation is attached here.
    """
    placeholder = "# Python translation to be generated or is unknown for now."
    return dict(input=example["cobol_code"], output=placeholder)

# Bring the MainframeBench rows into the shared {"input", "output"} schema.
mainframe_formatted = mainframe.map(format_mainframe)

# Pair the i-th COBOL file with the i-th Python file from The Stack, stopping at the
# shorter of the two collections.
# NOTE(review): the pairing is purely positional — nothing here checks that the two
# files are translations of each other; confirm this is the intended training signal.
pair_count = min(len(stack_cobol), len(python_combined))
paired_data = [
    {
        "input": stack_cobol[row]["content"],
        "output": python_combined[row]["content"],
    }
    for row in range(pair_count)
]

# Positional pairs first, then the MainframeBench rows carrying the placeholder output.
all_rows = paired_data + list(mainframe_formatted)
combined_data = Dataset.from_list(all_rows)


In [None]:
# Alias the combined dataset under the name the train/eval/test split cells operate on.
translation_dataset = combined_data

In [None]:
from datasets import DatasetDict

# Split 80% train, 10% eval, 10% test.
# Step 1 holds out 20% of the rows; step 2 halves that hold-out into eval and test.
# Both calls pass seed=42 so the shuffle is repeatable.
splits = translation_dataset.train_test_split(test_size=0.2, seed=42)
eval_test = splits["test"].train_test_split(test_size=0.5, seed=42)

# Collect the three partitions under explicit split names.
dataset_dict = DatasetDict({
    "train": splits["train"],
    "eval": eval_test["train"],
    "test": eval_test["test"]
})

In [None]:
# A custom dataset can be supplied as a JSON Lines file with one record per line, e.g.:
# {"src": "COBOL code here", "tgt": "Equivalent Python code here"}
# {"src": "Another COBOL snippet", "tgt": "Translated Python code"}
# dataset = load_dataset("json", data_files={"train": "cobol_python_dataset.jsonl"})["train"]


In [None]:
def format_prompt(example):
    """Render one example as an instruction/response prompt and tokenise it.

    Uses the notebook-level ``tokenizer``. The prompt is truncated or padded to
    exactly 1024 tokens.

    Args:
        example: Mapping with ``"src"`` (COBOL source) and ``"tgt"`` (Python target).

    Returns:
        dict: ``{"input_ids": ...}`` holding the first (only) row of the tokenizer's
        ``input_ids`` output. No attention mask is returned.
    """
    prompt_text = f"""### Instruction:
Convert the following COBOL code to Python:

{example['src']}

### Response:
{example['tgt']}"""
    encoding = tokenizer(
        prompt_text,
        max_length=1024,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # The tokenizer returns a batch of one; keep just that row.
    first_row = encoding["input_ids"][0]
    return {"input_ids": first_row}

# Tokenise every row with format_prompt.
# NOTE(review): `dataset` is not assigned in the cells reviewed here (only in a
# commented-out example) — confirm it exists and exposes "src"/"tgt" columns.
tokenized_dataset = dataset.map(format_prompt)


In [None]:
# Optional: persist the tokenised dataset as JSON (path is relative to the working directory).
tokenized_dataset.to_json("codellama_cobol2python_dataset.json")

In [None]:
from datasets import DatasetDict

# Random split: 80% train, 10% eval, 10% test
# NOTE(review): this cell repeats the earlier split cell statement-for-statement
# (same source dataset, same seed), so it recomputes the same three partitions.
splits = translation_dataset.train_test_split(test_size=0.2, seed=42)
eval_test = splits["test"].train_test_split(test_size=0.5, seed=42)

# Collect the three partitions under explicit split names.
dataset_dict = DatasetDict({
    "train": splits["train"],
    "eval": eval_test["train"],
    "test": eval_test["test"]
})


In [None]:
# Materialise the three splits as pandas DataFrames for the prompt-building cells.
#
# The 80/10/10 proportions already come from the seeded train_test_split calls that
# built `dataset_dict`, so no manual index arithmetic is needed. (A DatasetDict cannot
# be sliced by row position, and len(dataset_dict) is the number of splits, not rows.)
#
# The dataset columns are "input"/"output", while generate_prompt and
# predict_code_translation read "src"/"tgt"; rename once at this boundary.
PROMPT_COLUMNS = {"input": "src", "output": "tgt"}

X_train = dataset_dict["train"].to_pandas().rename(columns=PROMPT_COLUMNS)
X_eval = dataset_dict["eval"].to_pandas().rename(columns=PROMPT_COLUMNS)
X_test = dataset_dict["test"].to_pandas().rename(columns=PROMPT_COLUMNS)

In [None]:
# Define the prompt generation functions for COBOL → Python conversion
def generate_prompt(data_point):
    """Build the full training prompt (instruction plus expected response).

    Args:
        data_point: Mapping with ``"src"`` (COBOL source) and ``"tgt"`` (Python target).

    Returns:
        str: The rendered prompt with leading and trailing whitespace removed.
    """
    cobol_source = data_point["src"]
    python_target = data_point["tgt"]
    rendered = f"""
### Instruction:
Convert the following COBOL code to Python:

{cobol_source}

### Response:
{python_target}"""
    return rendered.strip()

def generate_test_prompt(data_point):
    """Build the inference prompt: instruction only, response left open.

    Args:
        data_point: Mapping with ``"src"`` (COBOL source).

    Returns:
        str: The rendered prompt with surrounding whitespace removed, so it ends
        with the ``### Response:`` marker.
    """
    cobol_source = data_point["src"]
    rendered = f"""
### Instruction:
Convert the following COBOL code to Python:

{cobol_source}

### Response:
"""
    return rendered.strip()


In [None]:
# Generate prompts for training and evaluation data
X_train = X_train.assign(text=X_train.apply(generate_prompt, axis=1))
X_eval = X_eval.assign(text=X_eval.apply(generate_prompt, axis=1))

# Reference translations for scoring: the "tgt" column is what generate_prompt places
# after "### Response:". Stored as a plain list so positional indexing is well defined.
y_true = X_test["tgt"].tolist()

# Add the response-free prompt as "text" but keep "src", which
# predict_code_translation reads to build its own prompt.
X_test = X_test.assign(text=X_test.apply(generate_test_prompt, axis=1))

In [None]:
# Convert to datasets: wrap only the rendered "text" column of each frame in a
# Hugging Face Dataset; these are the train/eval inputs handed to the trainer.
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [None]:
# Spot-check one rendered training prompt (the row at position 3).
train_data['text'][3]

In [None]:
# Print the file path bitsandbytes was imported from, to confirm which install is in use.
import bitsandbytes
print(bitsandbytes.__file__)

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Set env var to avoid fragmentation (configured before the model is loaded below).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

base_model_name = local_dir  # your snapshot_download path

# 4-bit quantization config: NF4 weights, double quantization, float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,  # helps reduce memory
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the model in 4-bit. The device map is explicit, not automatic: the whole
# model ("" key) is mapped to device 0, with that device capped at 13GiB.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map={"": 0},
    max_memory={0: "13GiB"}, # for kaggle space constraint
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Memory optimizations
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()  # saves memory during training

# Load tokenizer and reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Create pipeline outside the loop for efficiency
def predict_code_translation(test_df, model, tokenizer):
    """Generate a Python translation for every COBOL snippet in ``test_df``.

    Args:
        test_df: pandas DataFrame with a ``"src"`` column of COBOL source strings.
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        list[str]: One generated snippet per row, in row order. Each is the text
        after the last ``### Response:`` marker in the pipeline output, stripped.
    """
    # Build the pipeline once, outside the loop. Decoding is greedy
    # (do_sample=False); `temperature` only applies when sampling, so it is not
    # passed — combining the two is a contradictory generation config.
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
    )

    predictions = []
    for cobol_code in tqdm(test_df["src"], total=len(test_df)):
        prompt = f"""### Instruction:
Convert the following COBOL code to Python:

{cobol_code}

### Response:"""

        result = pipe(prompt)
        # The pipeline output includes the prompt; keep only what follows the marker.
        generated_code = result[0]["generated_text"].split("### Response:")[-1].strip()
        predictions.append(generated_code)

    return predictions

# Usage: predictions from the model as loaded above (this cell runs before training).
y_pred = predict_code_translation(X_test, model, tokenizer)


In [None]:
from sklearn.metrics import accuracy_score, f1_score
from difflib import SequenceMatcher
import numpy as np

def evaluate_code_translation(y_true, y_pred):
    """Print exact-match accuracy and average textual similarity of translations.

    Args:
        y_true: Iterable of reference code strings (list, pandas Series, ...).
        y_pred: Iterable of generated code strings, aligned with ``y_true``.

    Raises:
        AssertionError: If the two inputs differ in length.

    Prints the two aggregate scores followed by up to three sample comparisons.
    Returns nothing.
    """
    # Materialise both inputs as lists so the y_true[i] lookups below are positional.
    # A pandas Series indexes by label, which fails when its index does not start at 0.
    y_true = list(y_true)
    y_pred = list(y_pred)
    assert len(y_true) == len(y_pred), "Mismatch in number of samples"

    # Nothing to score: avoid dividing by zero below.
    if not y_true:
        print("No samples to evaluate.")
        return

    def code_similarity(a, b):
        """Return the difflib similarity ratio (0..1) of two whitespace-trimmed strings."""
        return SequenceMatcher(None, a.strip(), b.strip()).ratio()

    similarities = [code_similarity(gt, pred) for gt, pred in zip(y_true, y_pred)]
    avg_similarity = np.mean(similarities)

    exact_matches = sum(1 for gt, pred in zip(y_true, y_pred) if gt.strip() == pred.strip())
    accuracy = exact_matches / len(y_true)

    print(f"Exact Match Accuracy: {accuracy:.3f}")
    print(f"Average Similarity Score: {avg_similarity:.3f}")

    # Optionally, show some examples
    for i in range(min(3, len(y_true))):
        print("\n--- Sample", i+1)
        print("True Output:\n", y_true[i])
        print("Predicted Output:\n", y_pred[i])
        print("Similarity Score:", code_similarity(y_true[i], y_pred[i]))

# Example usage: compare the predictions in y_pred against the references in y_true.
evaluate_code_translation(y_true, y_pred)

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    """Collect the leaf names of every 4-bit linear layer in ``model``.

    Args:
        model: Object exposing ``named_modules()`` that yields ``(name, module)`` pairs.

    Returns:
        list[str]: Unique last components of the dotted names of all
        ``bnb.nn.Linear4bit`` modules, excluding ``"lm_head"``. Order is unspecified.
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.rsplit(".", 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }
    leaf_names.discard("lm_head")  # needed for 16 bit
    return list(leaf_names)
# Collect the 4-bit linear layer names from the loaded model and display them.
modules = find_all_linear_names(model)
modules

In [None]:
# Avoid CUDA memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Directory that receives training checkpoints and, later, the saved adapter.
output_dir = "CodeLlama-7B-Instruct-fine-tuned-model"

# LoRA adapter settings: rank 64, alpha 16, 5% dropout, no bias terms trained.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules  # from find_all_linear_names(model), e.g. ["q_proj", "v_proj"]
)

# SFTConfig extends TrainingArguments, so the optimiser/schedule settings and the
# SFT-specific dataset settings live in ONE object that is actually handed to the
# trainer. A separate, unused SFTConfig would silently drop max_seq_length/packing.
from trl import SFTConfig

training_args = SFTConfig(
    output_dir=output_dir,     # Path to save checkpoints
    per_device_train_batch_size=2,             # Small batch for large models
    gradient_accumulation_steps=4,             # Effective batch size = 8
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.001,

    # NOTE(review): bf16 needs hardware support, and the model above is loaded with a
    # float16 compute dtype — confirm the target GPU supports bfloat16.
    bf16=True,                                 # If available, use bf16
    fp16=False,                                # Disable fp16 to avoid conflict with bf16

    gradient_checkpointing=True,               # Save memory
    optim="paged_adamw_32bit",                 # Efficient optimizer
    max_grad_norm=0.3,                         # Gradient clipping

    lr_scheduler_type="cosine",                # Cosine annealing
    warmup_ratio=0.03,                         # Warmup steps

    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=1,

    report_to="none",                          # Avoid wandb/logging integrations
    disable_tqdm=False,                        # Show progress bars

    # SFT-specific settings
    dataset_text_field="text",                 # Column of train_data/eval_data holding the prompt
    max_seq_length=128,                        # smaller = less memory
    packing=False,
)

# Kept so any cell that still refers to `sft_config` sees the effective configuration.
sft_config = training_args

trainer = SFTTrainer(
    model=model,
    args=training_args,                        # was the undefined name `training_arguments`
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
)

In [None]:
# Start the fine-tuning run with the trainer configured above.
trainer.train()

In [None]:
# Save the trained model and the tokenizer side by side in output_dir.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Re-score the test split after training, using the helpers this notebook defines
# (predict_code_translation / evaluate_code_translation). The bare names `predict`
# and `evaluate` are not defined in the cells reviewed here.
y_pred = predict_code_translation(X_test, model, tokenizer)
evaluate_code_translation(y_true, y_pred)

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Load your Hugging Face token from Kaggle secrets (stored under "HUGGINGFACE_TOKEN"),
# so the token never appears in the notebook itself.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

# Login to Hugging Face Hub
login(token=hf_token)

In [None]:
# Names used by the reload/merge cells: the base checkpoint path and the directory
# that trainer.save_model wrote to.
base_model = base_model_name
fine_tuned_model = output_dir

In [None]:
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Reload the base weights in float16 (no quantization config is passed here).
base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

In [None]:
# Merge adapter with base model
# PeftModel is imported here because the notebook's earlier peft import
# (LoraConfig, get_peft_model, ...) does not include it.
from peft import PeftModel

# Attach the saved adapter to the reloaded base model, then fold it into the
# base weights so `model` is a plain model again.
model = PeftModel.from_pretrained(base_model_reload, fine_tuned_model)
model = model.merge_and_unload()

In [None]:
# Smoke test: translate a minimal COBOL program with the merged model.
cobol_code = """
       IDENTIFICATION DIVISION.
       PROGRAM-ID. HELLO.
       PROCEDURE DIVISION.
           DISPLAY 'HELLO, WORLD'.
           STOP RUN.
"""

# Same instruction/response template as generate_test_prompt.
prompt = f"""
### Instruction:
Convert the following COBOL code to Python:

{cobol_code}

### Response:
""".strip()

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Greedy decoding (do_sample=False). `temperature` only applies when sampling, so it
# is not passed — combining the two is a contradictory generation config.
outputs = pipe(prompt, max_new_tokens=256, do_sample=False)

# The pipeline output includes the prompt; keep only what follows the response marker.
generated_code = outputs[0]["generated_text"].split("### Response:")[-1].strip()

print(generated_code)

In [None]:
# import shutil
# import os

# folder_path = "/kaggle/working/llama-3.2-fine-tuned-model"

# if os.path.exists(folder_path):
#     shutil.rmtree(folder_path)
#     print("Folder removed.")
# else:
#     print("Folder does not exist.")


In [None]:
# Save the merged model and its tokenizer to a local directory.
model_dir = "CodeLlama-7B-Instruct-COBOL-to-Python"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

In [None]:
from huggingface_hub import CommitInfo
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Load your Hugging Face token from Kaggle secrets (stored under "HUGGINGFACE_TOKEN").
# NOTE(review): this repeats the earlier login cell; it is only needed if that
# cell was not run in this session.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

# Login to Hugging Face Hub
login(token=hf_token)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch

# NOTE(review): this cell repeats the reload and merge steps performed in earlier
# cells, rebuilding the merged model from the base checkpoint and the saved adapter.

# Load base tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load base model with the same architecture (float16, no quantization config)
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the fine-tuned adapter
model = PeftModel.from_pretrained(base_model_reload, fine_tuned_model)

# Merge the adapter into the base model weights (final full model)
model = model.merge_and_unload()


In [None]:
# Save and register on the Hugging Face Hub: push the model and its tokenizer to the
# same repository.
model.push_to_hub("dhirajpatra/codellama-cobol-python", use_temp_dir=False)
tokenizer.push_to_hub("dhirajpatra/codellama-cobol-python", use_temp_dir=False)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load from the repository the push cell uploads to. "dhirajpatra/" + model_dir names
# a different repository, because model_dir is the local save directory
# ("CodeLlama-7B-Instruct-COBOL-to-Python").
hub_repo_id = "dhirajpatra/codellama-cobol-python"

model = AutoModelForCausalLM.from_pretrained(hub_repo_id)
tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use the repository id the push cell uploads to. "dhirajpatra/" + model_dir would
# point at a different repository, because model_dir is the local save directory.
model_id = "dhirajpatra/codellama-cobol-python"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

In [None]:
# Smoke test: translate a minimal COBOL program with the model loaded from the Hub.
cobol_code = """
       IDENTIFICATION DIVISION.
       PROGRAM-ID. HELLO.
       PROCEDURE DIVISION.
           DISPLAY 'HELLO, WORLD'.
           STOP RUN.
"""

# Instruction/response template; strip() removes the trailing newline so the prompt
# ends with the "### Response:" marker.
prompt = f"""### Instruction:
Convert the following COBOL code to Python:

{cobol_code}

### Response:
""".strip()

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sampling is enabled here (do_sample=True) with a low temperature of 0.1.
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.1)
# The pipeline output includes the prompt; keep only what follows the response marker.
generated_code = outputs[0]["generated_text"].split("### Response:")[-1].strip()

print(generated_code)


In [None]:
# Compute BLEU between generated and reference translations.
# NOTE(review): `preds` and `refs` are not assigned in the cells reviewed here —
# confirm where they come from (y_pred / y_true look like the intended inputs).
# NOTE(review): `load_metric` has been deprecated in `datasets` in favour of the
# separate `evaluate` package — confirm it is still available in the installed version.
from datasets import load_metric
bleu = load_metric("bleu")
results = bleu.compute(predictions=preds, references=refs)
