In [None]:
# Using Quantized meta Llama model from Hugging Face due to space and memory limitations
# https://huggingface.co/neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a16
# Instruct-tuned, quantized model

#imports
%pip install transformers datasets accelerate
%pip install scikit-learn
%pip install transformers datasets accelerate
%pip install optimum 
%pip install auto-gptq
%pip install safetensors
%pip install tf-keras
import os

import pandas as pd
import torch
import transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)



In [None]:
# Configure the CUDA allocator and matmul precision, then report GPU state.
# The environment variable is set first, before any torch.cuda call in this
# cell, so the allocator cannot be initialised without it.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
torch.backends.cuda.matmul.allow_tf32 = True  # Enable TensorFloat32

cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

# Only touch the CUDA runtime when a GPU is present; on a CPU-only machine the
# availability line above is still printed instead of the cell failing.
if cuda_available:
    torch.cuda.empty_cache()
    # memory_summary() returns a string; print it so the report is visible.
    print(torch.cuda.memory_summary(device=None, abbreviated=False))


In [None]:
# Fetch the tokenizer and the causal-LM weights for the same Hugging Face checkpoint.
checkpoint = "neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a16"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

load_options = dict(
    device_map="auto",          # let the loader choose device placement
    torch_dtype=torch.float16,  # half precision to reduce memory
    offload_folder="./offload",
    low_cpu_mem_usage=True,
)
model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

# Smoke test: send a single chat-style user message through a text-generation pipeline.
messages = [{"role": "user", "content": "Generate a CWL file"}]
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# max_new_tokens caps the length of the generated continuation; adjust as needed.
output = pipe(messages, max_new_tokens=50)

print(output)

In [None]:
# Read the two JSON files and stack their rows into one dataframe.

df1 = pd.read_json("merged_cwl_documents.json")
df2 = pd.read_json("documents_with_descriptions.json")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each input keeps its own row labels, so the same label can appear twice and
# label-based selection or alignment later on becomes ambiguous.
df = pd.concat([df1, df2], ignore_index=True)
df.columns

In [None]:
%pip install auto-gptq[cuda]


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sklearn.model_selection import train_test_split

# Load the tokenizer and a sequence-classification wrapper around the checkpoint.
model_name = "neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a16"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): no num_labels is passed and the checkpoint is a causal-LM
# checkpoint, so the classification head is presumably freshly initialised with
# the library's default number of labels — confirm before trusting predictions.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Make sure the tokenizer has a padding token so batches can be padded.
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token if available
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add a new pad token
        model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings to include the new token

# The tokenizer's pad token must also be recorded on the model config: the
# sequence-classification head uses config.pad_token_id to find the last real
# token of each padded sequence and cannot process batches larger than one
# without it.
model.config.pad_token_id = tokenizer.pad_token_id

# Columns whose values are concatenated into the text fed to the model.
target_column = ['description', 'inputs', 'outputs']
missing_columns = [col for col in target_column if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")

# Drop rows without an 'outputs' value BEFORE casting to str. astype(str) turns
# missing values into the literal strings "nan"/"None", so a notnull() check
# made after the cast can never filter anything out.
# .copy() makes the filtered frame independent so the column assignments below
# do not write into a slice of the original frame.
df = df[df['outputs'].notna()].copy()

# One whitespace-joined string per row built from the target columns.
df['features'] = df[target_column].apply(lambda row: ' '.join(map(str, row)), axis=1)
df['label'] = df['outputs'].astype(str)
# NOTE(review): 'outputs' is part of 'features' and is also the source of
# 'label', so the label text is contained in the model input — confirm this is
# intended.

# Train/test split (80/20, fixed seed for reproducibility).
X = df['features']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The tokenizer expects a plain list of strings.
X_test = X_test.astype(str).tolist()

# Tokenization helper used for the evaluation inputs.
def preprocess_texts(texts):
    """
    Run the notebook-level ``tokenizer`` over ``texts``.

    Truncation is enabled, padding is set to "max_length" with a maximum
    length of 512, and the result is requested as PyTorch tensors.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
        "return_tensors": "pt",  # PyTorch tensors
    }
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded
test_encodings = preprocess_texts(X_test)

# Score the test set in small batches instead of one forward pass over every
# example: a single pass keeps the activations of the whole test set in memory
# at once. Each batch is moved to the device that holds the model weights.
EVAL_BATCH_SIZE = 8

model.eval()  # Set model to evaluation mode
model_device = next(model.parameters()).device
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(X_test), EVAL_BATCH_SIZE):
        batch = {
            name: tensor[start:start + EVAL_BATCH_SIZE].to(model_device)
            for name, tensor in test_encodings.items()
        }
        outputs = model(**batch)
        logits = outputs.logits
        # One predicted class index per example.
        batch_predictions.append(torch.argmax(logits, dim=1).cpu())
test_predictions = torch.cat(batch_predictions).numpy()

test_predictions_labels = [str(label) for label in test_predictions]

# Accuracy: case-insensitive string match between true and predicted labels.
# NOTE(review): predictions are class indices rendered as strings ("0", "1", ...)
# while y_test holds the stringified 'outputs' column, so matches look unlikely
# unless the labels are encoded as class indices first — confirm the intended
# label mapping.
y_test = y_test.astype(str).tolist()  # Ensure y_test is a list of strings
correct_predictions = sum(
    1 for true, pred in zip(y_test, test_predictions_labels) if true.lower() == pred.lower()
)
total_predictions = len(y_test)
# Guard against an empty test set rather than dividing by zero.
accuracy_percentage = (correct_predictions / total_predictions) * 100 if total_predictions else 0.0

# Print accuracy
print(f"Model Accuracy: {accuracy_percentage:.2f}%")

# Print example predictions
print("\nExample Predictions:")
for input_text, true_label, prediction in zip(X_test[:10], y_test[:10], test_predictions_labels[:10]):
    print(f"Input: {input_text}")
    print(f"True Label: {true_label}")
    print(f"Predicted: {prediction}")
    print("---")


In [None]:
from peft import LoraConfig, get_peft_model
import torch
from datasets import Dataset
# https://towardsdatascience.com/qa-lora-fine-tune-a-quantized-large-language-model-on-your-gpu-c7291866706c

# Reload the checkpoint as a causal LM and fine-tune it through LoRA adapters,
# since the previous approach did not work.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically place the model on the available device
    torch_dtype=torch.float16  # Half-precision weights for memory savings
)

# Make sure the tokenizer has a padding token.
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add a new pad_token
        model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings if necessary

# Gradient checkpointing trades compute for memory.
model.gradient_checkpointing_enable()
# With the base weights frozen (only the LoRA adapters train), the inputs to the
# checkpointed blocks must require gradients; otherwise the backward pass has
# nothing to differentiate through. PEFT presumably skips this step for models
# flagged as quantized, so it is requested explicitly here — calling it when
# already enabled is harmless.
model.enable_input_require_grads()

# LoRA adapter configuration.
lora_config = LoraConfig(
    r=8,                                  # rank of the LoRA update matrices
    lora_alpha=32,                        # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # modules that receive adapters
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)


# Tokenization applied to `train_dataset` and `val_dataset` via Dataset.map.
def preprocess_function(examples, max_length=512):
    """Tokenize the "combined" text column of a batch of examples.

    Args:
        examples: mapping with a "combined" entry holding the texts to tokenize.
        max_length: length every sequence is truncated/padded to. It is passed
            explicitly because padding="max_length" without a max_length falls
            back to the tokenizer's maximum model length, which can be far
            larger than needed.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["combined"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Build the Hugging Face datasets from the train/test split made earlier in the
# notebook. They were previously never defined, so this cell raised NameError on
# a fresh kernel. The text column is named "combined" because that is the
# column preprocess_function reads.
# NOTE(review): the held-out test split doubles as the validation set here —
# confirm, or carve a separate validation split out of the training data.
train_dataset = Dataset.from_dict({"combined": [str(text) for text in X_train]})
val_dataset = Dataset.from_dict({"combined": [str(text) for text in X_test]})

# Tokenize both datasets in batches.
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Set up training arguments with optimizations for memory usage.
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones use `evaluation_strategy`. Pick whichever keyword
# the installed version exposes so the cell runs on both.
eval_strategy_key = (
    "eval_strategy" if hasattr(TrainingArguments, "eval_strategy") else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,       # accumulate gradients over 16 steps to emulate a larger batch
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir=None,                     # NOTE(review): None presumably selects the default log directory rather than disabling logging — confirm
    logging_steps=500,
    fp16=True,                            # enable mixed precision for memory
    **{eval_strategy_key: "epoch"},       # evaluate at the end of every epoch
)

# The tokenized datasets carry no "labels" column, and without labels the model
# returns no loss for the Trainer to optimise. With mlm=False this collator
# builds causal-LM labels from input_ids for every batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# TODO: also try training without LoRA.

trainer.train()

# Release cached GPU memory and reset the peak-memory counters after training.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()


In [None]:
# Re-score the test set with the model currently bound to `model`, and
# recompute the predicted labels and the accuracy from THIS run. Previously only
# `test_predictions` was refreshed, so the prints below showed the values left
# over from the earlier evaluation cell.
EVAL_BATCH_SIZE = 4

model.eval()
model_device = next(model.parameters()).device
batch_predictions = []
with torch.no_grad():
    # Batched inference: one pass over the full test set would keep the logits
    # of every example in memory at once.
    for start in range(0, len(X_test), EVAL_BATCH_SIZE):
        batch = {
            name: tensor[start:start + EVAL_BATCH_SIZE].to(model_device)
            for name, tensor in test_encodings.items()
        }
        outputs = model(**batch)
        logits = outputs.logits
        if logits.dim() == 3:
            # 3-D logits carry one score vector per token position. Keep only
            # the position of the last real (non-padding) token so that each
            # example yields exactly one prediction.
            if tokenizer.padding_side == "left":
                logits = logits[:, -1]
            else:
                last_positions = (batch["attention_mask"].sum(dim=1).clamp(min=1) - 1).to(logits.device)
                rows = torch.arange(logits.size(0), device=logits.device)
                logits = logits[rows, last_positions]
        batch_predictions.append(torch.argmax(logits, dim=-1).cpu())
test_predictions = torch.cat(batch_predictions).numpy()

test_predictions_labels = [str(label) for label in test_predictions]

# NOTE(review): predictions are integer ids rendered as strings, while y_test
# holds label text, so matches look unlikely unless predictions are mapped back
# to label text first — confirm the intended evaluation.
correct_predictions = sum(
    1 for true, pred in zip(y_test, test_predictions_labels) if true.lower() == pred.lower()
)
total_predictions = len(y_test)
accuracy_percentage = (correct_predictions / total_predictions) * 100 if total_predictions else 0.0

# Print accuracy
print(f"Model Accuracy: {accuracy_percentage:.2f}%")

# Print example predictions
print("\nExample Predictions:")
for input_text, true_label, prediction in zip(X_test[:10], y_test[:10], test_predictions_labels[:10]):
    print(f"Input: {input_text}")
    print(f"True Label: {true_label}")
    print(f"Predicted: {prediction}")
    print("---")


In [None]:
# Disabled experiment: everything below sits inside one bare triple-quoted
# string, so none of it executes. The text is a variant of the LoRA training
# setup that passes load_in_8bit=True to from_pretrained and otherwise uses the
# same LoraConfig, TrainingArguments and Trainer arguments as the training cell
# earlier in the notebook.
'''
# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True  # Ensure 8-bit quantization compatibility
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a new pad_token and resize tokenizer embeddings if needed
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# train quantized with lora 
lora_config = LoraConfig(
    r=8,                      # Rank of the LoRA matrix
    lora_alpha=32,            # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # Modules to apply LoRA to
    lora_dropout=0.1,         # Dropout rate for LoRA
    bias="none",              # Whether to use bias
    task_type="CAUSAL_LM"     # Task type: Causal Language Modeling
)

model = get_peft_model(model, lora_config)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",               # Save model outputs here
    evaluation_strategy="epoch",          # Evaluate after each epoch
    learning_rate=5e-5,                   # Learning rate
    per_device_train_batch_size=1,        # Reduce batch size to fit in memory
    gradient_accumulation_steps=16,       # Accumulate gradients over 16 steps
    num_train_epochs=3,                   # Train for 3 epochs
    weight_decay=0.01,                    # Weight decay for regularization
    save_steps=10_000,                    # Save model every 10k steps
    save_total_limit=2,                   # Keep only the last 2 checkpoints
    logging_dir=None,                     # Disable logging to save memory
    logging_steps=500,                    # Log every 500 steps (optional)
    fp16=True,                            # Enable mixed precision (half-precision)
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # Replace with actual train dataset
    eval_dataset=val_dataset,      # Replace with actual validation dataset
    tokenizer=tokenizer,           # Tokenizer used for preprocessing
)

# Start training
trainer.train()


torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
'''
