In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/config.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/pytorch_model-00002-of-00002.bin
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/tokenizer.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/tokenizer_config.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/pytorch_model.bin.index.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/pytorch_model-00001-of-00002.bin
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/special_tokens_map.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/.gitattributes
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/tokenizer.model
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/generation_config.json
/kaggle/input/news-cats/cnn_news4cats.csv


# Project Summary
This project focuses on fine-tuning a large language model for text classification tasks, leveraging techniques like quantization and low-rank adaptation. The goal is to optimize the model for memory efficiency and fast training without compromising performance.

### Explanation of Components

**Bits and Bytes Configuration (bnb_config)**  
**Role:** "Engine compression"  
**Purpose:** Compresses model weights to a 4-bit format for efficient memory usage and faster computation. This is crucial for working with large models on GPUs with limited memory.  
**Interaction:** Prepares model weights for efficient processing, enabling compatibility with other fine-tuning components.

**LoRA (Low-Rank Adaptation)**  
**Role:** "Transmission"  
**Purpose:** Allows fine-tuning by training small, low-rank matrices instead of updating all model parameters. This reduces memory usage and speeds up training.  
**Interaction:** LoRA adds adaptive weights to selected layers of the model, improving its ability to learn from new data.

**TrainingArguments**  
**Role:** "Driving mode setup"  
**Purpose:** Specifies key training parameters like batch size, learning rate, and evaluation frequency.  
**Interaction:** Manages the fine-tuning process through SFTTrainer, ensuring efficient training and validation.

**SFTTrainer**  
**Role:** "Learning brain"  
**Purpose:** Central controller that orchestrates model training, evaluation, and saving. It integrates all configurations, including bnb_config, LoRA, and TrainingArguments.  
**Interaction:** Coordinates between the model, dataset, and training strategy to ensure smooth execution.

**Training Workflow**  
**Overview:**  
1. **bnb_config** compresses the model for efficient resource usage.
2. **LoRA** adds lightweight adaptive layers to the model.
3. **TrainingArguments** sets the training rules.
4. **SFTTrainer** executes the training process, applying configurations and managing data flow.  

Together, these components form a streamlined pipeline for efficient fine-tuning, analogous to optimizing a car for performance:
- **bnb_config**: Lightens the vehicle.
- **LoRA**: Enhances transmission efficiency.
- **TrainingArguments**: Defines driving conditions.
- **SFTTrainer**: Acts as the driver.


In [2]:
# Installing necessary libraries for model fine-tuning
!pip install -U bitsandbytes  # Install bitsandbytes for efficient model quantization
!pip install peft  # Install PEFT (Parameter-Efficient Fine-Tuning) for LoRA
!pip install trl  # Install TRL (Transformers Reinforcement Learning) for SFTTrainer

# Importing required modules
import os  # To manage environment variables and file paths
import warnings  # To suppress unnecessary warnings
import pandas as pd  # For handling tabular data
import torch  # PyTorch for model operations and GPU computations
from sklearn.model_selection import train_test_split  # For splitting the dataset into train, validation, and test
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # Transformers for model and tokenizer
from datasets import Dataset  # For dataset manipulation in HuggingFace format
from peft import LoraConfig  # For LoRA configurations
from trl import SFTTrainer  # For fine-tuning causal language models efficiently

# Setting up system configurations
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Ensure the code uses GPU 0 for computations
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Disable tokenizer parallelism to avoid threading issues
warnings.filterwarnings("ignore")  # Suppress all warnings for cleaner outputs

# Explanation of Key Components:
# 1. **Bits and Bytes**: Optimizes large models using low-bit quantization, reducing memory usage and improving speed.
# 2. **PEFT (LoRA)**: Parameter-efficient fine-tuning module to reduce training overhead and memory consumption.
# 3. **TRL (SFTTrainer)**: Simplifies the fine-tuning process for large language models with flexible configurations.


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting trl
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers>=4.46.0 (from trl)
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1

In [3]:
from sklearn.model_selection import train_test_split  # For splitting the dataset
from datasets import Dataset  # HuggingFace Dataset for efficient data handling
import pandas as pd  # For handling tabular data
SEED = 2020  # Setting a fixed seed for reproducibility

# Loading the dataset
df = pd.read_csv('/kaggle/input/news-cats/cnn_news4cats.csv')  # Load news dataset

# Sampling the dataset to 1000 records for training efficiency
# df = df.sample(1000, random_state=SEED).reset_index(drop=True)

# Splitting the dataset into train, validation, and test subsets
train_data, temp_data = train_test_split(
    df, 
    test_size=0.2,  # Reserve 20% for validation and test
    stratify=df['category'],  # Stratify to maintain category distribution
    random_state=SEED
)
val_data, test_data = train_test_split(
    temp_data, 
    test_size=0.5,  # Split the remaining 20% equally into validation and test
    stratify=temp_data['category'], 
    random_state=SEED
)

# Function to generate training prompts with ground truth
def generate_prompt(data_point):
    return f"""
        [INST]Classify the news headline: [{data_point['titles']}]. 
        Choose one of the following categories: health, politics, sports, weather.[/INST]
        Correct category: {data_point['category']}
    """.strip()

# Function to generate testing prompts without ground truth
def generate_test_prompt(data_point):
    return f"""
        [INST]Classify the news headline: [{data_point['titles']}]. 
        Choose one of the following categories: health, politics, sports, weather.[/INST]
        Correct category: 
    """.strip()

# Applying prompt generation to the datasets
train_data['prompt'] = train_data.apply(generate_prompt, axis=1)  # Add prompts for training
val_data['prompt'] = val_data.apply(generate_prompt, axis=1)  # Add prompts for validation
test_data['prompt'] = test_data.apply(generate_test_prompt, axis=1)  # Add prompts for testing

# Converting dataframes to HuggingFace Dataset format
train_dataset = Dataset.from_pandas(train_data[['prompt', 'category']])  # Training dataset with prompts and categories
val_dataset = Dataset.from_pandas(val_data[['prompt', 'category']])  # Validation dataset with prompts and categories
test_dataset = Dataset.from_pandas(test_data[['prompt', 'category']])  # Test dataset with prompts and categories

# Explanation of Key Components:
# 1. **train_test_split**: Splits the dataset while maintaining category distribution.
# 2. **generate_prompt & generate_test_prompt**: Create input prompts for training and testing.
# 3. **Dataset.from_pandas**: Converts pandas DataFrame to HuggingFace Dataset, optimizing for ML workflows.


In [4]:
# Define model path where the pretrained model is stored
model_path = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Configuration for quantization using Bits and Bytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enables 4-bit quantization to reduce memory usage
    bnb_4bit_use_double_quant=False,  # Disable double quantization for simplicity
    bnb_4bit_quant_type="nf4",  # Specify the quantization type as NF4 (Non-Finite quantization for better precision)
    bnb_4bit_compute_dtype=torch.float16  # Use half-precision floating point for computation to save memory
)

# Load the pretrained model for causal language modeling with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_path,  # Path to the model
    device_map="auto",  # Automatically maps model to available GPU/CPU
    quantization_config=bnb_config  # Apply the quantization configuration
)

# Load the tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(
    model_path,  # Path to the model tokenizer
    padding_side="left"  # Add padding on the left side of sequences
)
tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as pad token for consistency

# Key Components Explained:
# 1. **bnb_config**:
#    - Optimizes memory usage by reducing precision (4-bit quantization).
#    - Improves inference speed and fits large models on limited GPU memory.
#
# 2. **AutoModelForCausalLM**:
#    - Loads a causal language model for tasks such as text generation and classification.
#    - Quantization settings ensure the model runs efficiently on hardware with limited memory.
#
# 3. **AutoTokenizer**:
#    - Handles tokenization of input text to prepare it for the model.
#    - `padding_side="left"` ensures padding tokens are added to the left, which is useful for causal models.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
import os
from transformers import TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig

# Отключаем W&B
os.environ["WANDB_DISABLED"] = "true"

# Конфигурация LoRA
lora_config = LoraConfig(
    r=8,  # Rank for LoRA (can tune between 4 to 32 for different memory vs. performance trade-offs)
    lora_alpha=32,  # Scaling factor for LoRA (could try 16, 64 for tuning)
    target_modules=["self_attn.q_proj", "self_attn.v_proj"],  # Key attention modules
    lora_dropout=0.05,  # Dropout rate for LoRA (can experiment with 0.1, 0.2)
    bias="none",  # 'none', 'all' or 'lora_only' - controls where biases are added
    task_type="CAUSAL_LM"  # Type of task, remains fixed for causal language models
)

# Обновленная конфигурация TrainingArguments
# Конфигурация TrainingArguments с оптимизацией гиперпараметров
training_args = TrainingArguments(
    fp16=True,  # Enables mixed-precision training to save memory and speed up (can disable for full precision)
    per_device_train_batch_size=8,  # Batch size per GPU (try 4, 16 depending on GPU memory)
    per_device_eval_batch_size=8,  # Same as above for evaluation (try different sizes to match training)
    gradient_accumulation_steps=8,  # Number of steps to accumulate gradients (try 4, 16 to adjust learning dynamics)
    max_steps=100,  # Maximum training steps (try increasing for larger datasets, e.g., 100 or 200)
    learning_rate=5e-5,  # Learning rate (test with 3e-5, 1e-4 to adjust learning stability)
    weight_decay=0.01,  # Regularization (can try 0.05, 0.1 to control overfitting)
    logging_steps=10,  # Frequency of logging (useful for tracking; try 20 or 50 for less verbosity)
    save_steps=20,  # How often to save checkpoints (adjust based on experiment duration; e.g., 20)
    evaluation_strategy="steps",  # Evaluate after a set number of steps
    eval_steps=20,  # Frequency of evaluation (adjust with save_steps; e.g., 20 or 50)
    load_best_model_at_end=True,  # Automatically loads the best model based on validation
    metric_for_best_model="eval_loss",  # Metric to track (can change to 'accuracy' if applicable)
    greater_is_better=False,  # Lower eval_loss is better
    output_dir="./results",  # Directory to save results
    save_total_limit=1,  # Limit the number of saved checkpoints (e.g., 2 or 3 for more flexibility)
    run_name="optimized_experiment",  # Name for this experiment (helps with logging/debugging)
    report_to=[]  # Disables reporting to external trackers like W&B
)

# SFTTrainer: Handles fine-tuning logic
trainer = SFTTrainer(
    model=model,  # Preloaded model for fine-tuning
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=val_dataset,  # Validation dataset for evaluation during training
    dataset_text_field="prompt",  # Column to be used for text input in datasets
    tokenizer=tokenizer,  # Tokenizer aligned with the model
    args=training_args,  # Training hyperparameters
    peft_config=lora_config  # Low-Rank Adaptation configuration
)

# Start training
trainer.train()


Map:   0%|          | 0/7699 [00:00<?, ? examples/s]

Map:   0%|          | 0/962 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
20,2.5481,1.917166
40,1.4142,1.297427
60,1.1073,1.038929
80,1.0142,1.011084
100,1.0173,1.002686


TrainOutput(global_step=100, training_loss=1.6062035846710205, metrics={'train_runtime': 3140.9579, 'train_samples_per_second': 2.038, 'train_steps_per_second': 0.032, 'total_flos': 1.7863253930016768e+16, 'train_loss': 1.6062035846710205, 'epoch': 0.8307372793354102})

In [6]:
from sklearn.metrics import classification_report, f1_score
import numpy as np

# Mapping of category names to numeric labels for evaluation purposes.
category_map = {'health': 0, 'politics': 1, 'sports': 2, 'weather': 3}

# Function to predict labels for a dataset in batches.
def predict_labels_batch(dataset, batch_size=8):
    """
    Predicts labels for the input dataset using batch processing.
    
    Args:
        dataset (Dataset): Dataset containing prompts for classification.
        batch_size (int): Number of prompts to process in one batch.
        
    Returns:
        list: Predicted categories for each input prompt.
    """
    predictions = []
    for i in range(0, len(dataset['prompt']), batch_size):
        # Extracts a batch of prompts from the dataset.
        prompts = dataset['prompt'][i:i + batch_size]
        
        # Tokenizes the prompts and prepares them for the model.
        inputs = tokenizer(list(prompts), return_tensors="pt", padding=True, truncation=True).to(model.device)
        
        # Generates predictions for the batch.
        outputs = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
        
        # Decodes the predictions and extracts the predicted category.
        for output in outputs:
            generated_text = tokenizer.decode(output, skip_special_tokens=True).strip()
            predicted_category = generated_text.split("Correct category:")[-1].strip().strip(".]").strip()
            predictions.append(predicted_category)
    
    return predictions

# Function to evaluate the model's performance on the validation dataset.
def evaluate_on_validation(val_dataset):
    """
    Evaluates the model on the validation dataset and calculates classification metrics.
    
    Args:
        val_dataset (Dataset): Validation dataset containing prompts and true labels.
    """
    # Predicts categories for the validation dataset.
    predicted_val_categories = predict_labels_batch(val_dataset, batch_size=8)
    
    # Converts true labels from text to numeric format.
    true_val_categories = [category_map[label] for label in val_dataset['category']]
    
    # Converts predicted categories to numeric format using the category map.
    predicted_val_labels = [category_map.get(pred, -1) for pred in predicted_val_categories]

    # Filters out invalid predictions (those not found in the category map).
    valid_indices = np.array(predicted_val_labels) != -1
    filtered_true_val = np.array(true_val_categories)[valid_indices]
    filtered_pred_val = np.array(predicted_val_labels)[valid_indices]

    # Calculates the weighted F1 Score for the validation dataset.
    f1_val = f1_score(filtered_true_val, filtered_pred_val, average='weighted')
    print(f"Validation Weighted F1 Score: {f1_val:.2f}")

    # Prints a detailed classification report for the validation dataset.
    print("\nValidation Classification Report:")
    print(classification_report(filtered_true_val, filtered_pred_val, target_names=category_map.keys()))

# Function to evaluate the model's performance on the test dataset.
def evaluate_on_test(test_dataset):
    """
    Evaluates the model on the test dataset and calculates classification metrics.
    
    Args:
        test_dataset (Dataset): Test dataset containing prompts and true labels.
    """
    # Predicts categories for the test dataset.
    predicted_test_categories = predict_labels_batch(test_dataset, batch_size=8)
    
    # Converts true labels from text to numeric format.
    true_test_categories = [category_map[label] for label in test_dataset['category']]
    
    # Converts predicted categories to numeric format using the category map.
    predicted_test_labels = [category_map.get(pred, -1) for pred in predicted_test_categories]

    # Filters out invalid predictions (those not found in the category map).
    valid_indices = np.array(predicted_test_labels) != -1
    filtered_true_test = np.array(true_test_categories)[valid_indices]
    filtered_pred_test = np.array(predicted_test_labels)[valid_indices]

    # Calculates the weighted F1 Score for the test dataset.
    f1_test = f1_score(filtered_true_test, filtered_pred_test, average='weighted')
    print(f"Test Weighted F1 Score: {f1_test:.2f}")

    # Prints a detailed classification report for the test dataset.
    print("\nTest Classification Report:")
    print(classification_report(filtered_true_test, filtered_pred_test, target_names=category_map.keys()))

# Runs evaluation on both validation and test datasets.
evaluate_on_validation(val_dataset)
evaluate_on_test(test_dataset)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Validation Weighted F1 Score: 1.00

Validation Classification Report:
              precision    recall  f1-score   support

      health       1.00      1.00      1.00       212
    politics       1.00      1.00      1.00       241
      sports       1.00      1.00      1.00       250
     weather       1.00      1.00      1.00       250

    accuracy                           1.00       953
   macro avg       1.00      1.00      1.00       953
weighted avg       1.00      1.00      1.00       953

Test Weighted F1 Score: 0.95

Test Classification Report:
              precision    recall  f1-score   support

      health       0.97      0.94      0.95       213
    politics       0.95      0.97      0.96       246
      sports       0.92      0.94      0.93       249
     weather       0.97      0.94      0.96       250

    accuracy                           0.95       958
   macro avg       0.95      0.95      0.95       958
weighted avg       0.95      0.95      0.95       958



In [7]:
import pandas as pd
import random

# Function to predict labels for a subset of the validation dataset
def generate_comparison_table(val_dataset, batch_size=10):
    """
    Generates a comparison table for validation data.
    
    Args:
        val_dataset (Dataset): Validation dataset.
        batch_size (int): Number of samples to evaluate.

    Returns:
        DataFrame: Table with news text, real category, and predicted category.
    """
    # Randomly select 10 samples from the validation dataset
    random_indices = random.sample(range(len(val_dataset)), batch_size)
    sample_data = val_dataset.select(random_indices)
    
    # Predict categories for the selected samples
    predicted_categories = predict_labels_batch(sample_data, batch_size=batch_size)
    
    # Extract real categories
    real_categories = sample_data['category']
    
    # Create a DataFrame to compare real and predicted categories
    comparison_df = pd.DataFrame({
        'News': [prompt.split(": [")[1].split("].")[0].strip() for prompt in sample_data['prompt']],
        'Real Category': real_categories,
        'Predicted Category': predicted_categories
    })
    
    return comparison_df

# Generate the comparison table
comparison_table = generate_comparison_table(val_dataset, batch_size=10)

# Display the table
comparison_table


Unnamed: 0,News,Real Category,Predicted Category
0,Significant severe weather is possible across ...,weather,weather
1,Dr. Oz supported health insurance mandates and...,health,health
2,Emergency doctor: We need help before it’s too...,health,health
3,"For the first time in 25 years, August did not...",weather,weather
4,The most outrageous golf fashion of 2023,sports,sports
5,Why I’m going to vaccinate my kids against Cov...,health,health
6,Charlamagne tha God: America has zero protecti...,politics,politics
7,"Apple, Amazon and Google post earnings that di...",sports,sports
8,Can the U.S. and China set aside tensions to t...,weather,weather
9,The week that life in Dubai ground to a halt,weather,weather
