In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-cats/cnn_news4cats.csv
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/config.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/pytorch_model-00002-of-00002.bin
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/tokenizer.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/tokenizer_config.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/pytorch_model.bin.index.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/pytorch_model-00001-of-00002.bin
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/special_tokens_map.json
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/.gitattributes
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/tokenizer.model
/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1/generation_config.json


In [2]:
# Install bitsandbytes, a library for efficient GPU-based matrix operations.
!pip install -U bitsandbytes

# Install the latest versions of Transformers and Accelerate libraries for model handling and optimization.
!pip install --upgrade transformers accelerate -q

# Disable parallelism in tokenizers to prevent warnings or conflicts during model loading.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Import necessary libraries for data handling, model operations, and evaluation.
import pandas as pd  # Used for handling and processing tabular data (e.g., CSV files).
from sklearn.preprocessing import LabelEncoder  # Transforms categorical labels into numerical format.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig  # Core libraries for working with Hugging Face models and configurations.
import torch  # PyTorch framework for handling tensors and GPU computations.
import time  # Used to log and calculate the time taken for operations.

# Step 1: Load the dataset containing news headlines and their categories.
data = pd.read_csv('/kaggle/input/news-cats/cnn_news4cats.csv')  # Load dataset from a specified path.

# Convert textual categories (e.g., 'sports', 'politics') to numerical labels.
label_encoder = LabelEncoder()  # Initialize a LabelEncoder instance.
data['category_encoded'] = label_encoder.fit_transform(data['category'])  # Fit and transform the category column.

# Extract headlines and corresponding encoded labels for classification tasks.
texts = data['titles']  # Extract news headlines (input data).
labels = data['category_encoded']  # Extract encoded labels (target data).

# Log the time and initialize model/tokenizer loading.
start_time = time.time()  # Start timer to measure loading time.

# Define the path to the pre-trained model (Mistral 7B, a large language model for instruction tasks).
model_name = '/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1'

# Load the tokenizer that converts input text into tokens suitable for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)  
# `trust_remote_code=True` allows using custom code provided by the model's developers.

# Configure BitsAndBytes for 4-bit quantization to optimize model memory usage.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)  
# 4-bit quantization reduces model size while maintaining performance.

# Load the pre-trained language model with 4-bit quantization and auto device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,  # Path to the model.
    quantization_config=bnb_config,  # Apply quantization settings.
    trust_remote_code=True,  # Trust any custom model-specific code.
    device_map="auto",  # Automatically map model components to available devices (CPU/GPU).
    local_files_only=True  # Use only local files, avoiding network downloads.
)

# Set the model to evaluation mode, which disables training-specific layers like dropout.
model.eval()

# Log the total time taken to load and initialize the model and tokenizer.
print(f"Model loaded and initialized in {time.time() - start_time:.2f} seconds.")

# Set the tokenizer's padding token to the model's end-of-sequence token.
# This ensures consistent padding for inputs of varying lengths.
tokenizer.pad_token = tokenizer.eos_token


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded and initialized in 77.96 seconds.


In [3]:
import torch  # Core library for tensor computations and model operations.
from sklearn.metrics import f1_score  # Used for calculating the F1 score, a classification evaluation metric.
import random  # Provides tools to generate random numbers and random selections.

# Step 3: Function to generate predictions using prompting.
def classify_with_prompt(model, tokenizer, headlines, label_encoder):
    """
    Function to classify a list of headlines using a language model and prompting.

    Args:
    - model: Pre-trained language model.
    - tokenizer: Tokenizer to convert text into tokens for the model.
    - headlines: List of news headlines to classify.
    - label_encoder: Encoder to map categories to numeric labels.

    Returns:
    - predicted_labels: List of predicted label indices.
    """
    model.eval()  # Set the model to evaluation mode (disables dropout and other training-specific layers).
    predicted_labels = []  # Initialize an empty list to store predictions.

    for headline in headlines:
        # Construct a prompt instructing the model to classify the headline.
        prompt = (
            f"Classify the following headline into one of these categories: "
            f"{list(label_encoder.classes_)}.\n"
            f"Headline: '{headline}'\nCategory:"
        )
        
        # Tokenize the prompt and convert it to tensors suitable for model input.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to('cuda')
        
        with torch.no_grad():  # Disable gradient computation to save memory and improve inference speed.
            # Generate model output using the prompt.
            outputs = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
        
        # Decode the generated output tokens back into a string.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract the predicted category from the generated text.
        category = generated_text.split("Category:")[-1].strip().lower()

        # Post-process the predicted category to map it to a numeric label.
        if category in label_encoder.classes_:
            predicted_labels.append(label_encoder.transform([category])[0])
        else:
            predicted_labels.append(-1)  # Assign -1 for unrecognized categories.
    
    return predicted_labels

# Step 4: Apply the model to a random sample of 20 examples from the dataset.
sample_size = 1000  # Number of random samples to classify.
sample_indices = random.sample(range(len(texts)), sample_size)  # Randomly select indices from the dataset.
sample_texts = texts.iloc[sample_indices].tolist()  # Get the corresponding headlines.
sample_labels = labels.iloc[sample_indices].tolist()  # Get the true labels for the selected headlines.

# Classify the sampled headlines using the prompting method.
predicted = classify_with_prompt(model, tokenizer, sample_texts, label_encoder)

# Step 5: Calculate the F1 score for the sample.
f1 = f1_score(sample_labels, predicted, average='weighted', zero_division=1)
print(f"F1 Score for the sample: {f1}")  # Print the F1 score for the sample.

# Display 10 random predictions for inspection.
display_indices = random.sample(range(sample_size), 10)  # Randomly select 10 indices from the sample.
for i in display_indices:
    headline = sample_texts[i]  # Get the corresponding headline.
    true_label = sample_labels[i]  # Get the true label.
    pred_label = predicted[i]  # Get the predicted label.
    
    # Print the headline, true category, and predicted category.
    print(f"Headline: {headline}")
    print(f"True Category: {label_encoder.inverse_transform([true_label])[0]}, "
          f"Predicted Category: {label_encoder.inverse_transform([pred_label])[0] if pred_label != -1 else 'Unrecognized'}\n")




F1 Score for the sample: 0.8744585647252424
Headline: Fan gambling heard by players ‘every single round,’ says Jon Rahm
True Category: sports, Predicted Category: sports

Headline: Potential VP picks for Harris running mate
True Category: politics, Predicted Category: politics

Headline: 5 stories to start your day
True Category: sports, Predicted Category: Unrecognized

Headline: St. Louis rain continues, heat in the Pacific Northwest, and monsoon rain for Southwest
True Category: weather, Predicted Category: weather

Headline: Is it normal for cold-like symptoms to last for weeks? An expert explains
True Category: health, Predicted Category: health

Headline: Trump claims not to know who is behind Project 2025. A CNN review found at least 140 people who worked for him are involved
True Category: politics, Predicted Category: politics

Headline: Axelrod thinks Trump should be worried about Harris. Here's why
True Category: politics, Predicted Category: politics

Headline: Record-break