In [1]:
# Notebook showing how to run annotation with SDG using a custom annotation YAML

# Annotation with SDG using Guided Decoding

## Importing the necessary libraries

In [17]:
# First Party
from instructlab.sdg.pipeline import Pipeline, PipelineContext
from instructlab.sdg.blocks.llmblock import LLMBlock
# Third Party
from datasets import load_dataset
from openai import OpenAI
import yaml
import os

## Serve LLM through ilab serve command

Run the following shell command to serve the Mixtral-8x7B-Instruct-v0.1 model on port 8000 (by default). The mixtral model is quite large and may take a while to be served through vLLM.

*Note*: You can serve any other desired model by changing the model-path argument. The rest of this notebook will work seamlessly with any other model as long as we can wrap the served model in an OpenAI client.

`ilab serve --model-path ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1/`

Wrap the served model in an OpenAI client

In [4]:
# OpenAI-compatible client pointed at the locally served model.
# vLLM doesn't check the key, but the client requires one to be supplied.
client = OpenAI(api_key="dummy-key", base_url="http://localhost:8000/v1")

Make sure the model is served before running the next cell, and that the following cell returns the correct model id

In [5]:
# Ask the endpoint which models it serves and use the first entry as the teacher
models = client.models.list()
first_served_model = models.data[0]
teacher_model = first_served_model.id
teacher_model  # displayed so you can confirm this is the correct model

'/home/ec2-user/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1'

# Importing and preparing classification dataset

In [6]:
# Load the classification dataset from HuggingFace
DATASET_ID = "argilla/synthetic-text-classification-news"
Dataset = load_dataset(DATASET_ID)
print(Dataset)  # show the splits, columns and row counts

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})


In [7]:
# SDG requires a dataset with a 'document' column.
# The rename is guarded so this cell can be re-run safely: once 'text' has been
# renamed, asking to rename it a second time would fail because it no longer exists.
if "text" in Dataset["train"].column_names:
    Dataset = Dataset.rename_column("text", "document")
print(Dataset)  # print details of the dataset


DatasetDict({
    train: Dataset({
        features: ['document', 'label'],
        num_rows: 100
    })
})


Identify the unique categories in the dataset to include in the annotation yaml file

In [8]:
# The 'label' feature carries the category names; the integer stored in each
# row is used elsewhere in this notebook as an index into this list.
label_feature = Dataset['train'].features['label']
labels = label_feature.names
labels

['science',
 'technology',
 'business',
 'health',
 'entertainment',
 'environment',
 'sports',
 'politics']

Create the annotation pipeline YAML configuration to leverage guided decoding, and include the labels under the `guided_choice` key (inside `extra_body`) like so:

In [40]:
# Assemble the pipeline definition from the inside out.

# Generation arguments forwarded with each request
generation_kwargs = {
    "max_tokens": 20,
    "temperature": 0,
    # guided decoding: the allowed choices are the dataset's category names
    "extra_body": {"guided_choice": labels},
}

# The single annotation block of the pipeline
annotation_block = {
    "name": "annotation",
    "type": "LLMBlock",
    "config": {
        "config_path": "annotation_config.yaml",
        "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "output_cols": ["output"],
        "gen_kwargs": generation_kwargs,
    },
    "drop_duplicates": ["document"],
}

yaml_config = {"version": "1.0", "blocks": [annotation_block]}

# Persist the definition; this file is later loaded to build the annotation pipeline
with open('annotation_pipeline.yaml', 'w') as pipeline_file:
    yaml.dump(yaml_config, pipeline_file, default_flow_style=False)

Split the dataset into ICL (In-Context Learning) examples, validation and unlabeled set

In [41]:
K = 3  # number of ICL examples
N = 7  # number of validation examples

# Carve the train split into three consecutive, non-overlapping slices:
# [0, K) -> ICL, [K, K+N) -> validation, [K+N, end) -> unlabeled
train_split = Dataset['train']
icl_examples = train_split.select(range(K))
validation_examples = train_split.select(range(K, K + N))
unlabeled_examples = train_split.select(range(K + N, len(train_split)))

for split_name, split_rows in (
    ("ICL", icl_examples),
    ("Validation", validation_examples),
    ("Unlabeled", unlabeled_examples),
):
    print(f"{split_name} examples: {len(split_rows)}")

ICL examples: 3
Validation examples: 7
Unlabeled examples: 90


In [42]:
# Print every in-context example (one row dict per line)
for row_idx in range(len(icl_examples)):
    print(icl_examples[row_idx])


{'document': 'A star-studded cast, including Leonardo DiCaprio and Jennifer Lawrence, has been announced for the upcoming biographical drama film about the life of the famous musician, Elvis Presley. The movie, directed by Baz Luhrmann, is set to release in summer 2024 and promises to be a musical spectacle.', 'label': 4}
{'document': 'The recent study suggests a correlation between increased sugar consumption and a higher risk of cardiovascular disease, but more research is needed to confirm the causal relationship. The American Heart Association recommends that adults limit their daily sugar intake to 25 grams.', 'label': 3}
{'document': 'Rising sea levels threaten the homes of millions of people worldwide, with coastal cities like Miami and Bangkok already experiencing frequent flooding. Climate change is the main culprit behind this phenomenon, as the melting of polar ice caps contributes to a global increase in ocean levels. In response, governments are investing in sea walls and 

Create annotation config YAML including the ICL examples in it like so:

In [43]:
# Render each ICL example as an "Input / Label" pair, translating the integer
# label into its category name
icl_blocks = []
for example in icl_examples:
    category = labels[int(example['label'])]
    icl_blocks.append(f"Input: {example['document']}\nLabel: {category}")

examples_text = (
    "Here are some examples to help you understand the annotation task better:\n\n"
    + "\n\n".join(icl_blocks)
)

principles_text = """
- The category should be based on the context of the text
- Use the single word category that best describes the text
- Use the category that is most dominant in the given text"""

# Prompt configuration for the annotation block
annotation_config = {
    "system": "You are an expert in annotation. You will be given a text and you need to annotate it with the appropriate category based on the context of the text.",
    "introduction": "Task Description: annotate the following text with the appropriate category based on the context of the text.",
    "principles": principles_text,
    "examples": examples_text,
    "generation": "Here is the query for annotation:\n{{document}}",
    "start_tags": [""],
    "end_tags": [""],
}

# Write the prompt configuration next to the pipeline YAML
with open('annotation_config.yaml', 'w') as config_file:
    yaml.dump(annotation_config, config_file, default_flow_style=False)

## Initialize pipeline context and annotation block

In [50]:

# Context shared by the pipeline: the OpenAI-compatible client plus the served model's id
ctx = PipelineContext(client=client, model_family="mixtral", model_id=teacher_model)

# 'annotation_pipeline.yaml' is written to the current working directory earlier in
# this notebook, so resolve it from there. The previous logic took the *parent* of
# the working directory and re-appended a hard-coded "annotation" folder, which only
# pointed at the right file when the notebook's directory happened to have that name.
pipeline_yaml = os.path.abspath("annotation_pipeline.yaml")

# Build the annotation pipeline from the YAML definition
annotation_pipe = Pipeline.from_file(ctx, pipeline_yaml)

Main Driver Code

In [51]:
# Run the annotation pipeline over the unlabeled split and keep the result in gen_data
gen_data = annotation_pipe.generate(unlabeled_examples)

Check output features

In [52]:
# Display the column schema of the generated dataset
gen_data.features

{'document': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'output': Value(dtype='string', id=None)}

Print generated samples with true and predicted labels

In [53]:
# Show each document alongside its ground-truth category name and the generated label
for sample in gen_data:
    true_name = labels[int(sample['label'])]
    predicted_name = sample['output']
    print("\ndocument: ", sample['document'], "\ntrue label: ", true_name, "\npredicted label: ", predicted_name)



document:  The New York Yankees defeated the Boston Red Sox 5-2 in a highly anticipated matchup at Yankee Stadium. Aaron Judge hit a home run in the bottom of the 8th inning, securing the win for the Yankees. 
true label:  sports 
predicted label:  sports

document:  The global economic downturn is largely attributed to the unstable global trade policies and the rising costs of raw materials, which has led to a sharp decline in consumer spending and a corresponding decrease in the demand for luxury goods. As a result, many high-end retailers are experiencing significant financial losses, prompting some to reconsider their business strategies. 
true label:  business 
predicted label:  business

document:  The Los Angeles Lakers' star player suffered a season-ending injury during a physical altercation with a teammate in practice, leaving the team's chances at the playoffs in jeopardy. 
true label:  sports 
predicted label:  business

document:  Rising global temperatures are melting po

## Saving results to JSONL format

Note: Change the save path to your desired path

In [54]:
def _true_label_to_name(row):
    """Return the row's 'true_label' converted from an integer index to its name in `labels`."""
    return {'true_label': labels[int(row['true_label'])]}


# Give the ground-truth and generated columns self-explanatory names
gen_data = gen_data.rename_column('label', 'true_label').rename_column('output', 'predicted_label')

# Replace the numeric ground-truth labels with their string names
gen_data = gen_data.map(_true_label_to_name)

# Export as JSON Lines (one record per line)
gen_data.to_json('annotation_results.jsonl', lines=True, orient='records')

Map: 100%|██████████| 90/90 [00:00<00:00, 17547.76 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1013.85ba/s]


34352

## Calculate metrics

In [55]:
# Import necessary libraries
import json
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)

# Read the saved annotations back: one JSON record per line
with open('annotation_results.jsonl', 'r') as results_file:
    records = [json.loads(line) for line in results_file]

true_labels = [record['true_label'] for record in records]
pred_labels = [record['predicted_label'] for record in records]

# Overall accuracy
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {accuracy:.2%}")

# Per-class precision, recall, F1 and support, reported in the order of `labels`
precision, recall, f1, support = precision_recall_fscore_support(
    true_labels, pred_labels, average=None, labels=labels
)

print("\nPer-class Metrics:")
print("Class\t\tPrecision\tRecall\t\tF1\t\tSupport")
print("-" * 70)
for class_name, class_p, class_r, class_f1, class_n in zip(labels, precision, recall, f1, support):
    print(f"{class_name:<12}\t{class_p:.2f}\t\t{class_r:.2f}\t\t{class_f1:.2f}\t\t{class_n}")

# Macro and weighted averages (the fourth returned value is not used here)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    true_labels, pred_labels, average='macro'
)
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
    true_labels, pred_labels, average='weighted'
)

print("\nOverall Metrics:")
print(f"Macro Avg:\t{macro_precision:.2f}\t\t{macro_recall:.2f}\t\t{macro_f1:.2f}")
print(f"Weighted Avg:\t{weighted_precision:.2f}\t\t{weighted_recall:.2f}\t\t{weighted_f1:.2f}")

# Full text report from scikit-learn
print("\nDetailed Classification Report:")
print(classification_report(true_labels, pred_labels))

# Confusion matrix with rows/columns ordered like `labels`
cm = confusion_matrix(true_labels, pred_labels, labels=labels)
print("\nConfusion Matrix:")
print("Labels:", labels)
print(cm)

Accuracy: 91.11%

Per-class Metrics:
Class		Precision	Recall		F1		Support
----------------------------------------------------------------------
science     	1.00		0.67		0.80		6
technology  	0.78		1.00		0.88		7
business    	0.77		1.00		0.87		10
health      	1.00		1.00		1.00		11
entertainment	0.95		0.90		0.92		20
environment 	0.92		1.00		0.96		23
sports      	1.00		0.57		0.73		7
politics    	1.00		0.83		0.91		6

Overall Metrics:
Macro Avg:	0.93		0.87		0.88
Weighted Avg:	0.92		0.91		0.91

Detailed Classification Report:
               precision    recall  f1-score   support

     business       0.77      1.00      0.87        10
entertainment       0.95      0.90      0.92        20
  environment       0.92      1.00      0.96        23
       health       1.00      1.00      1.00        11
     politics       1.00      0.83      0.91         6
      science       1.00      0.67      0.80         6
       sports       1.00      0.57      0.73         7
   technology       0.78      1.00 