# Setup

In [1]:
# Show the GPU(s) visible to this kernel before the model is loaded.
!nvidia-smi

Fri Nov 22 03:07:43 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:0A:00.0 Off |                    0 |
| N/A   37C    P0             44W /  300W |       1MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install langchain langchain-huggingface bitsandbytes datasets matplotlib scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import getpass
import os
import random
import re
import matplotlib.pyplot as plt
import numpy as np

from time import time
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from datasets import load_dataset
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

In [6]:
# Prompt for the Hugging Face token only when the environment does not already carry it;
# getpass keeps the typed value out of the notebook output.
api_env_key = "HUGGINGFACEHUB_API_TOKEN"
if api_env_key not in os.environ:
    os.environ[api_env_key] = getpass.getpass("Enter your Hugging Face API key: ")

Enter your Hugging Face API key:  ········


In [7]:
# Build a local text-generation pipeline for Llama 3.1 8B Instruct on device 0.
# NOTE(review): do_sample=True enables sampled decoding, so evaluation numbers may
# differ between runs — confirm that this is intended for a classification benchmark.
llm = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    device=0,
    pipeline_kwargs={
        "max_new_tokens": 512,
        "do_sample": True,
        "repetition_penalty": 1.1,
        # Same value as chat_model.llm.pipeline.tokenizer.eos_token_id (printed in a later cell).
        "pad_token_id": 128009,
    },
)

2024-11-22 03:07:59.554442: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-22 03:07:59.575670: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-22 03:07:59.582862: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-22 03:07:59.600858: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Wrap the local pipeline in the ChatHuggingFace chat-model interface.
chat_model = ChatHuggingFace(llm=llm)
# Reuse the tokenizer's end-of-sequence id as its padding id.
chat_model.llm.pipeline.tokenizer.pad_token_id = chat_model.llm.pipeline.tokenizer.eos_token_id

In [9]:
# Show the tokenizer's end-of-sequence id followed by its Python type, one per line.
print(
    chat_model.llm.pipeline.tokenizer.eos_token_id,
    type(chat_model.llm.pipeline.tokenizer.eos_token_id),
    sep="\n",
)

128009
<class 'int'>


In [10]:
# Load SST-2 and keep a handle on every split this notebook uses.
dataset = load_dataset("sst2")

train_set = dataset["train"]
# The evaluation cells call evaluate_model(validation_set, ...); without this
# assignment they raise NameError on a fresh kernel.
validation_set = dataset["validation"]
test_set = dataset["test"]

# Label id -> lowercase class name. The prompt builders, parse_sentiment and the
# report helpers all read this mapping; the names are lowercase because
# parse_sentiment lowercases the model's answer before comparing.
label_map = {0: "negative", 1: "positive"}

print(train_set[:5])
print(test_set[:5])

{'idx': [0, 1, 2, 3, 4], 'sentence': ['hide new secretions from the parental units ', 'contains no wit , only labored gags ', 'that loves its characters and communicates something rather beautiful about human nature ', 'remains utterly satisfied to remain the same throughout ', 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up '], 'label': [0, 0, 1, 0, 0]}
{'idx': [0, 1, 2, 3, 4], 'sentence': ["it 's a charming and often affecting journey . ", 'unflinchingly bleak and desperate ', 'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ', "the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . ", "it 's slow -- very , very slow . "], 'label': [1, 0, 1, 1, 0]}


In [11]:
# Filter positive and negative examples
# Split the training examples by label (1 vs. 0), preserving dataset order.
positive_examples = list(filter(lambda example: example["label"] == 1, train_set))
negative_examples = list(filter(lambda example: example["label"] == 0, train_set))

# Fixed seed so the same two examples per class are drawn on every run.
random.seed(42)
selected_positive = random.sample(positive_examples, 2)
selected_negative = random.sample(negative_examples, 2)

# Positives first, then negatives.
few_shot_examples = [*selected_positive, *selected_negative]

print(few_shot_examples)

[{'idx': 13189, 'sentence': 'at every opportunity to do something clever ', 'label': 1}, {'idx': 3002, 'sentence': 'most wondrously gifted artists ', 'label': 1}, {'idx': 55051, 'sentence': 'the one not-so-small problem with expecting is that the entire exercise has no real point . ', 'label': 0}, {'idx': 20225, 'sentence': 'only is entry number twenty the worst of the brosnan bunch ', 'label': 0}]


In [12]:
# Plain few-shot block: one "Sentence / Sentiment" pair per selected example.
examples_text = "\n".join(
    map(
        lambda example: (
            f"Sentence: {example['sentence']}\n"
            f"Sentiment: {'Positive' if example['label'] == 1 else 'Negative'}"
        ),
        few_shot_examples,
    )
)

# Chain-of-thought variant: the same pairs with a canned "Reasoning" line in between.
examples_text_cot = "\n".join(
    map(
        lambda example: (
            f"Sentence: {example['sentence']}\n"
            f"Reasoning: {'This sentence has positive language and expresses satisfaction or praise.' if example['label'] == 1 else 'This sentence contains negative language or expresses dissatisfaction.'}\n"
            f"Sentiment: {'Positive' if example['label'] == 1 else 'Negative'}"
        ),
        few_shot_examples,
    )
)

print(examples_text, examples_text_cot, sep="\n")

Sentence: at every opportunity to do something clever 
Sentiment: Positive
Sentence: most wondrously gifted artists 
Sentiment: Positive
Sentence: the one not-so-small problem with expecting is that the entire exercise has no real point . 
Sentiment: Negative
Sentence: only is entry number twenty the worst of the brosnan bunch 
Sentiment: Negative
Sentence: at every opportunity to do something clever 
Reasoning: This sentence has positive language and expresses satisfaction or praise.
Sentiment: Positive
Sentence: most wondrously gifted artists 
Reasoning: This sentence has positive language and expresses satisfaction or praise.
Sentiment: Positive
Sentence: the one not-so-small problem with expecting is that the entire exercise has no real point . 
Reasoning: This sentence contains negative language or expresses dissatisfaction.
Sentiment: Negative
Sentence: only is entry number twenty the worst of the brosnan bunch 
Reasoning: This sentence contains negative language or expresses dis

# Functions

In [28]:
def zero_shot(content: str, examples=None):
    """Classify ``content`` using an instruction-only prompt.

    Args:
        content: Sentence sent as the human message.
        examples: Accepted so the signature lines up with ``few_shot``; never read.

    Returns:
        The ``content`` attribute of the object returned by ``chat_model.invoke``.
    """
    categories = ', '.join(label_map.values())
    system_prompt = (
        f"Your goal is to read a sentence and classify its emotion into one of the following categories: {categories}.\n\n"
        "Now classify the following sentence. The output MUST follow this format:\n"
        "Sentiment: [Classification]"
    )

    reply = chat_model.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=content),
    ])
    return reply.content

def few_shot(content: str, examples: str):
    """Classify ``content`` using a prompt that embeds worked examples.

    Args:
        content: Sentence sent as the human message.
        examples: Pre-formatted example text inserted verbatim into the system prompt.

    Returns:
        The ``content`` attribute of the object returned by ``chat_model.invoke``.
    """
    categories = ', '.join(label_map.values())
    system_prompt = (
        f"Your goal is to read a sentence and classify its emotion into one of the following categories: {categories}.\n\n"
        f"Here are some examples:\n{examples}\n\n"
        "Now classify the following sentence. The output MUST follow this format:\n"
        "Sentiment: [Classification]"
    )

    reply = chat_model.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=content),
    ])
    return reply.content

def parse_sentiment(response):
    """Extract the predicted label from a model response.

    Looks for lines of the form ``Sentiment: <label>`` and uses the LAST one, so
    the answer is still found when the response echoes the prompt (which itself
    contains ``Sentiment:`` lines) or when the model adds text after its answer.
    Surrounding brackets, quotes, asterisks and trailing punctuation are removed
    before the label is lowercased and checked against ``label_map.values()``.

    Args:
        response: Raw text returned by the model.

    Returns:
        The lowercase label if it is one of ``label_map.values()``, otherwise
        the string ``"invalid"``.

    Note:
        Only the last ``Sentiment:`` line is considered. If the model does not
        answer in the requested format, that line comes from the echoed prompt;
        with the prompts in this notebook it is the format placeholder, which
        is rejected as ``"invalid"``.
    """
    # re.MULTILINE makes `$` match at every line end; without it the pattern only
    # matches a "Sentiment:" line that is the very last line of the response.
    matches = re.findall(
        r"Sentiment:\s*(.*?)$", response, flags=re.MULTILINE | re.IGNORECASE
    )
    if not matches:
        return "invalid"

    # Tolerate decorations such as "[Positive]", "**Positive**" or "Positive."
    sentiment = matches[-1].strip().strip("[]()'\"`*.,!").strip().lower()

    if sentiment not in set(label_map.values()):
        return "invalid"
    return sentiment

def evaluate_model(dataset, fn, examples=None):
    """Run ``fn`` on every sentence of ``dataset`` and collect the parsed labels.

    Args:
        dataset: Object indexable by ``"sentence"`` and ``"label"``, each
            yielding a sequence with one entry per example.
        fn: Callable invoked as ``fn(sentence, examples)`` that returns the
            model's raw response text.
        examples: Passed through to ``fn`` unchanged. Defaults to ``None`` so
            prompts that need no examples can be evaluated with
            ``evaluate_model(dataset, fn)``.

    Returns:
        A list with one entry per example: the value ``parse_sentiment``
        returned for that example's response.
    """
    t0 = time()
    sentences = dataset["sentence"]
    true_labels = dataset["label"]
    # Count rows from the column itself so plain dict inputs report the right total.
    total = len(sentences)
    predictions = []

    for idx, (sentence, true_label) in enumerate(zip(sentences, true_labels), 1):
        prediction = fn(sentence, examples)
        sentiment = parse_sentiment(prediction)

        predictions.append(sentiment)

        # Print a sample and the elapsed time every 100 examples.
        if idx % 100 == 0:
            print(f"---\nSentence: {sentence}\nTrue: {true_label}\nPrediction: {sentiment}")
            print(f"Processed {idx}/{total} examples, Time: {time()-t0:.3f}\n---\n")

    return predictions

def create_reports(dataset, predictions):
    """Produce every report for one evaluation run.

    Calls, in order, ``calc_accuracy``, ``class_report``, ``conf_matrix`` and
    ``acc_graph``, each with ``(dataset, predictions)``.
    """
    for report in (calc_accuracy, class_report, conf_matrix, acc_graph):
        report(dataset, predictions)

def calc_accuracy(dataset, predictions):
    """Print the percentage of predictions that match the true labels.

    ``parse_sentiment`` produces label names (strings), so true labels that are
    keys of ``label_map`` (for example integer class ids) are translated to
    their names first. Labels that are not keys of ``label_map`` are compared
    as they are.

    Args:
        dataset: Object indexable by ``"label"``.
        predictions: Predicted label names, one per example.
    """
    true_labels = [label_map.get(label, label) for label in dataset["label"]]

    # Avoid a ZeroDivisionError when there is nothing to score.
    if not true_labels:
        print("accuracy: n/a (no examples)")
        return

    correct = sum(p == t for p, t in zip(predictions, true_labels))
    accuracy = correct / len(true_labels) * 100
    print(f"accuracy: {accuracy:.4f}")

def class_report(dataset, predictions):
    """Print scikit-learn's classification report for the predictions.

    True labels that are keys of ``label_map`` are translated to their names so
    they are comparable with the string predictions. ``labels`` is passed
    together with ``target_names`` so that each row is tied to the intended
    class, and so that the report still works when a class (such as
    ``"invalid"``) never occurs.

    Args:
        dataset: Object indexable by ``"label"``.
        predictions: Predicted label names, one per example.
    """
    class_names = list(label_map.values()) + ["invalid"]
    true_labels = [label_map.get(label, label) for label in dataset["label"]]
    print(classification_report(
            true_labels,
            predictions,
            labels=class_names,
            target_names=class_names,
            zero_division=0))

def conf_matrix(dataset, predictions):
    """Plot the confusion matrix of true labels against predictions.

    True labels that are keys of ``label_map`` are translated to their names so
    they are comparable with the string predictions. The same ordered class
    list is passed as ``labels`` to ``confusion_matrix`` and as
    ``display_labels`` to the plot, so the axis ticks always line up with the
    matrix rows and columns.

    Args:
        dataset: Object indexable by ``"label"``.
        predictions: Predicted label names, one per example.
    """
    class_names = list(label_map.values()) + ["invalid"]
    true_labels = [label_map.get(label, label) for label in dataset["label"]]
    cm = confusion_matrix(true_labels, predictions, labels=class_names)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot()
    plt.show()

def acc_graph(dataset, predictions):
    """Plot the running accuracy (in percent) after each example.

    True labels that are keys of ``label_map`` are translated to their names so
    they are comparable with the string predictions. Point ``i`` of the curve
    is the share of correct predictions among the first ``i + 1`` examples.

    Args:
        dataset: Object indexable by ``"label"``.
        predictions: Predicted label names, one per example.
    """
    true_labels = [label_map.get(label, label) for label in dataset["label"]]
    # Compare pairwise in Python so mixed label types cannot change the array dtype.
    hits = np.array([p == t for p, t in zip(predictions, true_labels)], dtype=bool)
    accuracies = np.cumsum(hits) / np.arange(1, len(hits) + 1)
    plt.figure(figsize=(8, 5))
    plt.plot(accuracies * 100)
    plt.title("Accuracy Over Samples")
    plt.xlabel("Number of Samples")
    plt.ylabel("Accuracy (%)")
    plt.grid()
    plt.show()

# Zero-Shot Prompting Preview

In [13]:
# Smoke-test the zero-shot prompt on two hand-written sentences.
for preview_sentence in (
    "The food was absolutely amazing and delightful!",
    "The experience was the worst I've ever had.",
):
    ai_msg = zero_shot(preview_sentence)
    print(ai_msg)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your goal is to read a sentence and determine whether the sentiment is 'Positive' or 'Negative'. You will only respond with 'Positive' or 'Negative'.

Now classify the following sentence:<|eot_id|><|start_header_id|>user<|end_header_id|>

The food was absolutely amazing and delightful!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Positive
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your goal is to read a sentence and determine whether the sentiment is 'Positive' or 'Negative'. You will only respond with 'Positive' or 'Negative'.

Now classify the following sentence:<|eot_id|><|start_header_id|>user<|end_header_id|>

The experience was the worst I've ever had.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Negative


# Few-Shot Prompting Preview

In [14]:
# Smoke-test the few-shot prompt (plain examples) on two hand-written sentences.
for preview_sentence in (
    "The food was absolutely amazing and delightful!",
    "The experience was the worst I've ever had.",
):
    ai_msg = few_shot(preview_sentence, examples_text)
    print(ai_msg)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your goal is to read a sentence and determine whether the sentiment is 'Positive' or 'Negative'. You will only respond with 'Positive' or 'Negative'. Here are some examples:

Sentence: at every opportunity to do something clever 
Sentiment: Positive
Sentence: most wondrously gifted artists 
Sentiment: Positive
Sentence: the one not-so-small problem with expecting is that the entire exercise has no real point . 
Sentiment: Negative
Sentence: only is entry number twenty the worst of the brosnan bunch 
Sentiment: Negative

Now classify the following sentence:<|eot_id|><|start_header_id|>user<|end_header_id|>

The food was absolutely amazing and delightful!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Positive
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your goal is to read a se

# Few-Shot with Chain-of-Thought Preview

In [15]:
# Smoke-test the chain-of-thought few-shot prompt on two hand-written sentences.
# NOTE(review): few_shot_cot is not defined in any cell visible here — confirm it
# exists before relying on Restart & Run All.
for preview_sentence in (
    "The food was absolutely amazing and delightful!",
    "The experience was the worst I've ever had.",
):
    ai_msg = few_shot_cot(preview_sentence, examples_text_cot)
    print(ai_msg)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Your goal is to read a sentence and determine whether the sentiment is 'Positive' or 'Negative'. You will only respond with 'Positive' or 'Negative'. Here are some examples:

Sentence: at every opportunity to do something clever 
Reasoning: This sentence has positive language and expresses satisfaction or praise.
Sentiment: Positive
Sentence: most wondrously gifted artists 
Reasoning: This sentence has positive language and expresses satisfaction or praise.
Sentiment: Positive
Sentence: the one not-so-small problem with expecting is that the entire exercise has no real point . 
Reasoning: This sentence contains negative language or expresses dissatisfaction.
Sentiment: Negative
Sentence: only is entry number twenty the worst of the brosnan bunch 
Reasoning: This sentence contains negative language or expresses dissatisfaction.
Sentiment: Negative

Now classify the

# Zero-Shot Evaluation

In [25]:
# evaluate_model takes the examples as its third argument; zero-shot prompting uses none.
zero_shot_results = evaluate_model(validation_set, zero_shot, None)

Processed 50/872 examples: Correct=41, Incorrect=9
Processed 100/872 examples: Correct=81, Incorrect=19
Processed 150/872 examples: Correct=123, Incorrect=27
Processed 200/872 examples: Correct=165, Incorrect=35
Processed 250/872 examples: Correct=201, Incorrect=49
Processed 300/872 examples: Correct=239, Incorrect=61
Processed 350/872 examples: Correct=279, Incorrect=71
Processed 400/872 examples: Correct=327, Incorrect=73
Processed 450/872 examples: Correct=368, Incorrect=82
Processed 500/872 examples: Correct=407, Incorrect=93
Processed 550/872 examples: Correct=446, Incorrect=104
Processed 600/872 examples: Correct=493, Incorrect=107
Processed 650/872 examples: Correct=537, Incorrect=113
Processed 700/872 examples: Correct=575, Incorrect=125
Processed 750/872 examples: Correct=620, Incorrect=130
Processed 800/872 examples: Correct=656, Incorrect=144
Processed 850/872 examples: Correct=700, Incorrect=150
Final Results: Correct=717, Incorrect=155, Accuracy=0.82225


# Few-Shot Evaluation

In [29]:
# Pass the plain few-shot examples explicitly: few_shot embeds them in its system prompt.
few_shot_results = evaluate_model(validation_set, few_shot, examples_text)

Processed 50/872 examples: Correct=48, Incorrect=2
Processed 100/872 examples: Correct=94, Incorrect=6
Processed 150/872 examples: Correct=139, Incorrect=11
Processed 200/872 examples: Correct=186, Incorrect=14
Processed 250/872 examples: Correct=231, Incorrect=19
Processed 300/872 examples: Correct=277, Incorrect=23
Processed 350/872 examples: Correct=323, Incorrect=27
Processed 400/872 examples: Correct=372, Incorrect=28
Processed 450/872 examples: Correct=419, Incorrect=31
Processed 500/872 examples: Correct=465, Incorrect=35
Processed 550/872 examples: Correct=508, Incorrect=42
Processed 600/872 examples: Correct=557, Incorrect=43
Processed 650/872 examples: Correct=605, Incorrect=45
Processed 700/872 examples: Correct=648, Incorrect=52
Processed 750/872 examples: Correct=696, Incorrect=54
Processed 800/872 examples: Correct=741, Incorrect=59
Processed 850/872 examples: Correct=790, Incorrect=60
Final Results: Correct=809, Incorrect=63, Accuracy=0.92775


# Few-Shot Chain-of-Thought Evaluation

In [30]:
# Pass the chain-of-thought examples explicitly, matching how the preview cell calls few_shot_cot.
# NOTE(review): few_shot_cot is not defined in any cell visible here — confirm it exists.
few_shot_cot_results = evaluate_model(validation_set, few_shot_cot, examples_text_cot)

Processed 50/872 examples: Correct=48, Incorrect=2
Processed 100/872 examples: Correct=93, Incorrect=7
Processed 150/872 examples: Correct=138, Incorrect=12
Processed 200/872 examples: Correct=184, Incorrect=16
Processed 250/872 examples: Correct=227, Incorrect=23
Processed 300/872 examples: Correct=270, Incorrect=30
Processed 350/872 examples: Correct=316, Incorrect=34
Processed 400/872 examples: Correct=364, Incorrect=36
Processed 450/872 examples: Correct=409, Incorrect=41
Processed 500/872 examples: Correct=453, Incorrect=47
Processed 550/872 examples: Correct=495, Incorrect=55
Processed 600/872 examples: Correct=544, Incorrect=56
Processed 650/872 examples: Correct=592, Incorrect=58
Processed 700/872 examples: Correct=634, Incorrect=66
Processed 750/872 examples: Correct=682, Incorrect=68
Processed 800/872 examples: Correct=724, Incorrect=76
Processed 850/872 examples: Correct=773, Incorrect=77
Final Results: Correct=792, Incorrect=80, Accuracy=0.90826
