In [None]:
from ICL_modules import data_loader, dataset_interface, s_random, experiment_basics, functions
from ICL_inference import inference
from ICL_calibrations import calibration_methods,new_calib
from run import run_multiple_calibration_experiments_generic
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    LogitsProcessor
)
import random
import torch
import numpy as np
import pandas as pd
import itertools
import functools
import copy

# Intro to Package

In [None]:
# Load the SST-2 sentiment analysis dataset (GLUE benchmark) through the
# project's `data_loader.glue_sst2` helper and keep it as `sst_2` for the cells below.
# NOTE(review): `from_cache=True` presumably reuses a locally cached copy when one
# is available instead of fetching the data again — confirm against `data_loader`.
sst_2 = data_loader.glue_sst2(from_cache=True)

In [None]:
# Show the label names this dataset can take.
# For SST-2 these are the two sentiment classes: 'negative' and 'positive'.
sst_2.get_label_space()

['negative', 'positive']

In [None]:
# Look up the integer index assigned to each label name.
# Integer labels are what you need whenever a model or metric expects class ids;
# the tuple is ordered as (index of 'negative', index of 'positive').
tuple(sst_2.find_index_from_label(label_name) for label_name in ('negative', 'positive'))

(0, 1)

In [None]:
# Inverse lookup: turn each label index back into its text label.
# Handy when interpreting model predictions or displaying results;
# the tuple is ordered as (text for index 0, text for index 1).
tuple(sst_2.label_index_to_text(label_idx) for label_idx in (0, 1))

('negative', 'positive')

In [None]:
# Preview the first four samples of the dataset.
# `get_dataset()` yields entries of the form ([text], label_index); each entry is one sample:
#   - [text]: a one-element list containing the sentence string
#   - label_index: 0 for 'negative', 1 for 'positive'
sst_2.get_dataset()[:4]

[([', painful , obnoxious '], 0),
 (['the lady and the duke surprisingly '], 1),
 (['hoping for a stiff wind '], 0),
 (['woven together skilfully '], 1)]

In [None]:
# Access the 4th data example (index 3) in the dataset.
# get_input_text(3): returns the text of the 4th sample as a one-element list
# get_label(3): returns the label of the 4th sample as *text* (e.g. 'positive'),
#               not as the 0/1 index stored by get_dataset()
sst_2.get_input_text(3), sst_2.get_label(3)

(['woven together skilfully '], 'positive')

In [None]:
# Few-shot configuration: number of in-context examples per prompt (k-shot)
# and the seed handed to the splitter below.
k = 4
seed = 107

# Hold out a fixed number of test samples; every remaining sample becomes
# part of the demonstration pool (total dataset size minus test size).
test_samples = 512
dem_samples = len(sst_2) - test_samples

# Build the DatasetSplitter that exposes the `.demonstration` and `.test` parts used below.
# NOTE(review): `seed` presumably makes the split reproducible — confirm in `dataset_interface`.
splitted_sst2 = dataset_interface.DatasetSplitter(sst_2, dem_samples, test_samples, seed)

In [None]:
# Preview the first four entries of the *demonstration pool* used for k-shot
# in-context learning (ICL).
# For example, with k=4, four examples (text + label) are sampled from this pool and
# shown to the model as demonstrations, mimicking a few-shot learning setup.
# Note: this slice is only a preview of the pool, not the sampled k-shot set itself.
# Important: once the k demonstrations are sampled, the rest of this portion of the
# data is no longer used or accessed, so the ICL context effectively holds only
# 4 training examples.
splitted_sst2.demonstration.get_dataset()[:4]

[(['delightfully rendered '], 1),
 (['bewilderingly brilliant and '], 1),
 (['a beyond-lame satire , '], 0),
 (['engrossing portrait '], 1)]

In [None]:
# Preview the first four entries of the held-out test set.
# These samples are kept apart from the demonstration pool, so they are never used
# as in-context examples. After few-shot inference (e.g. via in-context learning),
# the model's performance is evaluated on this set.
splitted_sst2.test.get_dataset()[:4]

[(['with weak dialogue and biopic '], 0),
 (['more bizarre than '], 0),
 (['a charming , funny and beautifully crafted import '], 1),
 (['are the difference between this and countless other flicks about guys and dolls '],
  1)]

In [None]:
# Text label ('positive' or 'negative') of the first sample (index 0) in each split,
# displayed as (test-set label, demonstration-set label).
(
    splitted_sst2.test.get_label(0),
    splitted_sst2.demonstration.get_label(0),
)

('negative', 'positive')

In [None]:
# Get the *ground truth label* of a specific sample.
# The `test_set` flag chooses which split to read from:
#   - test_set=True  -> the test set
#   - test_set=False -> the non-test split (the demonstration / training pool)
#
# Both lookups are combined into ONE tuple so the cell displays both labels,
# ordered as (test-set label, training-set label). Written as two separate
# statements, only the value of the last one would be shown by the notebook.
(
    # Label of the 1st sample in the test set
    splitted_sst2.get_ground_truth_label(0, test_set=True),
    # Label of the 1st sample in the training set
    splitted_sst2.get_ground_truth_label(0, test_set=False),
)

'positive'

In [None]:
# Build a k-shot in-context learning (ICL) prompt by hand with `prompt_writter()`.
#
# Purpose:
#   The prompt shows the language model a few labelled examples and then asks it
#   to predict the sentiment of one unlabelled test sentence.
#
# First argument - the demonstrations, a list of ([input_text], label_text) pairs:
#   - input_text: a one-element list containing the sentence string
#   - label_text: the matching sentiment label ('positive' or 'negative')
#   The comprehension below walks over four fixed indices of the demonstration set,
#   [2019, 4761, 3483, 3952], and pairs each sample's text with its text label.
#   These indices were chosen randomly in experiments and play the role of the
#   k=4 examples shown to the model.
#
# Second argument - the query:
#   splitted_sst2.test.get_input_text(0), i.e. the text (as a list) of the first
#   test sample, which is the sentence we want the model to label.
#
# Resulting prompt (one line per example, last line left open):
#     sentence: demo1_text    sentiment: label1
#     sentence: demo2_text    sentiment: label2
#     ...
#     sentence: test_text     sentiment:
#
# The model is expected to complete the final line with its predicted sentiment.

splitted_sst2.prompt_writter(
    [
        (
            splitted_sst2.demonstration.get_input_text(demo_idx),
            splitted_sst2.demonstration.get_label(demo_idx),
        )
        for demo_idx in [2019, 4761, 3483, 3952]
    ],
    splitted_sst2.test.get_input_text(0),
)

'sentence: establishes itself as a durable part of the movie landscape : a james bond series for kids  sentiment: positive\nsentence: no-holds-barred cinematic  sentiment: positive\nsentence: this odd , poetic road movie , spiked by jolts of pop music  sentiment: positive\nsentence: comes the first lousy guy ritchie imitation .  sentiment: negative\nsentence: with weak dialogue and biopic  sentiment:  '

In [None]:
# Set up the SST-2 experiment from the split dataset and the k-shot configuration.
#
# `Experiment` is the project's utility class for running the ICL setup; it is meant to:
# - manage the overall experiment setup
# - build the prompts automatically
# - give access to the test samples
#
# Arguments:
#   splitted_sst2 : the dataset wrapper holding the demonstration and test splits
#   k             : the number of in-context examples to place in each prompt
#   seed          : the seed defined with the split configuration
#                   (NOTE(review): presumably drives demonstration sampling - confirm)
#
# With this object, prompts for many test samples can be produced systematically.

sst2_experiment = experiment_basics.Experiment(
    splitted_sst2,
    k=k,
    seed=seed,
)


In [None]:
# Generate the prompt for one specific test sample (here: test index 0).
#
# This method:
# - Picks the k demonstration examples automatically (no manually specified indices)
# - Retrieves the input sentence at test index 0
# - Constructs a full ICL prompt of the form:
#
#     sentence: demo1_text  sentiment: label1
#     sentence: demo2_text  sentiment: label2
#     ...
#     sentence: test_input_text  sentiment:
#
# The returned string can be sent to a language model, which is expected to
# complete the last line with the sentiment of the test sentence.
#
# Note: this replaces the manual pairing and formatting done with
# `prompt_writter()` in the earlier cell.

sst2_experiment.get_prompts_for_test_sample(0)

"sentence: a tendency to sag in certain places  sentiment: negative\nsentence: authentic christmas spirit  sentiment: positive\nsentence: mixed messages ,  sentiment: negative\nsentence: far from heaven is a dazzling conceptual feat , but more than that , it 's a work of enthralling drama .  sentiment: positive\nsentence: with weak dialogue and biopic  sentiment:  "