# Test the prompt with different models

## Test with DAMO-NLP-SG / zero-shot-classify-SSTuning-XLM-R 

https://huggingface.co/DAMO-NLP-SG/zero-shot-classify-SSTuning-XLM-R

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, string, random

In [None]:
# Checkpoint to evaluate. The trailing "# @param [...]" comment looks like a
# Colab form annotation listing the selectable SSTuning variants, so it must
# stay on the same line as the assignment.
model_name = "DAMO-NLP-SG/zero-shot-classify-SSTuning-XLM-R" # @param ["DAMO-NLP-SG/zero-shot-classify-SSTuning-base", "DAMO-NLP-SG/zero-shot-classify-SSTuning-large", "DAMO-NLP-SG/zero-shot-classify-SSTuning-ALBERT", "DAMO-NLP-SG/zero-shot-classify-SSTuning-XLM-R"]

# Tokenizer belonging to the selected checkpoint (used by add_prefix/check_text).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model for the selected checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Option letters 'A'..'Z' used to enumerate the candidate labels in the prompt.
list_ABC = list(string.ascii_uppercase)

def add_prefix(text, list_label, shuffle=False, num_options=2):
    """Build the option-prefixed prompt "(A) label. (B) label. <sep> text".

    Args:
        text: The sentence to classify.
        list_label: Candidate label strings.
        shuffle: When True, randomly reorder the options (labels and padding).
        num_options: Minimum number of options in the prompt; missing slots
            are filled with ``tokenizer.pad_token``. Defaults to 2, the value
            this notebook used.
            NOTE(review): the previous comment spoke of padding to 20
            options - confirm which value the checkpoint expects.

    Returns:
        A ``(prompt, options)`` tuple where ``options`` is the list of option
        strings in the order they appear in the prompt.

    Raises:
        IndexError: If there are more options than letters in ``list_ABC``.
    """
    # Terminate every label with '.' unless it already ends in '.' or '!'
    # (the original author notes this improves accuracy). ``endswith`` also
    # copes with an empty label, where ``x[-1]`` raised IndexError.
    options = [x if x.endswith(('.', '!')) else x + '.' for x in list_label]

    # Pad with the tokenizer's pad token up to ``num_options`` entries;
    # a non-positive multiplier simply adds nothing.
    options += [tokenizer.pad_token] * (num_options - len(options))

    if shuffle:
        random.shuffle(options)

    # "(A) first. (B) second. ..." - one letter per option.
    s_option = ' '.join(
        f'({list_ABC[i]}) {option}' for i, option in enumerate(options)
    )

    return f'{s_option} {tokenizer.sep_token} {text}', options

In [None]:
def check_text(model, text, list_label, shuffle=False):
    """Classify ``text`` against ``list_label`` and return the best label.

    Args:
        model: Sequence-classification model producing one logit per option.
        text: The sentence to classify.
        list_label: Candidate label strings.
        shuffle: Forwarded to ``add_prefix``; shuffles the option order.

    Returns:
        A ``(predicted_label, probability)`` tuple, where ``probability`` is
        the softmax probability of the prediction expressed in percent.
    """
    # Build the option-prefixed prompt; list_label_new is in prompt order.
    prompt, list_label_new = add_prefix(text, list_label, shuffle=shuffle)

    # Set the model to evaluation mode and move it to the appropriate device
    model.to(device).eval()

    encoding = tokenizer([prompt], truncation=True, max_length=512)
    item = {key: torch.tensor(val).to(device) for key, val in encoding.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**item).logits

    # Keep exactly one logit per option that can be predicted. Unshuffled,
    # the real labels occupy the first len(list_label) slots; shuffled, they
    # may sit anywhere among the options actually placed in the prompt.
    options = list_label_new if shuffle else list_label
    logits = logits[:, :len(options)]

    probs = torch.nn.functional.softmax(logits, dim=-1).tolist()
    predicted_index = torch.argmax(logits, dim=-1).item()

    # Look the index up in the list that matches the prompt order. Previously
    # the unshuffled list was always used, which returned the wrong label (or
    # raised IndexError) when shuffle=True.
    predicted_option = options[predicted_index]
    # add_prefix may have appended '.', so map back to the caller's spelling;
    # a padding option is returned unchanged.
    predicted_label = next(
        (label for label in list_label
         if predicted_option in (label, label + '.')),
        predicted_option,
    )

    probability = probs[0][predicted_index] * 100
    return predicted_label, probability


In [None]:
# Evaluate the SSTuning model loaded above on the whole test set.
import json

total_correct = 0
total_pairs = 0

# Load the JSONL file and process each pair.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open("textual_entailment-task1-test-data.jsonl", 'r', encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # a blank (e.g. trailing) line is not a record
        pair = json.loads(line)
        text = pair["text"]
        hypothesis = pair["hypothesis"]
        choices = pair["choices"]
        label = pair["label"]  # index of the correct entry in `choices`

        # Format the prompt with text and hypothesis
        prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

        # Use this section's model through check_text. The cell previously
        # called `classifier`, which is only created further down for a
        # different model.
        selected_label, _ = check_text(model, prompt_formatted, choices)

        # Accumulate statistics
        if selected_label == choices[label]:
            total_correct += 1
        total_pairs += 1

# Calculate overall statistics only if at least one pair was processed
if total_pairs > 0:
    accuracy = total_correct / total_pairs * 100

    # Print statistics
    print("Overall Statistics:")
    print("Total Pairs:", total_pairs)
    print("Total Correct:", total_correct)
    print("Accuracy:", round(accuracy, 2), "%")
else:
    print("No pairs found in the JSONL file.")

## Test with XLM-roBERTa-large-it-mnli

https://huggingface.co/Jiva/xlm-roberta-large-it-mnli

In [None]:
from transformers import pipeline
import torch

# Zero-shot classification pipeline for the Italian XLM-R MNLI checkpoint.
# Use the first GPU when CUDA is available and fall back to the CPU (-1)
# otherwise, so the cell no longer fails on a machine without a GPU.
# multi_label=True is passed at construction; the evaluation cells below
# override it per call with multi_label=False.
classifier = pipeline("zero-shot-classification",
                      model="Jiva/xlm-roberta-large-it-mnli",
                      device=0 if torch.cuda.is_available() else -1,
                      use_fast=True, multi_label=True)


In [None]:
# Smoke test: classify a single hand-written entailment prompt.
sequence_to_classify = "La frase 'Pieralfonso Fratta Pasini' è un imprenditore e un politico italiano. sostiene la frase 'Pieralfonso Fratta Pasini' è un imprenditore e politico italiano.?"
# Candidate labels in Italian. Each label must be its own list element;
# the previous single string "implicato, non implicato" was one label.
candidate_labels = ["implicato", "non implicato"]
classifier(sequence_to_classify, candidate_labels)
# The result is a dict with the keys 'sequence', 'labels' and 'scores'
# (the cells below read exactly these keys).


In [None]:
import json
import torch

# Function to process each pair in the JSONL file
def visualize_prompt_response(jsonl_file, max_pairs=5):
    """Print the `classifier` prediction for the first records of a JSONL file.

    Every non-blank line must be a JSON object with the keys "id", "text",
    "hypothesis", "choices" and "label", where "label" is the index of the
    correct entry in "choices".

    Args:
        jsonl_file: Path of the JSONL file to read.
        max_pairs: Maximum number of records to show (default 5).
    """
    from itertools import islice  # local import keeps the cell self-contained

    # Explicit UTF-8 so reading does not depend on the platform default.
    with open(jsonl_file, 'r', encoding="utf-8") as file:
        records = (json.loads(line) for line in file if line.strip())
        for pair in islice(records, max_pairs):
            choices = pair["choices"]
            text = pair["text"]
            hypothesis = pair["hypothesis"]

            # Format the prompt with text and hypothesis
            prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

            output = classifier(prompt_formatted, choices, multi_label=False)

            # Take the label with the highest score; unlike comparing only
            # scores[0] and scores[1] this also works for a single choice.
            scores = output['scores']
            top_score = max(scores)
            selected_label = output['labels'][scores.index(top_score)]

            # Print the required information ("Accuracy" is the top score).
            print("ID:", pair["id"])
            print("Sequence:", output['sequence'])
            print("Selected Label:", selected_label)
            print("Accuracy:", round(top_score, 3))
            print("Is Selected Label Correct:", selected_label == choices[pair["label"]])
            print()

jsonl_file = 'textual_entailment-task1-train-data.jsonl'
visualize_prompt_response(jsonl_file)

- Always getting a CUDA out-of-memory error.

In [None]:
# Evaluate the XLM-R zero-shot pipeline (`classifier`) on the whole test set.
import json

total_correct = 0
total_pairs = 0

# Load the JSONL file and process each pair.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open("textual_entailment-task1-test-data.jsonl", 'r', encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # a blank (e.g. trailing) line is not a record
        pair = json.loads(line)
        text = pair["text"]
        hypothesis = pair["hypothesis"]
        choices = pair["choices"]
        label = pair["label"]  # index of the correct entry in `choices`

        # Format the prompt with text and hypothesis
        prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

        output = classifier(prompt_formatted, choices, multi_label=False)

        # Take the label with the highest score; unlike comparing only
        # scores[0] and scores[1] this also works for a single choice.
        scores = output['scores']
        selected_label = output['labels'][scores.index(max(scores))]

        # Accumulate statistics
        if selected_label == choices[label]:
            total_correct += 1
        total_pairs += 1

# Calculate overall statistics only if at least one pair was processed
if total_pairs > 0:
    accuracy = total_correct / total_pairs * 100

    # Print statistics
    print("Overall Statistics:")
    print("Total Pairs:", total_pairs)
    print("Total Correct:", total_correct)
    print("Accuracy:", round(accuracy, 2), "%")
else:
    print("No pairs found in the JSONL file.")

## Test with comprehend-it-multilang-base

https://huggingface.co/knowledgator/comprehend_it-multilingual-t5-base

In [None]:
from liqfit.pipeline import ZeroShotClassificationPipeline
from liqfit.models import T5ForZeroShotClassification
from transformers import T5Tokenizer

# Load the zero-shot T5 classification model and its tokenizer from the same
# checkpoint. NOTE: this rebinds the notebook-wide `model` and `tokenizer`
# names that earlier cells (e.g. check_text/add_prefix) read.
model = T5ForZeroShotClassification.from_pretrained('knowledgator/comprehend_it-multilingual-t5-base')
tokenizer = T5Tokenizer.from_pretrained('knowledgator/comprehend_it-multilingual-t5-base')
# Wrap both in liqfit's zero-shot pipeline.
# NOTE(review): hypothesis_template='{}' presumably passes each candidate
# label through verbatim as the hypothesis - confirm against the liqfit docs.
classifier_comprehend = ZeroShotClassificationPipeline(model=model, tokenizer=tokenizer,
                                                      hypothesis_template = '{}', encoder_decoder = True)


In [None]:
# Smoke test of the comprehend-it pipeline on a simple English sentence with
# three candidate labels, scored as a single-label problem.
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']
classifier_comprehend(sequence_to_classify, candidate_labels, multi_label=False)
# Sample output:
# {'sequence': 'one day I will see the world',
#  'labels': ['travel', 'cooking', 'dancing'],
#  'scores': [0.7350383996963501, 0.1484801471233368, 0.1164814680814743]}

In [None]:
import json
import torch

# Function to process each pair in the JSONL file
def visualize_prompt_response(jsonl_file, max_pairs=5):
    """Print the `classifier_comprehend` prediction for the first records of a JSONL file.

    Every non-blank line must be a JSON object with the keys "id", "text",
    "hypothesis", "choices" and "label", where "label" is the index of the
    correct entry in "choices".

    Args:
        jsonl_file: Path of the JSONL file to read.
        max_pairs: Maximum number of records to show (default 5).
    """
    from itertools import islice  # local import keeps the cell self-contained

    # Explicit UTF-8 so reading does not depend on the platform default.
    with open(jsonl_file, 'r', encoding="utf-8") as file:
        records = (json.loads(line) for line in file if line.strip())
        for pair in islice(records, max_pairs):
            choices = pair["choices"]
            text = pair["text"]
            hypothesis = pair["hypothesis"]

            # Format the prompt with text and hypothesis
            prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

            output = classifier_comprehend(prompt_formatted, choices, multi_label=False)

            # Take the label with the highest score; unlike comparing only
            # scores[0] and scores[1] this also works for a single choice.
            scores = output['scores']
            top_score = max(scores)
            selected_label = output['labels'][scores.index(top_score)]

            # Print the required information ("Accuracy" is the top score).
            print("ID:", pair["id"])
            print("Sequence:", output['sequence'])
            print("Selected Label:", selected_label)
            print("Accuracy:", round(top_score, 3))
            print("Is Selected Label Correct:", selected_label == choices[pair["label"]])
            print()

jsonl_file = 'textual_entailment-task1-train-data.jsonl'
visualize_prompt_response(jsonl_file)

In [None]:
import json

total_correct = 0
total_pairs = 0

# Load the JSONL file and process each pair.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open("textual_entailment-task1-test-data.jsonl", 'r', encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # a blank (e.g. trailing) line is not a record
        pair = json.loads(line)
        text = pair["text"]
        hypothesis = pair["hypothesis"]
        choices = pair["choices"]
        label = pair["label"]  # index of the correct entry in `choices`

        # Format the prompt with text and hypothesis
        prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

        # Evaluate this section's pipeline. The cell previously called
        # `classifier`, i.e. the XLM-R pipeline from the section above.
        # The former float16->float32 conversion loop was dropped: it only
        # acted on tensor values, and the fields read below are used as
        # plain lists.
        output = classifier_comprehend(prompt_formatted, choices, multi_label=False)

        # Take the label with the highest score; unlike comparing only
        # scores[0] and scores[1] this also works for a single choice.
        scores = output['scores']
        selected_label = output['labels'][scores.index(max(scores))]

        # Accumulate statistics
        if selected_label == choices[label]:
            total_correct += 1
        total_pairs += 1

# Calculate overall statistics only if at least one pair was processed
if total_pairs > 0:
    accuracy = total_correct / total_pairs * 100

    # Print statistics
    print("Overall Statistics:")
    print("Total Pairs:", total_pairs)
    print("Total Correct:", total_correct)
    print("Accuracy:", round(accuracy, 2), "%")
else:
    print("No pairs found in the JSONL file.")


In [None]:
# Train the model in order to get better accuracy from the prompts

In [None]:
import warnings
import torch
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score

# Suppress warnings
# NOTE: with no category given this filter hides every warning raised from
# here on, not only the ones produced by this cell.
warnings.filterwarnings("ignore")

# Load the pretrained model and tokenizer
# NOTE: this rebinds the notebook-wide `model` and `tokenizer` names, so
# cells defined earlier will now see this NLI checkpoint instead.
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define your dataset loading and processing functions
def load_dataset(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the ``.jsonl`` file; one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so reading does not depend on the platform default;
    # blank lines (e.g. a trailing newline) are skipped instead of crashing.
    with open(file_path, 'r', encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]

# Define training parameters
epochs = 100
#learning_rate = 2e-5
# NOTE(review): 0.1 is several orders of magnitude above the 2e-5 left
# commented out above - confirm this value is intended for fine-tuning.
learning_rate = 0.1
batch_size = 8  # Reduce batch size
accumulation_steps = 4  # Number of batches whose gradients are summed per optimizer step

# Prepare optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Map label strings to integers; labels that are not in the map (e.g.
# already-integer labels) are passed through unchanged below.
label_map = {'implicato': 0, 'non implicato': 1}

# The training data does not change between epochs, so load it once.
train_dataset = load_dataset("textual_entailment-task1-train-data.jsonl")
num_batches = (len(train_dataset) + batch_size - 1) // batch_size

# Fine-tuning loop
for epoch in range(epochs):
    total_loss = 0.0
    model.train()
    optimizer.zero_grad()
    batch_starts = range(0, len(train_dataset), batch_size)
    for batch_index, batch_start in enumerate(batch_starts, start=1):
        batch = train_dataset[batch_start:batch_start+batch_size]
        texts = [example["text"] for example in batch]
        hypotheses = [example["hypothesis"] for example in batch]
        labels = torch.tensor(
            [label_map.get(example["label"], example["label"]) for example in batch]
        )

        # Tokenize inputs
        inputs = tokenizer(texts, hypotheses, padding=True, truncation=True, return_tensors="pt")

        # Forward pass
        logits = model(**inputs).logits

        # Compute loss
        loss = loss_fn(logits, labels)

        # Backward pass. Scale the loss so the accumulated gradient is the
        # mean, not the sum, over the accumulated batches.
        (loss / accumulation_steps).backward()

        # Gradient accumulation: step every `accumulation_steps` batches and
        # once more for a trailing partial group. The counter is the batch
        # index; the former test on `batch_start + 1` (always odd, since
        # batch_start is a multiple of 8) never matched a multiple of 4.
        if batch_index % accumulation_steps == 0 or batch_index == num_batches:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()

    # Print average loss for the epoch. Each loss value is already a per-batch
    # mean, so average over batches; max() guards an empty dataset.
    print(f"Epoch {epoch+1}, Average Loss: {total_loss / max(num_batches, 1)}")

# Save model and tokenizer
model.save_pretrained("trained_model/")
tokenizer.save_pretrained("trained_model/")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the trained model and tokenizer
model_path = "trained_model/"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

def classifier_comprehend(prompt, choices, multi_label=False):
    """Score every candidate label in ``choices`` against ``prompt``.

    Each ``(prompt, choice)`` pair is fed to the model as a sentence pair and
    the model's entailment class is used as the evidence for that choice.
    Previously the score was the probability of whichever class happened to
    be the arg-max, which cannot be compared across choices, and the
    multi-label branch indexed ``choices`` with model class ids.

    Args:
        prompt: The text to classify.
        choices: Candidate label strings.
        multi_label: When False (default) the scores are normalised across
            the choices and sum to 1. When True each choice gets the
            independent entailment probability of its own pair.

    Returns:
        A list with one dict per choice, in the order of ``choices``, with
        the keys 'sequence' (the prompt), 'label' (the choice) and 'score'
        (a float).
    """
    if not choices:
        return []

    # Find the entailment class in the checkpoint's label mapping.
    # NOTE(review): falls back to index 0 when no label name starts with
    # "entail" - confirm this matches the fine-tuned label order.
    label2id = model.config.label2id or {}
    entailment_id = next(
        (idx for name, idx in label2id.items()
         if str(name).lower().startswith("entail")),
        0,
    )

    # Score all pairs in one batch.
    inputs = tokenizer([prompt] * len(choices), list(choices),
                       padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # Disable gradient calculation
        logits = model(**inputs).logits

    if multi_label:
        # Independent per-choice probability of the entailment class.
        scores = logits.softmax(dim=1)[:, entailment_id]
    else:
        # Exactly one choice is assumed correct: normalise across choices.
        scores = logits[:, entailment_id].softmax(dim=0)

    return [
        {'sequence': prompt, 'label': choice, 'score': score}
        for choice, score in zip(choices, scores.tolist())
    ]



In [None]:
# Function to process each pair in the JSONL file
def visualize_prompt_response(jsonl_file, max_pairs=5):
    """Print the `classifier_comprehend` prediction for the first records of a JSONL file.

    Every non-blank line must be a JSON object with the keys "id", "text",
    "hypothesis", "choices" and "label", where "label" is the index of the
    correct entry in "choices".

    Args:
        jsonl_file: Path of the JSONL file to read.
        max_pairs: Maximum number of records to show (default 5).
    """
    from itertools import islice  # local import keeps the cell self-contained

    # Explicit UTF-8 so reading does not depend on the platform default.
    with open(jsonl_file, 'r', encoding="utf-8") as file:
        records = (json.loads(line) for line in file if line.strip())
        for pair in islice(records, max_pairs):
            choices = pair["choices"]
            text = pair["text"]
            hypothesis = pair["hypothesis"]

            # Format the prompt with text and hypothesis
            prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

            # One result dict per choice; the prediction is the best-scoring
            # one. Previously every choice was printed as the "selected" label.
            outputs = classifier_comprehend(prompt_formatted, choices, multi_label=False)
            if not outputs:
                continue  # nothing to select from
            best = max(outputs, key=lambda result: result['score'])
            selected_label = best['label']

            # Print the required information ("Accuracy" is the top score).
            print("ID:", pair["id"])
            print("Sequence:", prompt_formatted)
            print("Selected Label:", selected_label)
            print("Accuracy:", round(best['score'], 3))
            print("Is Selected Label Correct:", selected_label == choices[pair["label"]])
            print()

jsonl_file = 'textual_entailment-task1-train-data.jsonl'
visualize_prompt_response(jsonl_file)


In [None]:
import json

total_correct = 0
total_pairs = 0

# Load the JSONL file and process each pair.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open("textual_entailment-task1-test-data.jsonl", 'r', encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue  # a blank (e.g. trailing) line is not a record
        pair = json.loads(line)
        text = pair["text"]
        hypothesis = pair["hypothesis"]
        choices = pair["choices"]
        label = pair["label"]  # index of the correct entry in `choices`

        # Format the prompt with text and hypothesis
        prompt_formatted = f"La frase {text} sostiene la frase {hypothesis}?"

        # classifier_comprehend returns one result dict per choice; the
        # prediction is the best-scoring one. The previous inner loop kept
        # whatever came last, i.e. always the final choice.
        outputs = classifier_comprehend(prompt_formatted, choices, multi_label=False)
        selected_label = (
            max(outputs, key=lambda result: result['score'])['label']
            if outputs else None
        )

        # Accumulate statistics
        if selected_label == choices[label]:
            total_correct += 1
        total_pairs += 1

# Calculate overall statistics only if at least one pair was processed
if total_pairs > 0:
    accuracy = total_correct / total_pairs * 100

    # Print statistics
    print("Overall Statistics:")
    print("Total Pairs:", total_pairs)
    print("Total Correct:", total_correct)
    print("Accuracy:", round(accuracy, 2), "%")
else:
    print("No pairs found in the JSONL file.")
