<a href="https://colab.research.google.com/github/chethana613/LLM-Readings-and-Assignments/blob/main/LLMAssignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Zero Shot # **Bloom 560 and Bloomz 560** With 2 Different Prompts

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
#model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "This tweet expresses joy and positivity.",
    "negative": "The following tweet conveys sadness and disappointment.",
    "neutral": "The sentiment of this tweet is neutral and lacks strong emotions."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/155k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1839 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/870 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [22:34<00:00,  1.56s/it]

Zero-shot Accuracy: 0.3333333333333333
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       290
     neutral       0.00      0.00      0.00       290
    positive       0.33      1.00      0.50       290

    accuracy                           0.33       870
   macro avg       0.11      0.33      0.17       870
weighted avg       0.11      0.33      0.17       870




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
#model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "The sentiment of this tweet is uplifting or joy or positivity or cheerful or happiness or positivity or convey optimism and delight ",
    "negative": "The following tweet conveys gloomy and disheartening or sorrow and dismay or disappointment and frustration.",
    "neutral": "The sentiment of this tweet is neutral tone without strong emotions or  balanced and impartial or  lacks emotional intensity and remains neutral."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [33:02<00:00,  2.28s/it]

Zero-shot Accuracy: 0.32988505747126434
Zero-shot Precision: 0.3091190679170219
Zero-shot Recall: 0.32988505747126434
Zero-shot F1-score: 0.19253062839154803
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.48      0.04      0.08       290
     neutral       0.12      0.01      0.01       290
    positive       0.33      0.94      0.49       290

    accuracy                           0.33       870
   macro avg       0.31      0.33      0.19       870
weighted avg       0.31      0.33      0.19       870






In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
model_name = "bigscience/bloomz-560m"
#model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "This tweet expresses joy and positivity.",
    "negative": "The following tweet conveys sadness and disappointment.",
    "neutral": "The sentiment of this tweet is neutral and lacks strong emotions."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
model_name = "bigscience/bloomz-560m"
#model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "The sentiment of this tweet is uplifting or joy or positivity or cheerful or happiness or positivity or convey optimism and delight ",
    "negative": "The following tweet conveys gloomy and disheartening or sorrow and dismay or disappointment and frustration.",
    "neutral": "The sentiment of this tweet is neutral tone without strong emotions or  balanced and impartial or  lacks emotional intensity and remains neutral."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/155k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1839 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/870 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [32:26<00:00,  2.24s/it]

Zero-shot Accuracy: 0.3333333333333333
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       290
     neutral       0.00      0.00      0.00       290
    positive       0.33      1.00      0.50       290

    accuracy                           0.33       870
   macro avg       0.11      0.33      0.17       870
weighted avg       0.11      0.33      0.17       870




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Zero Shot # **Bloom 1b1 and Bloomz 1b1** With 2 Different Prompts

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
#model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "This tweet expresses joy and positivity.",
    "negative": "The following tweet conveys sadness and disappointment.",
    "neutral": "The sentiment of this tweet is neutral and lacks strong emotions."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)


model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-1b1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [58:28<00:00,  4.03s/it]

Zero-shot Accuracy: 0.32068965517241377
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.25      0.07      0.11       290
     neutral       0.50      0.00      0.01       290
    positive       0.33      0.89      0.48       290

    accuracy                           0.32       870
   macro avg       0.36      0.32      0.20       870
weighted avg       0.36      0.32      0.20       870






In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "This tweet expresses joy and positivity.",
    "negative": "The following tweet conveys sadness and disappointment.",
    "neutral": "The sentiment of this tweet is neutral and lacks strong emotions."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/155k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1839 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/870 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-1b1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [1:02:16<00:00,  4.30s/it]

Zero-shot Accuracy: 0.33448275862068966
Zero-shot Precision: 0.3656565656565657
Zero-shot Recall: 0.33448275862068966
Zero-shot F1-score: 0.18339062293898234
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.36      0.01      0.03       290
     neutral       0.40      0.01      0.03       290
    positive       0.33      0.98      0.50       290

    accuracy                           0.33       870
   macro avg       0.37      0.33      0.18       870
weighted avg       0.37      0.33      0.18       870






In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "This tweet expresses joy and positivity.",
    "negative": "The following tweet conveys sadness and disappointment.",
    "neutral": "The sentiment of this tweet is neutral and lacks strong emotions."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-1b1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [57:42<00:00,  3.98s/it]

Zero-shot Accuracy: 0.36666666666666664
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.18      0.02      0.04       290
     neutral       0.36      0.27      0.31       290
    positive       0.38      0.81      0.51       290

    accuracy                           0.37       870
   macro avg       0.31      0.37      0.29       870
weighted avg       0.31      0.37      0.29       870






In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "The sentiment of this tweet is uplifting or joy or positivity or cheerful or happiness or positivity or convey optimism and delight ",
    "negative": "The following tweet conveys gloomy and disheartening or sorrow and dismay or disappointment and frustration.",
    "neutral": "The sentiment of this tweet is neutral tone without strong emotions or  balanced and impartial or  lacks emotional intensity and remains neutral."
}

zero_shot_predictions = []
true_labels = []

# Wrap the loop with tqdm
for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-1b1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [1:20:23<00:00,  5.54s/it]

Zero-shot Accuracy: 0.3333333333333333
Zero-shot Precision: 0.1111111111111111
Zero-shot Recall: 0.3333333333333333
Zero-shot F1-score: 0.16666666666666666
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       290
     neutral       0.00      0.00      0.00       290
    positive       0.33      1.00      0.50       290

    accuracy                           0.33       870
   macro avg       0.11      0.33      0.17       870
weighted avg       0.11      0.33      0.17       870




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Zero Shot # **Bloom 1b7 and Bloomz 1b7** With 2 Different Prompts

In [None]:
# model_name = "bigscience/bloomz-1b7"

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm


dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
#model_name = "bigscience/bloomz-1b1"
model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "This tweet expresses joy and positivity.",
    "negative": "The following tweet conveys sadness and disappointment.",
    "neutral": "The sentiment of this tweet is neutral and lacks strong emotions."
}

zero_shot_predictions = []
true_labels = []

for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-1b7 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [1:55:04<00:00,  7.94s/it]

Zero-shot Accuracy: 0.3333333333333333
Zero-shot Precision: 0.1111111111111111
Zero-shot Recall: 0.3333333333333333
Zero-shot F1-score: 0.16666666666666666
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       290
     neutral       0.00      0.00      0.00       290
    positive       0.33      1.00      0.50       290

    accuracy                           0.33       870
   macro avg       0.11      0.33      0.17       870
weighted avg       0.11      0.33      0.17       870




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Blog Style Prompting**

In [4]:
# model_name = "bigscience/bloomz-1b7"


from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report,precision_recall_fscore_support
import torch
from collections import defaultdict
from tqdm import tqdm


dataset = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "english", split='test')

#model_name = "bigscience/bloom-560m"
#model_name = "bigscience/bloom-1b1"
#model_name = "bigscience/bloom-1b7"
#model_name = "bigscience/bloomz-560m"
model_name = "bigscience/bloomz-1b1"
#model_name = "bigscience/bloomz-1b7"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ["negative", "neutral", "positive"]

prompts = {
    "positive": "I just read a tweet that radiates pure joy and positivity. It's truly heartwarming!",
    "negative": "Came across a tweet that's filled with sadness and disappointment. It's really disheartening.",
    "neutral": "Stumbled upon a tweet that seems quite neutral, lacking strong emotions. Just an observation."
}


zero_shot_predictions = []
true_labels = []

for text, label_id in tqdm(zip(dataset['text'], dataset['label']), total=len(dataset)):
    true_label = labels[label_id]
    true_labels.append(true_label)

    prompt_votes = defaultdict(int)
    for prompt_label, prompt in prompts.items():
        inputs = tokenizer(prompt + " " + text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)
        predicted_label_idx = torch.argmax(outputs.logits, dim=1).item()
        prompt_votes[prompt_label] += predicted_label_idx

    majority_prompt = max(prompt_votes, key=prompt_votes.get)
    zero_shot_predictions.append(majority_prompt)

zero_shot_accuracy = accuracy_score(true_labels, zero_shot_predictions)
zero_shot_metrics = precision_recall_fscore_support(true_labels, zero_shot_predictions, average='weighted')
zero_shot_classification_report = classification_report(true_labels, zero_shot_predictions, target_names=labels)

print(f"Zero-shot Accuracy: {zero_shot_accuracy}")
print(f"Zero-shot Precision: {zero_shot_metrics[0]}")
print(f"Zero-shot Recall: {zero_shot_metrics[1]}")
print(f"Zero-shot F1-score: {zero_shot_metrics[2]}")
print("Zero-shot Classification Report:")
print(zero_shot_classification_report)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/155k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1839 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/870 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-1b1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/870 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 870/870 [59:53<00:00,  4.13s/it]

Zero-shot Accuracy: 0.332183908045977
Zero-shot Precision: 0.110855389336402
Zero-shot Recall: 0.332183908045977
Zero-shot F1-score: 0.16623526028185215
Zero-shot Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       290
     neutral       0.00      0.00      0.00       290
    positive       0.33      1.00      0.50       290

    accuracy                           0.33       870
   macro avg       0.11      0.33      0.17       870
weighted avg       0.11      0.33      0.17       870




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
