<a href="https://colab.research.google.com/github/eriksali/DNN_2023_NLP/blob/main/NLP_Pre_trained_Transformer_based_Models_reload_small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets 
!pip install apache_beam
!pip install transformers

In [8]:
import torch
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Load the "emotion" dataset
dataset = load_dataset("emotion")

# Split the dataset into training, testing, and validation sets
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# Check the dataset size for each split
print("Total number of examples:", len(dataset))
print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Testing set size:", len(test_data))

# Get the number of items from each class in each split
train_counts = np.unique(train_data["label"], return_counts=True)
val_counts = np.unique(val_data["label"], return_counts=True)
test_counts = np.unique(test_data["label"], return_counts=True)
class_names = dataset["train"].features["label"].names

for i, label_count in enumerate(train_counts[1]):
    print(f"Number of items from {class_names[i]} class in training set:", label_count)
    print(f"Number of items from {class_names[i]} class in testing set:", test_counts[1][i])
    print(f"Number of items from {class_names[i]} class in validation set:", val_counts[1][i])

# Calculate the average length of text in each split
train_lengths = [len(text.split()) for text in train_data["text"]]
val_lengths = [len(text.split()) for text in val_data["text"]]
test_lengths = [len(text.split()) for text in test_data["text"]]
print("Average length of text in the training set:", np.mean(train_lengths))
print("Average length of text in the validation set:", np.mean(val_lengths))
print("Average length of text in the testing set:", np.mean(test_lengths))

# Extract the input features (word embeddings) using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data["text"])
X_val = vectorizer.transform(val_data["text"])
X_test = vectorizer.transform(test_data["text"])
y_train = train_data["label"]
y_val = val_data["label"]
y_test = test_data["label"]

# Check the vocabulary size and the number of examples for each split
print("Vocabulary size:", X_train.shape[1])
print("Number of examples in the training set:", X_train.shape[0])
print("Number of examples in the validation set:", X_val.shape[0])
print("Number of examples in the testing set:", X_test.shape[0])




  0%|          | 0/3 [00:00<?, ?it/s]

Total number of examples: 3
Training set size: 16000
Validation set size: 2000
Testing set size: 2000
Number of items from sadness class in training set: 4666
Number of items from sadness class in testing set: 581
Number of items from sadness class in validation set: 550
Number of items from joy class in training set: 5362
Number of items from joy class in testing set: 695
Number of items from joy class in validation set: 704
Number of items from love class in training set: 1304
Number of items from love class in testing set: 159
Number of items from love class in validation set: 178
Number of items from anger class in training set: 2159
Number of items from anger class in testing set: 275
Number of items from anger class in validation set: 275
Number of items from fear class in training set: 1937
Number of items from fear class in testing set: 224
Number of items from fear class in validation set: 212
Number of items from surprise class in training set: 572
Number of items from surpri

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset("emotion")

# Define the emotion labels
emotions = dataset["train"].features["label"].names

# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(5000)),
    "test": dataset["test"].shuffle().select(range(1000))
})

# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotions))

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(emotions))

# Tokenize the dataset
'''def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)'''

def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True, max_length=64)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)


# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")




  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.358048
2,0.666000,0.250329
3,0.666000,0.263554




Epoch,Training Loss,Validation Loss
1,No log,1.55841
2,1.573200,1.406392
3,1.573200,1.292105


BERT evaluation results: {'eval_loss': 0.26355400681495667, 'eval_runtime': 218.3504, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 0.289, 'epoch': 3.0}
DistilBERT evaluation results: {'eval_loss': 1.292104721069336, 'eval_runtime': 120.0263, 'eval_samples_per_second': 8.332, 'eval_steps_per_second': 0.525, 'epoch': 3.0}


In [None]:
# Define output directory
output_dir = './emotion_classification_bert/'

import os
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save model to output directory
model_bert.save_pretrained(output_dir)

# Save tokenizer to output directory
tokenizer_bert.save_pretrained(output_dir)

!zip -r emotion_classification_bert.zip emotion_classification_bert

# Define output directory
output_dir = './emotion_classification_distilbert/'

import os
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save model to output directory
model_distilbert.save_pretrained(output_dir)

# Save tokenizer to output directory
tokenizer_distilbert.save_pretrained(output_dir)

!zip -r emotion_classification_distilbert.zip emotion_classification_distilbert



updating: emotion_classification_bert/ (stored 0%)
updating: emotion_classification_bert/pytorch_model.bin (deflated 7%)
updating: emotion_classification_bert/vocab.txt (deflated 49%)
updating: emotion_classification_bert/config.json (deflated 54%)
updating: emotion_classification_bert/tokenizer.json (deflated 70%)
updating: emotion_classification_bert/special_tokens_map.json (deflated 42%)
updating: emotion_classification_bert/tokenizer_config.json (deflated 46%)
updating: emotion_classification_distilbert/ (stored 0%)
updating: emotion_classification_distilbert/pytorch_model.bin (deflated 8%)
updating: emotion_classification_distilbert/config.json (deflated 52%)
  adding: emotion_classification_distilbert/vocab.txt (deflated 53%)
  adding: emotion_classification_distilbert/tokenizer.json (deflated 71%)
  adding: emotion_classification_distilbert/special_tokens_map.json (deflated 42%)
  adding: emotion_classification_distilbert/tokenizer_config.json (deflated 42%)


In [None]:
!unzip -uq "/content/emotion_classification_bert.zip" -d "/content/" 

from transformers import BertForSequenceClassification, BertTokenizer
# Load saved model
model_bert = BertForSequenceClassification.from_pretrained('emotion_classification_bert')

# Load saved tokenizer
tokenizer_bert = BertTokenizer.from_pretrained('emotion_classification_bert')

sentence = "Hello, how are you?"
inputs = tokenizer_bert(sentence, return_tensors='pt')
outputs = model_bert(**inputs)
print(outputs)


!unzip -uq "/content/emotion_classification_distilbert.zip" -d "/content/" 

from transformers import DistilBertTokenizer, DistilBertModel
# Load saved model
model_distilbert = DistilBertModel.from_pretrained('emotion_classification_distilbert')

# Load saved tokenizer
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('emotion_classification_distilbert')

sentence = "Hello, how are you?"
inputs = tokenizer_distilbert(sentence, return_tensors='pt')
outputs = model_distilbert(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.4119,  2.9381, -0.9873,  0.1655, -0.1946, -0.9263]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


Some weights of the model checkpoint at emotion_classification_distilbert were not used when initializing DistilBertModel: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModelOutput(last_hidden_state=tensor([[[-0.4762, -0.8302,  0.0108,  ..., -0.0368,  1.4159,  0.0619],
         [ 0.0206, -0.4679,  0.2187,  ...,  0.2304,  1.4955, -0.3915],
         [-0.4497,  0.0608,  0.5119,  ..., -0.3182,  0.9448, -0.1339],
         ...,
         [-0.0048, -1.0545,  0.6014,  ...,  0.2451,  1.2031, -0.5100],
         [-0.2260, -0.8109, -0.1103,  ..., -0.2533,  1.4551, -0.1857],
         [ 0.7890,  0.1797, -0.5541,  ...,  0.3301, -0.0665, -0.5065]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)


In [None]:
# zero-shot-classification

from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification")

# Define the test data
test_data = [
    {"text": "That's so gay."},
    {"text": "I love this song!"},
    {"text": "This pizza is terrible."},
    {"text": "I can't believe she said that."},
]

# Define the candidate labels
candidate_labels = ["offensive", "non-offensive"]

# Evaluate the model on the test data
true_labels = ["offensive", "non-offensive", "offensive", "non-offensive"]
predicted_labels = [classifier(example["text"], candidate_labels)["labels"][0] for example in test_data]

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, pos_label="offensive")
recall = recall_score(true_labels, predicted_labels, pos_label="offensive")
f1 = f1_score(true_labels, predicted_labels, pos_label="offensive")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")


No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Accuracy: 0.75
Precision: 0.6666666666666666
Recall: 1.0
F1 score: 0.8


In [10]:
!pip install sentencepiece
!pip install sacremoses

import torch
from transformers import pipeline, AutoTokenizer

# Load the pre-trained NLI model and tokenizer
model_name = "textattack/roberta-base-MNLI"
model = torch.hub.load('huggingface/transformers', model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a prompt for zero-shot classification using NLI
def classify_emotion(text):
    prompt = "Is this text expressing {emotion}? Answer yes or no.\n{text}"
    emotions = ["anger", "joy", "love", "sadness", "surprise"]
    encoded_inputs = tokenizer(prompt=prompt.format(emotion=emotions[0], text=text), padding=True, truncation=True, return_tensors="pt")
    logits = model(encoded_inputs['input_ids'], attention_mask=encoded_inputs['attention_mask'])[0]
    probabilities = logits.softmax(dim=1)
    prediction = "yes" if probabilities[0][1] > probabilities[0][0] else "no"
    return prediction


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895259 sha256=57

Using cache found in /root/.cache/torch/hub/huggingface_transformers_main


RuntimeError: ignored

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "textattack/roberta-base-MNLI"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

test_data = [
    {"text": "I am feeling happy today", "label": "Positive"},
    {"text": "I am not feeling well", "label": "Negative"},
    {"text": "The movie was awesome", "label": "Positive"},
    {"text": "I hate it when it rains", "label": "Negative"},
    {"text": "The food was delicious", "label": "Positive"},
]

inputs = tokenizer(
    [sample["text"] for sample in test_data],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
)

outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)

from sklearn.metrics import classification_report


labels = [sample["label"] for sample in test_data]
label_map = {"Positive": 0, "Negative": 1}
label_indices = [label_map[label] for label in labels]
print(classification_report(label_indices, predictions, digits=4))




##################################################################

from transformers import BertForSequenceClassification, BertTokenizer
# Load saved model
model = BertForSequenceClassification.from_pretrained('emotion_classification_bert')

# Load saved tokenizer
tokenizer = BertTokenizer.from_pretrained('emotion_classification_bert')

'''##model_name = "mrm8488/emotion_classification_bert"
model_name = "emotion_classification_bert"
tokenizer = AutoTokenizer.from_prepared(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)'''

## Prepare the input for the model: We need to tokenize the input text and convert it to a format that can be input to the fine-tuned model.

inputs = tokenizer(
    [sample["text"] for sample in test_data],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
)

outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)

from sklearn.metrics import classification_report

labels = [sample["label"] for sample in test_data]
label_map = {"Positive": 0, "Negative": 1}
label_indices = [label_map[label] for label in labels]
print(classification_report(label_indices, predictions, digits=4))



Some weights of the model checkpoint at textattack/roberta-base-MNLI were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         3
           1     0.4000    1.0000    0.5714         2

    accuracy                         0.4000         5
   macro avg     0.2000    0.5000    0.2857         5
weighted avg     0.1600    0.4000    0.2286         5

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         3
           1     0.2500    0.5000    0.3333         2

    accuracy                         0.2000         5
   macro avg     0.1250    0.2500    0.1667         5
weighted avg     0.1000    0.2000    0.1333         5



In [None]:
!pip install datasets
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Load the "emotion" dataset
dataset = load_dataset("emotion")

# Split the dataset into training and test sets
train_data = dataset["train"]["text"]
train_labels = dataset["train"]["label"]
test_data = dataset["test"]["text"]
test_labels = dataset["test"]["label"]

# Vectorize the text using TfidfVectorizer
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

# Train a logistic regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_vectors, train_labels)

# Make predictions on the test set
test_preds = classifier.predict(test_vectors)

# Compute accuracy and F1-score
accuracy = accuracy_score(test_labels, test_preds)
f1 = f1_score(test_labels, test_preds, average="weighted")

'''print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
'''
from sklearn.metrics import classification_report
print(classification_report(test_labels, test_preds, digits=4))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]



Downloading and preparing dataset emotion/split to /root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.8893    0.9260    0.9073       581
           1     0.8310    0.9554    0.8889       695
           2     0.8230    0.5849    0.6838       159
           3     0.9024    0.8073    0.8522       275
           4     0.8700    0.7768    0.8208       224
           5     0.8378    0.4697    0.6019        66

    accuracy                         0.8610      2000
   macro avg     0.8589    0.7533    0.7925      2000
weighted avg     0.8617    0.8610    0.8558      2000



In [None]:
'''
The code performs text classification on the "emotion" dataset using different machine learning models and evaluates their performance using accuracy, F1 score, and classification report.

The code begins by installing the "datasets" package and importing necessary modules such as numpy, sklearn, and datasets.

It then loads the "emotion" dataset using the load_dataset function from the datasets module and splits it into training and testing sets.

Afterwards, it extracts the input features from the text using the CountVectorizer function from the sklearn.feature_extraction.text module and trains a Multinomial Naive Bayes classifier using the training set.

The trained model is then used to predict the labels for the test set, and the accuracy and F1 score are calculated using the accuracy_score and f1_score functions from the sklearn.metrics module.

The code then prints a classification report using the classification_report function from the sklearn.metrics module to display precision, recall, and F1 score for each class, as well as the overall accuracy.

Next, the code defines a random baseline model using the DummyClassifier function from the sklearn.dummy module and evaluates its performance using the same metrics as before.

Finally, the code defines a majority class baseline model that predicts the most frequent class in the training set for all test samples and evaluates its performance using the same metrics.

In summary, the code demonstrates how to load a dataset, extract features from text, train and evaluate different machine learning models for text classification, and compare their performance using different evaluation metrics.

'''


!pip install datasets

import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Load the "emotion" dataset
dataset = load_dataset("emotion")

# Split the dataset into training and testing sets
train_data = dataset["train"]
test_data = dataset["test"]

# Extract the input features (word embeddings) using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data["text"])
X_test = vectorizer.transform(test_data["text"])
y_train = train_data["label"]
y_test = test_data["label"]

# Train a bag-of-words classifier using Multinomial Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model using accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=4))

################################################################
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Random baseline
random_model = DummyClassifier(strategy="uniform", random_state=42)
random_model.fit(X_train, y_train)
random_pred = random_model.predict(X_test)

# Evaluate performance of random model
random_accuracy = accuracy_score(y_test, random_pred)
random_precision = precision_score(y_test, random_pred, average='macro')
random_recall = recall_score(y_test, random_pred, average='macro')
random_f1 = f1_score(y_test, random_pred, average='macro')

print(classification_report(y_test, random_pred, digits=4))

#################################################################
import numpy as np

# get the majority class in the training set
majority_class = np.argmax(np.bincount(y_train))

# predict the majority class for all test samples
y_pred = np.full((len(y_test),), fill_value=majority_class)

# evaluate the performance of the baseline model
accuracy = np.mean(y_pred == y_test)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1_score = f1_score(y_test, y_pred, average='macro')

print(classification_report(y_test, y_pred, digits=4))






Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




  0%|          | 0/3 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.7429    0.9398    0.8298       581
           1     0.7398    0.9698    0.8394       695
           2     0.9474    0.2264    0.3655       159
           3     0.9176    0.5673    0.7011       275
           4     0.8151    0.5312    0.6432       224
           5     0.0000    0.0000    0.0000        66

    accuracy                         0.7655      2000
   macro avg     0.6938    0.5391    0.5632      2000
weighted avg     0.7657    0.7655    0.7302      2000

              precision    recall  f1-score   support

           0     0.2812    0.1704    0.2122       581
           1     0.3343    0.1583    0.2148       695
           2     0.0706    0.1447    0.0948       159
           3     0.1503    0.1782    0.1631       275
           4     0.0712    0.1071    0.0856       224
           5     0.0515    0.2576    0.0859        66

    accuracy                         0.1610      2000
   macro avg     0.1599

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# On a smaller model 

import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load the emotion dataset
dataset = load_dataset("emotion")

# Load the smaller model and tokenizer
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Instantiate the zero-shot classification pipeline with the smaller model
classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
)

# Example prompt for offensive classification
prompt = "Is this text offensive? Answer yes or no.\n"

# Use a few examples from the dataset to create classification prompts
text = dataset["train"]["text"][:5]
labels = dataset["train"]["label"][:5]
prompts = [prompt + t for t in text]

# Use the zero-shot classifier on the test set
test_text = dataset["test"]["text"]
test_labels = dataset["test"]["label"]
zero_shot_preds = classifier(prompts, test_text)

# Evaluate zero-shot classification
correct = 0
total = len(test_labels)
for i, pred in enumerate(zero_shot_preds):
    label_pred = pred["labels"][0]
    if label_pred == labels[i]:
        correct += 1

from sklearn.metrics import classification_report
print(classification_report(test_labels, label_pred, digits=4))

print(prompts)




  0%|          | 0/3 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
'''
To use word embeddings as input features for a bag-of-words classifier for the "emotion" dataset, we first need to load the dataset using the load_dataset() function from the Hugging Face datasets library:
'''

from datasets import load_dataset

dataset = load_dataset('emotion')
X = dataset['train']['text']
y = dataset['train']['label']

'''
Next, we need to preprocess the text data and convert it into numerical form using word embeddings. We can use the TransformersBaseTokenizer and TransformersWordEmbeddings classes from the nlp library to tokenize the text and convert it into word embeddings:
'''

'''!pip install nlp
from nlp import TransformersBaseTokenizer, TransformersWordEmbeddings

# Load the tokenizer and embeddings model
tokenizer = TransformersBaseTokenizer('bert-base-cased')
embeddings_model = TransformersWordEmbeddings('bert-base-cased')'''

from transformers import BertForSequenceClassification, BertTokenizer
# Load saved model
embeddings_model = BertForSequenceClassification.from_pretrained('emotion_classification_bert')

# Load saved tokenizer
tokenizer = BertTokenizer.from_pretrained('emotion_classification_bert')

# Tokenize and embed the text data
X_embeddings = embeddings_model(tokenizer(X))

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_train, y_train)

from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

'''
To create a baseline model, we can use the DummyClassifier class from scikit-learn. We can create two types of baseline models: random and majority/target-class.

To create a random baseline model, we can set the strategy parameter of the DummyClassifier to 'uniform':
'''

from sklearn.dummy import DummyClassifier

# Random baseline
dummy_random = DummyClassifier(strategy='uniform')
dummy_random.fit(X_train, y_train)
y_pred_random = dummy_random.predict(X_test)

print('Random baseline accuracy:', accuracy_score(y_test, y_pred_random))

'''
To create a majority/target-class baseline model, we can set the strategy parameter of the DummyClassifier to 'most_frequent':
'''
# Majority/target-class baseline
dummy_majority = DummyClassifier(strategy='most_frequent')
dummy_majority.fit(X_train, y_train)
y_pred_majority = dummy_majority.predict(X_test)

print('Majority/target-class baseline accuracy:', accuracy_score(y_test, y_pred_majority))




  0%|          | 0/3 [00:00<?, ?it/s]

AttributeError: ignored

In [None]:
inputs = tokenizer_bert('Hello, how are you?', return_tensors='pt')
outputs = model_bert(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.4119,  2.9381, -0.9873,  0.1655, -0.1946, -0.9263]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the dataset
dataset = datasets.load_dataset('go_emotions', split='train[:80%]')

# Load the tokenizer and encode the dataset
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded_dataset = dataset.map(lambda examples: tokenizer(examples['text'], padding=True, truncation=True), batched=True)

# Load the pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=28)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy='epoch',     # evaluate every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    num_train_epochs=3,              # total number of training epochs
    weight_decay=0.01,               # weight decay
    push_to_hub=False,               # whether to push the fine-tuned model to the Hugging Face model hub
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,     # load the best model at the end of training
    metric_for_best_model="accuracy",
)

# Define the Trainer object and fine-tune the BERT model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

# Save the fine-tuned BERT model
trainer.save_model('./models/bert_emotion_classification')

# Fine-tune the DistilBERT model
distilbert_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=28)
distilbert_trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=encoded_dataset,
    compute_metrics=compute_metrics,
)
distilbert_trainer.train()

# Save the fine-tuned DistilBERT model
distilbert_trainer.save_model('./models/distilbert_emotion_classification')

# Load the test dataset and encode it
test_dataset = datasets.load_dataset('go_emotions', split='train[80%:]')
encoded_test_dataset = test_dataset.map(lambda examples: tokenizer(examples['text'], padding=True, truncation=True), batched=True)

# Define the function to compute the metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = (predictions == labels).mean()
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Evaluate the fine-tuned BERT model on the test dataset
bert_eval_result = trainer.evaluate(encoded_test_dataset)
print("BERT Evaluation Result:")
for key, value in bert_eval_result.items():
    print(f"{key}: {value:.4f}")

# Evaluate the fine-tuned DistilBERT model on the test dataset
distilbert_eval_result = distilbert_trainer.evaluate(encoded_test_dataset)
print("DistilBERT Evaluation Result:")
for key, value in distilbert_eval_result.items():
  print(f"{key}: {value:.4f}")


In [None]:
# Define output directory
output_dir = './emotion_classification_bert/'

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save model to output directory
model_bert.save_pretrained(output_dir)

# Load saved model
model_bert = BertForSequenceClassification.from_pretrained(output_dir)

# Save tokenizer to output directory
tokenizer.save_pretrained(output_dir)
# Load saved tokenizer
tokenizer = BertTokenizer.from_pretrained(output_dir)


In [2]:
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load the emotion dataset
dataset = load_dataset("emotion")

# Instantiate the zero-shot classification pipeline with a smaller model
# model_name = "distilbert-base-uncased"
model_name = "textattack/roberta-base-MNLI"
classifier = pipeline(
    "zero-shot-classification",
    model=model_name,
    tokenizer=model_name,
)

# Example prompt for offensive classification
prompt = "Is this text offensive? Answer yes or no.\n"

# Use a few examples from the dataset to create classification prompts
text = dataset["train"]["text"][:5]
labels = dataset["train"]["label"][:5]
prompts = [prompt + t for t in text]

# Use the zero-shot classifier on the test set
test_text = dataset["test"]["text"]
test_labels = dataset["test"]["label"]
zero_shot_preds = classifier(prompts, test_text)

# Evaluate zero-shot classification
correct = 0
total = len(test_labels)
for i, pred in enumerate(zero_shot_preds):
    label_pred = pred["labels"][0]
    if label_pred == labels[i]:
        correct += 1

'''accuracy = correct / total
print(f"Zero-shot classification accuracy: {accuracy:.4f}")'''

from sklearn.metrics import classification_report
print(classification_report(test_labels, label_pred, digits=4))

print(prompts)




  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-MNLI were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


KeyboardInterrupt: ignored

In [None]:
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load the emotion dataset
dataset = load_dataset("emotion")

# Instantiate the zero-shot classification pipeline
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/bart-large-mnli-yahoo-answers",
    tokenizer="joeddav/bart-large-mnli-yahoo-answers",
)

# Example prompt for offensive classification
prompt = "Is this text offensive? Answer yes or no.\n"

# Use a few examples from the dataset to create classification prompts
text = dataset["train"]["text"][:5]
labels = dataset["train"]["label"][:5]
prompts = [prompt + t for t in text]

# Use the zero-shot classifier on the test set
test_text = dataset["test"]["text"]
test_labels = dataset["test"]["label"]
zero_shot_preds = classifier(prompts, test_text)

# Evaluate zero-shot classification
correct = 0
total = len(test_labels)
for i, pred in enumerate(zero_shot_preds):
    label_pred = pred["labels"][0]
    if label_pred == labels[i]:
        correct += 1

'''accuracy = correct / total
print(f"Zero-shot classification accuracy: {accuracy:.4f}")'''

from sklearn.metrics import classification_report
print(classification_report(test_labels, label_pred, digits=4))

print(prompts)


In [None]:
from sklearn.metrics import classification_report
print(classification_report(test_labels, label_pred, digits=4))

print(prompts)

In [None]:
from transformers import pipeline

zero_shot_classifier = pipeline("zero-shot-classification", model="EleutherAI/gpt-neo-2.7B")

prompts = ["Is this text about anger, fear, joy, love, sadness, or surprise?",
           "What is the emotion expressed in this text?",
           "Can you classify the emotion in this text?",
           "Which of these emotions best describes the sentiment in this text?"]

zero_shot_results = []
for prompt in prompts:
    zero_shot_results.append(zero_shot_classifier(test_data["text"], candidate_labels=["anger", "fear", "joy", "love", "sadness", "surprise"], prompt=prompt))

print("Zero-shot classification results:", zero_shot_results)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/10.7G [00:00<?, ?B/s]