<a href="https://colab.research.google.com/github/eriksali/DNN_2023_NLP/blob/main/NLP_Pre_trained_Transformer_based_Models_reload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets 

In [None]:
!pip install transformers

In [None]:
!pip install apache_beam

In [18]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset("emotion")

# Define the emotion labels
emotions = dataset["train"].features["label"].names

# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(5000)),
    "test": dataset["test"].shuffle().select(range(1000))
})

# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotions))

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(emotions))

# Tokenize the dataset
'''def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)'''

def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True, max_length=64)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)


# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")




  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.358048
2,0.666000,0.250329
3,0.666000,0.263554




Epoch,Training Loss,Validation Loss
1,No log,1.55841
2,1.573200,1.406392
3,1.573200,1.292105


BERT evaluation results: {'eval_loss': 0.26355400681495667, 'eval_runtime': 218.3504, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 0.289, 'epoch': 3.0}
DistilBERT evaluation results: {'eval_loss': 1.292104721069336, 'eval_runtime': 120.0263, 'eval_samples_per_second': 8.332, 'eval_steps_per_second': 0.525, 'epoch': 3.0}


In [21]:


# Save the fine-tuned BERT model
bert_model_path = "./bert_emotion_classification"
model_bert.save_pretrained(bert_model_path)

# Save the fine-tuned DistilBERT model
distilbert_model_path = "./distilbert_emotion_classification"
model_distilbert.save_pretrained(distilbert_model_path)


In [30]:
# Define output directory
output_dir = './emotion_classification_bert/'

import os
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save model to output directory
model_bert.save_pretrained(output_dir)

# Save tokenizer to output directory
tokenizer_bert.save_pretrained(output_dir)

!zip -r emotion_classification_bert.zip emotion_classification_bert

# Define output directory
output_dir = './emotion_classification_distilbert/'

import os
# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save model to output directory
model_distilbert.save_pretrained(output_dir)

# Save tokenizer to output directory
tokenizer_distilbert.save_pretrained(output_dir)

!zip -r emotion_classification_distilbert.zip emotion_classification_distilbert



updating: emotion_classification_bert/ (stored 0%)
updating: emotion_classification_bert/pytorch_model.bin (deflated 7%)
updating: emotion_classification_bert/vocab.txt (deflated 49%)
updating: emotion_classification_bert/config.json (deflated 54%)
updating: emotion_classification_bert/tokenizer.json (deflated 70%)
updating: emotion_classification_bert/special_tokens_map.json (deflated 42%)
updating: emotion_classification_bert/tokenizer_config.json (deflated 46%)
updating: emotion_classification_distilbert/ (stored 0%)
updating: emotion_classification_distilbert/pytorch_model.bin (deflated 8%)
updating: emotion_classification_distilbert/config.json (deflated 52%)
  adding: emotion_classification_distilbert/vocab.txt (deflated 53%)
  adding: emotion_classification_distilbert/tokenizer.json (deflated 71%)
  adding: emotion_classification_distilbert/special_tokens_map.json (deflated 42%)
  adding: emotion_classification_distilbert/tokenizer_config.json (deflated 42%)


In [34]:
!unzip -uq "/content/emotion_classification_bert.zip" -d "/content/" 

from transformers import BertForSequenceClassification, BertTokenizer
# Load saved model
model_bert = BertForSequenceClassification.from_pretrained('emotion_classification_bert')

# Load saved tokenizer
tokenizer_bert = BertTokenizer.from_pretrained('emotion_classification_bert')

sentence = "Hello, how are you?"
inputs = tokenizer_bert(sentence, return_tensors='pt')
outputs = model_bert(**inputs)
print(outputs)


!unzip -uq "/content/emotion_classification_distilbert.zip" -d "/content/" 

from transformers import DistilBertTokenizer, DistilBertModel
# Load saved model
model_distilbert = DistilBertModel.from_pretrained('emotion_classification_distilbert')

# Load saved tokenizer
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('emotion_classification_distilbert')

sentence = "Hello, how are you?"
inputs = tokenizer_distilbert(sentence, return_tensors='pt')
outputs = model_distilbert(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.4119,  2.9381, -0.9873,  0.1655, -0.1946, -0.9263]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


Some weights of the model checkpoint at emotion_classification_distilbert were not used when initializing DistilBertModel: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModelOutput(last_hidden_state=tensor([[[-0.4762, -0.8302,  0.0108,  ..., -0.0368,  1.4159,  0.0619],
         [ 0.0206, -0.4679,  0.2187,  ...,  0.2304,  1.4955, -0.3915],
         [-0.4497,  0.0608,  0.5119,  ..., -0.3182,  0.9448, -0.1339],
         ...,
         [-0.0048, -1.0545,  0.6014,  ...,  0.2451,  1.2031, -0.5100],
         [-0.2260, -0.8109, -0.1103,  ..., -0.2533,  1.4551, -0.1857],
         [ 0.7890,  0.1797, -0.5541,  ...,  0.3301, -0.0665, -0.5065]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)


In [28]:
inputs = tokenizer_bert('Hello, how are you?', return_tensors='pt')
outputs = model_bert(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.4119,  2.9381, -0.9873,  0.1655, -0.1946, -0.9263]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [23]:
from transformers import BertForSequenceClassification, BertTokenizer
model = BertForSequenceClassification.from_pretrained('bert_emotion_classification')
tokenizer = BertTokenizer.from_pretrained('bert_emotion_classification')

inputs = tokenizer('Hello, how are you?', return_tensors='pt')
outputs = model(**inputs)


OSError: ignored

In [None]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the dataset
dataset = datasets.load_dataset('go_emotions', split='train[:80%]')

# Load the tokenizer and encode the dataset
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded_dataset = dataset.map(lambda examples: tokenizer(examples['text'], padding=True, truncation=True), batched=True)

# Load the pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=28)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy='epoch',     # evaluate every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    num_train_epochs=3,              # total number of training epochs
    weight_decay=0.01,               # weight decay
    push_to_hub=False,               # whether to push the fine-tuned model to the Hugging Face model hub
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,     # load the best model at the end of training
    metric_for_best_model="accuracy",
)

# Define the Trainer object and fine-tune the BERT model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

# Save the fine-tuned BERT model
trainer.save_model('./models/bert_emotion_classification')

# Fine-tune the DistilBERT model
distilbert_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=28)
distilbert_trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=encoded_dataset,
    compute_metrics=compute_metrics,
)
distilbert_trainer.train()

# Save the fine-tuned DistilBERT model
distilbert_trainer.save_model('./models/distilbert_emotion_classification')

# Load the test dataset and encode it
test_dataset = datasets.load_dataset('go_emotions', split='train[80%:]')
encoded_test_dataset = test_dataset.map(lambda examples: tokenizer(examples['text'], padding=True, truncation=True), batched=True)

# Define the function to compute the metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = (predictions == labels).mean()
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Evaluate the fine-tuned BERT model on the test dataset
bert_eval_result = trainer.evaluate(encoded_test_dataset)
print("BERT Evaluation Result:")
for key, value in bert_eval_result.items():
    print(f"{key}: {value:.4f}")

# Evaluate the fine-tuned DistilBERT model on the test dataset
distilbert_eval_result = distilbert_trainer.evaluate(encoded_test_dataset)
print("DistilBERT Evaluation Result:")
for key, value in distilbert_eval_result.items():
  print(f"{key}: {value:.4f}")


In [None]:
# Define output directory
output_dir = './emotion_classification_bert/'

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save model to output directory
model_bert.save_pretrained(output_dir)

# Load saved model
model_bert = BertForSequenceClassification.from_pretrained(output_dir)

# Save tokenizer to output directory
tokenizer.save_pretrained(output_dir)
# Load saved tokenizer
tokenizer = BertTokenizer.from_pretrained(output_dir)


In [None]:
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load the emotion dataset
dataset = load_dataset("emotion")

# Instantiate the zero-shot classification pipeline
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/bart-large-mnli-yahoo-answers",
    tokenizer="joeddav/bart-large-mnli-yahoo-answers",
)

# Example prompt for offensive classification
prompt = "Is this text offensive? Answer yes or no.\n"

# Use a few examples from the dataset to create classification prompts
text = dataset["train"]["text"][:5]
labels = dataset["train"]["label"][:5]
prompts = [prompt + t for t in text]

# Use the zero-shot classifier on the test set
test_text = dataset["test"]["text"]
test_labels = dataset["test"]["label"]
zero_shot_preds = classifier(prompts, test_text)

# Evaluate zero-shot classification
correct = 0
total = len(test_labels)
for i, pred in enumerate(zero_shot_preds):
    label_pred = pred["labels"][0]
    if label_pred == labels[i]:
        correct += 1

accuracy = correct / total
print(f"Zero-shot classification accuracy: {accuracy:.4f}")


In [None]:
from transformers import pipeline

zero_shot_classifier = pipeline("zero-shot-classification", model="EleutherAI/gpt-neo-2.7B")

prompts = ["Is this text about anger, fear, joy, love, sadness, or surprise?",
           "What is the emotion expressed in this text?",
           "Can you classify the emotion in this text?",
           "Which of these emotions best describes the sentiment in this text?"]

zero_shot_results = []
for prompt in prompts:
    zero_shot_results.append(zero_shot_classifier(test_data["text"], candidate_labels=["anger", "fear", "joy", "love", "sadness", "surprise"], prompt=prompt))

print("Zero-shot classification results:", zero_shot_results)


In [None]:
from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification")

# Define the movie review test set
reviews = [
    "I really enjoyed this movie, it was great!",
    "This movie was terrible, I would never watch it again.",
    "The acting in this movie was amazing, but the plot was lacking.",
    "I found this movie to be very boring and uninteresting."
]

# Define the true labels for the test set
labels = ["positive", "negative", "positive", "negative"]

# Classify each review and evaluate the results
predicted_labels = classifier(reviews, candidate_labels=["positive", "negative"], multi_label=False)

# Calculate evaluation metrics
accuracy = sum([1 if predicted_labels[i]['labels'][0] == labels[i] else 0 for i in range(len(labels))]) / len(labels)
precision = sum([1 if predicted_labels[i]['labels'][0] == labels[i] and predicted_labels[i]['scores'][0] > 0.5 else 0 for i in range(len(labels))]) / sum([1 if predicted_labels[i]['labels'][0] == 'positive' else 0 for i in range(len(labels))])
recall = sum([1 if predicted_labels[i]['labels'][0] == labels[i] and predicted_labels[i]['scores'][0] > 0.5 else 0 for i in range(len(labels))]) / sum([1 if labels[i] == 'positive' else 0 for i in range(len(labels))])
f1_score = 2 * precision * recall / (precision + recall)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1_score)


In [16]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Load the dataset
dataset = load_dataset("emotion")

# Define the emotion labels
emotions = dataset["train"].features["label"].names

# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(5000)),
    "test": dataset["test"].shuffle().select(range(1000))
})

# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotions))

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(emotions))

# Tokenize the dataset
def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
y_true = dataset_dict["test"]["label"]
y_pred_bert = trainer_bert.predict(tokenized_dataset["test"]).predictions.argmax(axis=1)
y_pred_distilbert = trainer_distilbert.predict(tokenized_dataset["test"]).predictions.argmax(axis=1)

acc_bert = accuracy_score(y_true, y_pred_bert)
f1_bert = f1_score(y_true, y_pred_bert, average="weighted")

acc_distilbert = accuracy_score(y_true, y_pred_distilbert)
f1_distilbert = f1_score(y_true, y_pred_distilbert, average="weighted")

print(f"BERT accuracy: {acc_bert:.4f}")
print(f"BERT F1 score: {f1_bert:.4f}")
print(f"DistilBERT accuracy: {acc_distilbert:.4f}")
print(f"DistilBERT F1 score: {f1_distilbert:.4f}")




  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

ValueError: ignored

In [15]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset("emotion")

# Define the emotion labels
emotions = dataset["train"].features["label"].names

# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(5000)),
    "test": dataset["test"].shuffle().select(range(1000))
})

# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
##tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased", max_length=128)
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotions))

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
##tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased", max_length=128)
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(emotions))


tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased", max_length=128)

# Tokenize the dataset
'''def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)'''

def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation="longest_first", max_length=128)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")




  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

ValueError: ignored

In [10]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset("wikipedia", "20220301.simple")

# Define the emotion labels
emotions = ["happy", "sad", "angry", "surprised"]

# Filter out non-emotion articles
dataset = dataset.filter(lambda x: any(label in x["title"].lower() for label in emotions))

# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(5000)),
    "test": dataset["test"].shuffle().select(range(1000))
})

# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotions))

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(emotions))

# Tokenize the dataset
def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")




  0%|          | 0/1 [00:00<?, ?it/s]

Filter:   0%|          | 0/205328 [00:00<?, ? examples/s]

IndexError: ignored

In [12]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset("wikipedia", "20220301.simple")

# Define the emotion labels
emotions = ["happy", "sad", "angry", "surprised"]

# Filter out non-emotion articles
dataset = dataset.filter(lambda x: any(label in x["title"].lower() for label in emotions))

'''# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(285)),
    "test": dataset["test"].shuffle().select(range(1000))
})'''
# Split the dataset
dataset_dict = DatasetDict({
    "train": dataset["train"].shuffle().select(range(285)),
    "test": dataset["test"].shuffle().select(range(285, 500))
})


# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotions))

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(emotions))

# Tokenize the dataset
def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)

tokenized_dataset = dataset_dict.map(tokenize_dataset, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")




  0%|          | 0/1 [00:00<?, ?it/s]



KeyError: ignored

In [7]:
##!pip install datasets
##!pip install transformers
##!pip install apache_beam

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset
dataset = load_dataset("wikipedia", "20220301.simple")

# Load the tokenizer and the model
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-cased")
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Tokenize the dataset
def tokenize_dataset(dataset):
    return tokenizer_bert(dataset["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize_dataset, batched=True)

# Split the dataset
train_dataset = tokenized_dataset["review"]
test_dataset = tokenized_dataset["setiment"]

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")


Downloading and preparing dataset wikipedia/20220301.simple to /root/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Map:   0%|          | 0/205328 [00:00<?, ? examples/s]

KeyError: ignored

In [9]:
# Split the dataset
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["sentiment"]

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer_bert.train()
trainer_distilbert.train()

# Evaluate the model
eval_results_bert = trainer_bert.evaluate()
eval_results_distilbert = trainer_distilbert.evaluate()

print(f"BERT evaluation results: {eval_results_bert}")
print(f"DistilBERT evaluation results: {eval_results_distilbert}")

KeyError: ignored