In [6]:
# 释放因为程序出错不能释放的显存

import torch
import gc

try:
    del model
except:
    pass
try:
    del trainer
except:
    pass
gc.collect()
torch.cuda.empty_cache()

In [7]:
# 注释参见bert.ipynb

from sklearn.metrics import recall_score, accuracy_score, f1_score
import numpy as np
import os

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "AvgRec": recall_score(labels, predictions, average="macro"),
        "Accuracy": accuracy_score(labels, predictions),
        "F1-macro": f1_score(labels, predictions, average="macro")
    }

isTrial = False
hasPreprocessedTweet = True

eval_steps = 20 if isTrial else 500
num_epochs = 3
learning_rate = 5e-5
num_batch_size = 16

parameter_info = f"bert{'_trial' if isTrial else ''}{'_preprocessed' if hasPreprocessedTweet else ''}_nepoch-{num_epochs}_lr-{learning_rate}_bz-{num_batch_size}"
output_dir = f"/home/featurize/model_output/{parameter_info}"

fp_train = f"data/semeval_2017_task4_train{'_trial' if isTrial else ''}{'_preprocessed' if hasPreprocessedTweet else ''}.csv"
fp_test = f"data/semeval_2017_task4_test{'_trial' if isTrial else ''}{'_preprocessed' if hasPreprocessedTweet else ''}.csv"

test_file_name = os.path.splitext(os.path.split(fp_test)[1])[0]

sentiment2label = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

label2sentiment = {v:k for k, v in sentiment2label.items()}

In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

def tokenize_data(data):
    return tokenizer(data['tweet'], padding='max_length')

dataset = load_dataset('csv', data_files={'train': fp_train, 'test': fp_test})
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = dataset.map(tokenize_data, batched=True)

Using custom data configuration default-4640c170f2e9f920


Downloading and preparing dataset csv/default to /home/featurize/.cache/huggingface/datasets/csv/default-4640c170f2e9f920/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/featurize/.cache/huggingface/datasets/csv/default-4640c170f2e9f920/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/featurize/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/

  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

In [9]:
def display_train_item(num):
    print(dataset["train"][num]["tweet"])
    print(" ".join([w for w in tokenizer.decode(dataset["train"][num]["input_ids"]).split() if w != "[PAD]"]))

for i in [13, 133, 266]:
    display_train_item(i)

i look forward to reading it .
[CLS] i look forward to reading it. [SEP]
i have been losing time all over the place lately .
[CLS] i have been losing time all over the place lately. [SEP]
every freaking time i take a break and come back to skating i push too hard and my tailbone ends up out of place .
[CLS] every freaking time i take a break and come back to skating i push too hard and my tailbone ends up out of place. [SEP]


In [10]:
def transfer_for_model(para):
    return {'labels': para['label']}

dataset = dataset.map(transfer_for_model, remove_columns=["tweet", "label"])

  0%|          | 0/53570 [00:00<?, ?ex/s]

  0%|          | 0/12284 [00:00<?, ?ex/s]

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

train_size = len(dataset["train"])
new_eval_size = train_size // 10
new_train_size = train_size - new_eval_size

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3, label2id=sentiment2label, id2label=label2sentiment)

train_dataset = dataset['train'].shuffle().select(range(new_train_size))
eval_dataset = dataset['train'].shuffle().select(range(new_train_size, train_size))
test_dataset = dataset["test"]

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    save_steps=eval_steps,
    logging_steps=eval_steps,
    per_device_train_batch_size=num_batch_size,
    per_device_eval_batch_size=num_batch_size,
    load_best_model_at_end=True,
    metric_for_best_model="AvgRec",
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/featurize/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  

Step,Training Loss,Validation Loss,Avgrec,Accuracy,F1-macro
500,0.7695,0.678518,0.690291,0.696099,0.67894
1000,0.6874,0.622747,0.686275,0.725966,0.705465
1500,0.6635,0.564524,0.730468,0.75938,0.740003
2000,0.6558,0.519178,0.761578,0.778421,0.767082
2500,0.6336,0.514016,0.80161,0.789621,0.782797
3000,0.6324,0.477874,0.763786,0.800075,0.781754
3500,0.4428,0.418825,0.817832,0.830315,0.82239
4000,0.4449,0.400504,0.853263,0.845809,0.839441
4500,0.4272,0.358649,0.848332,0.86429,0.8577
5000,0.4472,0.322111,0.885338,0.88053,0.875962


***** Running Evaluation *****
  Num examples = 5357
  Batch size = 16
Saving model checkpoint to /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-500
Configuration saved in /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-500/config.json
Model weights saved in /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-500/tokenizer_config.json
Special tokens file saved in /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5357
  Batch size = 16
Saving model checkpoint to /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-1000
Configuration saved in /home/featurize/model_output/bert_preprocessed_nepoch-3_lr-5e-05_bz-16/checkpoint-

TrainOutput(global_step=9042, training_loss=0.44054279409660435, metrics={'train_runtime': 3522.0544, 'train_samples_per_second': 41.067, 'train_steps_per_second': 2.567, 'total_flos': 3.805646162676019e+16, 'train_loss': 0.44054279409660435, 'epoch': 3.0})

In [13]:
import pandas as pd
import os
from sklearn.metrics import recall_score
import json

predict = trainer.predict(test_dataset=test_dataset)
predictions = np.argmax(predict.predictions, axis=1)

p = pd.read_csv(fp_test)
p["prediction"] = predictions
test_file_name = os.path.splitext(os.path.split(fp_test)[1])[0]
p.to_csv(f"predictions/{test_file_name}_predictions_{parameter_info}.csv", index=False)

recall_scores = dict({"file": test_file_name, "parameter": parameter_info})
for label in label2sentiment.keys():
    recall_scores[label] = recall_score(p["label"], p["prediction"], average="macro", labels=[label])
recall_scores.update(predict.metrics)

json.dump(recall_scores, open(f"scores/scores_{test_file_name}_{parameter_info}.txt", "w"), indent=2, ensure_ascii=False)


***** Running Prediction *****
  Num examples = 12284
  Batch size = 16
