In [1]:
# Standard library
import csv
import os

# Third-party
import torch
import transformers

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# (To force CPU execution, replace the expression with torch.device('cpu').)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [2]:
# Directory that holds the dataset files, relative to the current working directory.
data_path = os.path.join(os.curdir,"data")
data_path

'.\\data'

In [3]:
from transformers import AutoTokenizer

# Tokenizer of the same DistilBERT checkpoint that the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [4]:
def preprocess(data):
    """Tokenize the ``"text"`` entry of ``data`` with the notebook-level tokenizer.

    Args:
        data: Mapping with a ``"text"`` entry; used with ``Dataset.map(batched=True)``
            in this notebook, so that entry is a list of strings.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
        No padding is requested here.
    """
    texts = data["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


In [5]:
def load_data(type):
    """Read the texts and integer labels of one dataset split.

    The split is stored as two parallel files inside ``data_path``:
    ``<type>_text.txt`` (one example per line) and ``<type>_labels.txt``
    (one integer label per line).

    Args:
        type: Split name used as the file prefix, e.g. ``"train"``, ``"val"``
            or ``"test"``. (The name shadows the builtin ``type``; it is kept
            so existing keyword callers keep working.)

    Returns:
        tuple[list[str], list[int]]: ``(text, labels)`` with one entry per example.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a label is not an integer, or if the number of text
            lines and labels differ (the pairs would be misaligned).
    """
    label_path = os.path.join(data_path, type + "_labels.txt")
    text_path = os.path.join(data_path, type + "_text.txt")

    # Context managers make sure both handles are closed again.
    with open(label_path, encoding="utf8") as label_file:
        raw_labels = label_file.read().split("\n")
    with open(text_path, encoding="utf8") as text_file:
        text = text_file.read().split("\n")

    # A trailing newline yields one empty final element; drop only that one so a
    # file that does NOT end with a newline keeps its last example.
    if text and text[-1] == "":
        text.pop()

    # Blank (or whitespace-only) label lines carry no label and are skipped.
    labels = [int(label) for label in raw_labels if label.strip()]

    if len(text) != len(labels):
        raise ValueError(
            f"{text_path} has {len(text)} lines but {label_path} has "
            f"{len(labels)} labels"
        )
    return text, labels

def load_data_dict(type):
    """Return one split as a column dictionary.

    Args:
        type: Split name, forwarded unchanged to ``load_data``.

    Returns:
        dict: ``{"text": <texts>, "label": <labels>}`` built from the pair
        returned by ``load_data``.
    """
    texts, labels = load_data(type)
    return dict(text=texts, label=labels)


In [6]:
# Peek at the first training example; the labels are not needed here.
text, _ = load_data("train")
text[0]

"“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry "

In [7]:
from datasets import Dataset

# Build a Dataset with "text" and "label" columns from the training split.
# NOTE(review): `ds` is not referenced again in the visible notebook, and the same
# dataset is rebuilt as `train_data` in the following cell — confirm before removing.
ds = Dataset.from_dict(load_data_dict("train"))

In [8]:
def _load_split(split):
    """Return ``(raw, tokenized)`` datasets for the split called ``split``."""
    raw = Dataset.from_dict(load_data_dict(split))
    return raw, raw.map(preprocess, batched=True)


# Same order as before: each split is loaded and tokenized before the next one.
train_data, tokenized_train_data = _load_split("train")
val_data, tokenized_val_data = _load_split("val")
test_data, tokenized_test_data = _load_split("test")

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

In [9]:
# Show the columns and row count of the tokenized training split.
tokenized_train_data

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3257
})

In [10]:
mapping_path = os.path.join(data_path, "mapping.txt")
# The file is tab-separated; the second field of each row is the class name
# (the first field is presumably the numeric label id — verify against the file).
# newline="" is what the csv module expects for files it parses itself.
with open(mapping_path, encoding="utf-8", newline="") as f:
    # Parse while the file is open, and do not reuse the notebook-level name
    # `text`, which still holds the training texts from an earlier cell.
    # Rows with fewer than two fields (e.g. blank lines) are skipped.
    classes = [row[1] for row in csv.reader(f, delimiter='\t') if len(row) > 1]
classes

['anger', 'joy', 'optimism', 'sadness']

In [11]:
# Index -> class name, in the order the classes were read.
id2label = dict(enumerate(classes))
# Inverse lookup: class name -> index.
label2id = {name: idx for idx, name in id2label.items()}

In [12]:
# NOTE(review): EMBEDDING_DIM and HIDDEN_DIM are not referenced anywhere else in the
# visible notebook — possibly leftovers from another model setup; confirm before removing.
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
# Number of training epochs; passed to TrainingArguments as num_train_epochs.
num_epoch = 7

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence classifier on top of the DistilBERT checkpoint, with one output per
# class and the id <-> label mappings handed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() during evaluation.
accuracy = evaluate.load("accuracy")

In [15]:
import numpy as np
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer and handed to the Trainer as data_collator.
# preprocess() tokenizes without padding, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example, ``labels`` the reference class ids.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of every row.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [17]:
# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epoch,
    weight_decay=0.01,
    # Evaluate and save a checkpoint after every epoch, then reload the best one.
    # NOTE(review): no metric_for_best_model is set; per the Transformers docs the
    # best checkpoint is then chosen by validation loss, not accuracy — confirm
    # that this is intended.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [18]:
# Wire model, data, tokenizer, collator and metric together, then fine-tune.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.640561,0.762032
2,No log,0.675219,0.783422
3,0.628900,0.685589,0.791444
4,0.628900,0.759019,0.780749
5,0.199800,0.883584,0.783422
6,0.199800,0.942929,0.775401
7,0.199800,0.954463,0.778075


TrainOutput(global_step=1428, training_loss=0.31653284826198547, metrics={'train_runtime': 956.9869, 'train_samples_per_second': 23.824, 'train_steps_per_second': 1.492, 'total_flos': 247389361823016.0, 'train_loss': 0.31653284826198547, 'epoch': 7.0})

In [19]:
def eval_net(model, loader):
    """Return the accuracy of ``model`` on ``loader``, scored one example at a time.

    Args:
        model: ``torch.nn.Module`` that is called as
            ``model(input_ids, attention_mask)`` and returns an object with a
            ``.logits`` tensor. It is switched to eval mode and left there.
        loader: Iterable of per-example mappings with the keys ``"input_ids"``
            and ``"attention_mask"`` (un-batched sequences of ints) and
            ``"label"`` (the reference class id). It does not need ``len()``.

    Returns:
        float: Fraction of examples whose arg-max logit equals the label.

    Raises:
        ZeroDivisionError: If ``loader`` yields no examples.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level device variable; parameter-free models are run on the CPU.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0
    # Inference only: without no_grad() every forward pass records an autograd graph.
    with torch.no_grad():
        for example in loader:
            # Wrap each sequence in a list to add the batch dimension (batch size 1).
            input_ids = torch.tensor([example["input_ids"]], device=target)
            attention_mask = torch.tensor([example["attention_mask"]], device=target)
            logits = model(input_ids, attention_mask).logits
            if logits.argmax().item() == int(example["label"]):
                correct += 1
            total += 1

    return correct / total

In [20]:
# Show the columns and row count of the tokenized test split.
tokenized_test_data

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1421
})

In [21]:
# Take the first test example (idx is its position, 0) to inspect one tokenized row.
idx, data = next(enumerate(tokenized_test_data))
data

{'text': '#Deppression is real. Partners w/ #depressed people truly dont understand the depth in which they affect us. Add in #anxiety &amp;makes it worse ',
 'label': 3,
 'input_ids': [101,
  1001,
  2139,
  9397,
  8303,
  3258,
  2003,
  2613,
  1012,
  5826,
  1059,
  1013,
  1001,
  14777,
  2111,
  5621,
  2123,
  2102,
  3305,
  1996,
  5995,
  1999,
  2029,
  2027,
  7461,
  2149,
  1012,
  5587,
  1999,
  1001,
  10089,
  1004,
  23713,
  1025,
  3084,
  2009,
  4788,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [22]:
# Score the single example selected above. eval() plus no_grad() make this a pure
# inference pass: no autograd graph is recorded, and the result does not depend on
# which mode training left the model in.
model.eval()
with torch.no_grad():
    sample_logits = model(
        torch.tensor([data["input_ids"]], device=device),
        torch.tensor([data["attention_mask"]], device=device),
    ).logits
sample_logits

tensor([[-0.4235, -1.3257, -0.3454,  2.0747]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [23]:
# Accuracy on the test split, scored one example at a time by eval_net.
print(f"The accuracy on test is {eval_net(model,tokenized_test_data)*100:.2f}%")

The accuracy on test is 79.24%


Zum Vergleich: Bei GRU war die Test-Accuracy 59.68%, bei LSTM 61.08%.