In [100]:
from datasets import Dataset
from datasets import DatasetDict
from datasets import concatenate_datasets
from datasets import load_metric
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline
import csv
import numpy as np

In [108]:
# labels to ids
id2label = {0: "negative", 1: "neutral", 2:"positive"}
label2id = {v: k for k, v in id2label.items()}

# fuction to creat dataset from file
# TODO to be with pandas + add fillters
def file_to_dataset(filename):
    tweets_list = []
    labels_list = []
    with open(filename, "r", encoding="utf8") as file:
        tsv_reader = csv.reader(file, delimiter="\t")
        for line in tsv_reader:
            (id, label, tweet) = line
            tweets_list.append(tweet)
            labels_list.append(label2id[label])
    return Dataset.from_dict({"text": tweets_list, "label":labels_list})

# creat the dataset
train_data_2013 = file_to_dataset("semeval-2017-tweets_Subtask-A/downloaded/twitter-2013train-A.tsv")
train_data_2015 = file_to_dataset("semeval-2017-tweets_Subtask-A/downloaded/twitter-2015train-A.tsv")
train_data_2016 = file_to_dataset("semeval-2017-tweets_Subtask-A/downloaded/twitter-2016train-A.tsv")
test_data = file_to_dataset("semeval-2017-tweets_Subtask-A/downloaded/twitter-2015test-A.tsv")
dataset = DatasetDict({"train": concatenate_datasets([train_data_2013, train_data_2015, train_data_2016]), "validation":test_data})

# show sample
sample = dataset["train"].shuffle().select(range(3))
for row in sample:
    print(f"\n'>>> tweet: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [102]:
# load pretraind modle and tokenizer from checkpoint
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3, 
    id2label=id2label,
    label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# tokenize the data
def tokenize_function(tweet):
    return tokenizer(tweet["text"] , truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
   # TODO add
   # precition
   # recall
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels, average="micro")["f1"]
   return {"accuracy": accuracy, "f1": f1}


Map:   0%|          | 0/16041 [00:00<?, ? examples/s]

Map:   0%|          | 0/2390 [00:00<?, ? examples/s]

In [104]:
# TODO try fine tuning
training_args = TrainingArguments(
   output_dir="finetuning",
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   learning_rate=6e-6,
   evaluation_strategy="epoch"
)

# initialize trainer 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [105]:
trainer.train()

  0%|          | 0/4012 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.7661, 'learning_rate': 4.376869391824527e-05, 'epoch': 0.5}
{'loss': 0.661, 'learning_rate': 3.7537387836490526e-05, 'epoch': 1.0}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.6344468593597412, 'eval_accuracy': 0.7217573221757322, 'eval_f1': 0.7217573221757322, 'eval_runtime': 5.4812, 'eval_samples_per_second': 436.034, 'eval_steps_per_second': 27.366, 'epoch': 1.0}
{'loss': 0.4321, 'learning_rate': 3.1306081754735795e-05, 'epoch': 1.5}
{'loss': 0.4469, 'learning_rate': 2.507477567298106e-05, 'epoch': 1.99}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 0.766244113445282, 'eval_accuracy': 0.702510460251046, 'eval_f1': 0.702510460251046, 'eval_runtime': 7.229, 'eval_samples_per_second': 330.611, 'eval_steps_per_second': 20.75, 'epoch': 2.0}
{'loss': 0.2354, 'learning_rate': 1.8843469591226323e-05, 'epoch': 2.49}
{'loss': 0.2302, 'learning_rate': 1.2612163509471586e-05, 'epoch': 2.99}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 1.0257704257965088, 'eval_accuracy': 0.703347280334728, 'eval_f1': 0.703347280334728, 'eval_runtime': 5.2549, 'eval_samples_per_second': 454.815, 'eval_steps_per_second': 28.545, 'epoch': 3.0}
{'loss': 0.1031, 'learning_rate': 6.380857427716849e-06, 'epoch': 3.49}
{'loss': 0.1189, 'learning_rate': 1.4955134596211367e-07, 'epoch': 3.99}


  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 1.415643334388733, 'eval_accuracy': 0.7071129707112971, 'eval_f1': 0.707112970711297, 'eval_runtime': 5.7211, 'eval_samples_per_second': 417.752, 'eval_steps_per_second': 26.219, 'epoch': 4.0}
{'train_runtime': 739.8645, 'train_samples_per_second': 86.724, 'train_steps_per_second': 5.423, 'train_loss': 0.3736339961424663, 'epoch': 4.0}


TrainOutput(global_step=4012, training_loss=0.3736339961424663, metrics={'train_runtime': 739.8645, 'train_samples_per_second': 86.724, 'train_steps_per_second': 5.423, 'train_loss': 0.3736339961424663, 'epoch': 4.0})

In [106]:
trainer.evaluate()

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 1.415643334388733,
 'eval_accuracy': 0.7071129707112971,
 'eval_f1': 0.707112970711297,
 'eval_runtime': 6.2804,
 'eval_samples_per_second': 380.551,
 'eval_steps_per_second': 23.884,
 'epoch': 4.0}