In [1]:
import csv
from datasets import load_dataset
import numpy as np
from transformers import AutoTokenizer, DistilBertModel, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import random

## Read in dataset and preprocessing

In [2]:
# read in datasets 
train_data = load_dataset('csv', data_files = ['./data/train.csv'], split = 'train')
test_data = load_dataset('csv', data_files = ['./data/test.csv'], split = 'train')
val_data = load_dataset('csv', data_files = ['./data/val.csv'], split = 'train')



In [3]:
# Preprocess
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
# preprocessing function: tokenize tweet and truncate to be no longer than max length
def preprocess_function(examples):
    return tokenizer(examples["tweet"], truncation=True)

In [5]:
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)
tokenized_val = val_data.map(preprocess_function, batched=True)



In [6]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

In [7]:
accuracy = evaluate.load("accuracy")

In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

## Tiny Example: First 10 tweets

In [9]:
id2label = {0: "FAKE", 1: "REAL"}
label2id = {"FAKE": 0, "REAL": 1}

In [10]:
tiny_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

In [14]:
# define training hyperparameters 
tiny_training_args = TrainingArguments(
    output_dir="tiny_model",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    resume_from_checkpoint = "tiny_model/checkpoint-1"
)

tiny_trainer = Trainer(
    model=tiny_model,
    args=tiny_training_args,
    train_dataset=tokenized_train.select(range(10)),
    eval_dataset=tokenized_test.select(range(10)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
tiny_trainer.train(resume_from_checkpoint = "tiny_model/checkpoint-1")

Loading model from tiny_model/checkpoint-1.
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 66955010
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 1
  Will skip the first 1 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from tiny_model/checkpoint-1 (score: 0.010291600599884987).


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=1, training_loss=0.0, metrics={'train_runtime': 0.357, 'train_samples_per_second': 28.012, 'train_steps_per_second': 2.801, 'total_flos': 196631294880.0, 'train_loss': 0.0, 'epoch': 1.0})

In [16]:
tiny_trainer.evaluate(tokenized_test.select(range(10)))

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 10
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.010291600599884987,
 'eval_accuracy': 1.0,
 'eval_runtime': 35.5269,
 'eval_samples_per_second': 0.281,
 'eval_steps_per_second': 0.028,
 'epoch': 1.0}

In [20]:
y_logits = tiny_trainer.predict(tokenized_test.select(range(10)))

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 10


In [21]:
y_true = np.array([test_data[i]["label"] for i in range(test_data.num_rows)])

In [22]:
y_logits2 = y_logits.predictions 

In [23]:
import torch
# transform np.array to tensor (so we can use softmax)
y_logits3 = torch.from_numpy(y_logits2)

## Train 

### Full Dataset Trainer

In [48]:
id2label = {0: "FAKE", 1: "REAL"}
label2id = {"FAKE": 0, "REAL": 1}

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [15]:
def build_trainer(train, test, output_dir, resume_from_checkpoint):
    
    # define training hyperparameters 
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        resume_from_checkpoint = resume_from_checkpoint
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    return trainer

In [58]:
trainer = build_trainer(tokenized_train, tokenized_test, "modelA", "modelA/checkpoint-402")

trainer.train(resume_from_checkpoint = "modelA/checkpoint-402")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Loading model from modelA/checkpoint-402.
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6420
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 402
  Number of trainable parameters = 66955010
  Continuing training from checkpoint, will skip to saved global_step
  Continuing trainin

0it [00:00, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from modelA/checkpoint-402 (score: 0.12317584455013275).


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=402, training_loss=0.0, metrics={'train_runtime': 0.1268, 'train_samples_per_second': 50620.135, 'train_steps_per_second': 3169.672, 'total_flos': 169064622239376.0, 'train_loss': 0.0, 'epoch': 1.0})

In [59]:
trainer.evaluate(tokenized_test)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16


KeyboardInterrupt: 

### 100 randomly selected examples Trainers

In [64]:
accuracies = []

for i in range(4):
    train = tokenized_train.select(random.sample(range(tokenized_train.num_rows), k=100))
    trainer = build_trainer(train, tokenized_test, "model100_" + str(i + 1), False)

    trainer.train()
    
    accuracies.append(trainer.evaluate(tokenized_test))

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
  Number of trainable parameters = 66955010


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.171698,0.964953


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16
Saving model checkpoint to model100_1/checkpoint-7
Configuration saved in model100_1/checkpoint-7/config.json
Model weights saved in model100_1/checkpoint-7/pytorch_model.bin
tokenizer config file saved in model100_1/checkpoint-7/tokenizer_config.json
Special tokens file saved in model100_1/checkpoint-7/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from model100_1/checkpoint-7 (score: 0.1716984510421753).
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
  Number of trainable parameters = 66955010


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.169111,0.965421


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16
Saving model checkpoint to model100_2/checkpoint-7
Configuration saved in model100_2/checkpoint-7/config.json
Model weights saved in model100_2/checkpoint-7/pytorch_model.bin
tokenizer config file saved in model100_2/checkpoint-7/tokenizer_config.json
Special tokens file saved in model100_2/checkpoint-7/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from model100_2/checkpoint-7 (score: 0.16911059617996216).
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: twee

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
  Number of trainable parameters = 66955010


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.184755,0.966355


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16
Saving model checkpoint to model100_3/checkpoint-7
Configuration saved in model100_3/checkpoint-7/config.json
Model weights saved in model100_3/checkpoint-7/pytorch_model.bin
tokenizer config file saved in model100_3/checkpoint-7/tokenizer_config.json
Special tokens file saved in model100_3/checkpoint-7/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from model100_3/checkpoint-7 (score: 0.18475455045700073).
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: twee

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7
  Number of trainable parameters = 66955010


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.195679,0.966355


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16
Saving model checkpoint to model100_4/checkpoint-7
Configuration saved in model100_4/checkpoint-7/config.json
Model weights saved in model100_4/checkpoint-7/pytorch_model.bin
tokenizer config file saved in model100_4/checkpoint-7/tokenizer_config.json
Special tokens file saved in model100_4/checkpoint-7/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from model100_4/checkpoint-7 (score: 0.19567905366420746).
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: twee

In [68]:
average_accuracy_100 = sum([d['eval_accuracy'] for d in accuracies]) / len(accuracies)
average_accuracy_100

0.9657710280373832

### 400 randomly selected examples Trainers

In [73]:
accuracies = []

for i in range(4):
    train = tokenized_train.select(random.sample(range(tokenized_train.num_rows), k=400))
    trainer = build_trainer(train, tokenized_test, "model400_" + str(i + 1), False)

    trainer.train()
    
    accuracies.append(trainer.evaluate(tokenized_test))

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tweet, id. If tweet, id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 400
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 25
  Number of trainable parameters = 66955010


KeyboardInterrupt: 

In [70]:
average_accuracy_400 = sum([d['eval_accuracy'] for d in accuracies]) / len(accuracies)
average_accuracy_400

0.9503504672897196

### 1600 randomly selected examples Trainers

In [16]:
# accuracies = []

for i in range(2, 4):
    train = tokenized_train.select(random.sample(range(tokenized_train.num_rows), k=1600))
    trainer = build_trainer(train, tokenized_test, "model1600_" + str(i + 1), False)

    trainer.train()
    
#     accuracies.append(trainer.evaluate(tokenized_test))

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, tweet. If id, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1600
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.237653,0.911215


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, tweet. If id, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16
Saving model checkpoint to model1600_3/checkpoint-100
Configuration saved in model1600_3/checkpoint-100/config.json
Model weights saved in model1600_3/checkpoint-100/pytorch_model.bin
tokenizer config file saved in model1600_3/checkpoint-100/tokenizer_config.json
Special tokens file saved in model1600_3/checkpoint-100/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from model1600_3/checkpoint-100 (score: 0.2376527190208435).
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.188896,0.926636


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, tweet. If id, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2140
  Batch size = 16
Saving model checkpoint to model1600_4/checkpoint-100
Configuration saved in model1600_4/checkpoint-100/config.json
Model weights saved in model1600_4/checkpoint-100/pytorch_model.bin
tokenizer config file saved in model1600_4/checkpoint-100/tokenizer_config.json
Special tokens file saved in model1600_4/checkpoint-100/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from model1600_4/checkpoint-100 (score: 0.1888955533504486).


In [None]:
# average_accuracy_1600 = sum([d['eval_accuracy'] for d in accuracies]) / len(accuracies)
# average_accuracy_1600