In [1]:
%load_ext blackcellmagic

In [2]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)




In [3]:
raw = pd.read_csv("../data/train_raw.csv").sample(n=5000, random_state=42, replace=False)

In [4]:
raw.shape

(5000, 5)

In [5]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 96140 to 392165
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweetid            5000 non-null   int64 
 1   user_display_name  5000 non-null   object
 2   tweet_text         5000 non-null   object
 3   clean_text         5000 non-null   object
 4   troll_or_not       5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 234.4+ KB


In [6]:
raw.head()

Unnamed: 0,tweetid,user_display_name,tweet_text,clean_text,troll_or_not
96140,948701054463545344,W8p+9rcodeTsTUcwjGDN0fflucRtDy5fcQiTi1P+OGU=,Northern Ireland face tough Gold Coast challen...,Northern Ireland face tough Gold Coast challenge,1
309564,1250129010820071426,xinwenxiaojie,@adrianbye Not denying that but the way they'v...,Not denying that but the way theyve been writt...,0
309513,913235360523546625,EmilyZFeng,China soft power: JV-like institutions being s...,China soft power JVlike institutions being set...,0
73602,820153030435860481,阿丽木琴,ONLYRPE: #TLRP saran nick mulchar? R0ME0F,ONLYRPE #TLRP saran nick mulchar R0ME0F,1
568617,1237374169655971841,KaiserKuo,It's not altogether kind. He doesn't think she...,Its not altogether kind He doesn not think she...,0


In [7]:
X = list(raw["clean_text"].values)
y = list(raw["troll_or_not"].values)


train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, random_state=42, test_size=0.2, stratify=y
)


In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [9]:
class tweetsdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = tweetsdataset(train_encodings, train_labels)
test_dataset = tweetsdataset(test_encodings, test_labels)

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
def hp_tune(params):
    return {
        "learning_rate": params.suggest_float("learning_rate", 5e-5, 5e-4, log=True),
        "num_train_epochs": params.suggest_int("num_train_epochs", 1, 3),
        "seed": params.suggest_int("seed", 1, 42),
        "per_device_train_batch_size": params.suggest_categorical(
            "per_device_train_batch_size", [8, 16, 32]
        ),
        #"per_device_eval_batch_size": params.suggest_categorical(
        #    "per_device_eval_batch_size", [8, 16, 32]
        #),
    }


In [12]:
def model_init():
    return DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", return_dict=True
    )


In [13]:
%%time

training_args = TrainingArguments(
    output_dir="../results",  # output directory
    overwrite_output_dir=True,
    #num_train_epochs=3,  # total number of training epochs
    #learning_rate=5e-5,
    #per_device_train_batch_size=8,  # batch size per device during training
    #per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_steps=1000,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="../logs",  # directory for storing logs
    logging_steps=5,
    save_steps=10000,
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    save_total_limit=1,
    gradient_accumulation_steps=8, #reduce memory usage while allowing bigger overall batch size.
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model_init=model_init,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # test dataset
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

CPU times: user 1.97 s, sys: 357 ms, total: 2.33 s
Wall time: 5.16 s


In [14]:
%%time

trainer.hyperparameter_search(direction="maximize", hp_space=hp_tune)

[I 2020-09-10 19:53:08,206] A new study created in memory with name: no-name-570f7bca-472d-4fcc-b5c0-f5f0002f5fd4
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpo

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.6964195728302002, 'learning_rate': 2.755187894135314e-07, 'epoch': 0.16, 'step': 5}
{'loss': 0.6889097690582275, 'learning_rate': 5.510375788270628e-07, 'epoch': 0.32, 'step': 10}
{'loss': 0.6983746528625489, 'learning_rate': 8.265563682405942e-07, 'epoch': 0.48, 'step': 15}
{'loss': 0.6903141021728516, 'learning_rate': 1.1020751576541256e-06, 'epoch': 0.64, 'step': 20}
{'loss': 0.6926868438720704, 'learning_rate': 1.377593947067657e-06, 'epoch': 0.8, 'step': 25}
{'loss': 0.6884334564208985, 'learning_rate': 1.6531127364811883e-06, 'epoch': 0.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.7217544555664063, 'learning_rate': 1.92863152589472e-06, 'epoch': 1.1280000000000001, 'step': 35}
{'loss': 0.6830867767333985, 'learning_rate': 2.2041503153082513e-06, 'epoch': 1.288, 'step': 40}
{'loss': 0.67794189453125, 'learning_rate': 2.4796691047217825e-06, 'epoch': 1.448, 'step': 45}
{'loss': 0.6773845672607421, 'learning_rate': 2.755187894135314e-06, 'epoch': 1.608, 'step': 50}
{'loss': 0.6710739135742188, 'learning_rate': 3.0307066835488454e-06, 'epoch': 1.768, 'step': 55}
{'loss': 0.6647987365722656, 'learning_rate': 3.3062254729623767e-06, 'epoch': 1.928, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.6901519775390625, 'learning_rate': 3.581744262375908e-06, 'epoch': 2.096, 'step': 65}
{'loss': 0.6394630432128906, 'learning_rate': 3.85726305178944e-06, 'epoch': 2.2560000000000002, 'step': 70}
{'loss': 0.609100341796875, 'learning_rate': 4.13278184120297e-06, 'epoch': 2.416, 'step': 75}
{'loss': 0.6013671875, 'learning_rate': 4.4083006306165025e-06, 'epoch': 2.576, 'step': 80}
{'loss': 0.5659149169921875, 'learning_rate': 4.683819420030034e-06, 'epoch': 2.7359999999999998, 'step': 85}
{'loss': 0.5345794677734375, 'learning_rate': 4.959338209443565e-06, 'epoch': 2.896, 'step': 90}




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=125.0, style=ProgressStyle(description_w…


{'eval_loss': 0.47782826828956604, 'eval_accuracy': 0.799, 'eval_f1': 0.7881981032665965, 'eval_precision': 0.8385650224215246, 'eval_recall': 0.7435387673956262, 'epoch': 2.992, 'step': 93}


[I 2020-09-10 20:07:25,435] Trial 0 finished with value: 3.1693018930837473 and parameters: {'learning_rate': 5.510375788270628e-05, 'num_train_epochs': 3, 'seed': 22, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 3.1693018930837473.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model fr

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=7.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6921026229858398, 'learning_rate': 6.707782018373533e-07, 'epoch': 0.32, 'step': 5}
{'loss': 0.6924108505249024, 'learning_rate': 1.3415564036747066e-06, 'epoch': 0.64, 'step': 10}
{'loss': 0.6903835296630859, 'learning_rate': 2.0123346055120596e-06, 'epoch': 0.96, 'step': 15}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7671373367309571, 'learning_rate': 2.683112807349413e-06, 'epoch': 1.32, 'step': 20}
{'loss': 0.6800626754760742, 'learning_rate': 3.353891009186766e-06, 'epoch': 1.6400000000000001, 'step': 25}
{'loss': 0.6712554931640625, 'learning_rate': 4.024669211024119e-06, 'epoch': 1.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7472354888916015, 'learning_rate': 4.695447412861473e-06, 'epoch': 2.32, 'step': 35}
{'loss': 0.6480644226074219, 'learning_rate': 5.366225614698826e-06, 'epoch': 2.64, 'step': 40}
{'loss': 0.6279804229736328, 'learning_rate': 6.037003816536179e-06, 'epoch': 2.96, 'step': 45}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6940689086914062, 'learning_rate': 6.707782018373532e-06, 'epoch': 3.32, 'step': 50}
{'loss': 0.5822822570800781, 'learning_rate': 7.378560220210886e-06, 'epoch': 3.64, 'step': 55}
{'loss': 0.56004638671875, 'learning_rate': 8.049338422048239e-06, 'epoch': 3.96, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.5794486999511719, 'learning_rate': 8.720116623885592e-06, 'epoch': 4.32, 'step': 65}
{'loss': 0.48795089721679685, 'learning_rate': 9.390894825722946e-06, 'epoch': 4.64, 'step': 70}
{'loss': 0.4457099914550781, 'learning_rate': 1.0061673027560297e-05, 'epoch': 4.96, 'step': 75}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.4778160095214844, 'learning_rate': 1.0732451229397652e-05, 'epoch': 5.32, 'step': 80}
{'loss': 0.40204010009765623, 'learning_rate': 1.1403229431235006e-05, 'epoch': 5.64, 'step': 85}
{'loss': 0.366546630859375, 'learning_rate': 1.2074007633072358e-05, 'epoch': 5.96, 'step': 90}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…





HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=125.0, style=ProgressStyle(description_w…


{'eval_loss': 0.3538577839136124, 'eval_accuracy': 0.86, 'eval_f1': 0.8694029850746269, 'eval_precision': 0.8189806678383128, 'eval_recall': 0.9264413518886679, 'epoch': 6.192, 'step': 93}


[I 2020-09-10 20:35:22,553] Trial 1 finished with value: 3.4748250048016076 and parameters: {'learning_rate': 0.00013415564036747064, 'num_train_epochs': 1, 'seed': 38, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 3.4748250048016076.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model f

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=7.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6957561016082764, 'learning_rate': 6.244281091928895e-07, 'epoch': 0.32, 'step': 5}
{'loss': 0.6975147724151611, 'learning_rate': 1.248856218385779e-06, 'epoch': 0.64, 'step': 10}
{'loss': 0.6917993545532226, 'learning_rate': 1.8732843275786685e-06, 'epoch': 0.96, 'step': 15}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7783477783203125, 'learning_rate': 2.497712436771558e-06, 'epoch': 1.32, 'step': 20}
{'loss': 0.6882513046264649, 'learning_rate': 3.1221405459644475e-06, 'epoch': 1.6400000000000001, 'step': 25}
{'loss': 0.680438232421875, 'learning_rate': 3.746568655157337e-06, 'epoch': 1.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7604385375976562, 'learning_rate': 4.370996764350227e-06, 'epoch': 2.32, 'step': 35}
{'loss': 0.6669113159179687, 'learning_rate': 4.995424873543116e-06, 'epoch': 2.64, 'step': 40}
{'loss': 0.6503486633300781, 'learning_rate': 5.619852982736005e-06, 'epoch': 2.96, 'step': 45}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7070014953613282, 'learning_rate': 6.244281091928895e-06, 'epoch': 3.32, 'step': 50}
{'loss': 0.6001136779785157, 'learning_rate': 6.8687092011217845e-06, 'epoch': 3.64, 'step': 55}
{'loss': 0.5558853149414062, 'learning_rate': 7.493137310314674e-06, 'epoch': 3.96, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.5819236755371093, 'learning_rate': 8.117565419507564e-06, 'epoch': 4.32, 'step': 65}
{'loss': 0.46740570068359377, 'learning_rate': 8.741993528700454e-06, 'epoch': 4.64, 'step': 70}
{'loss': 0.44638824462890625, 'learning_rate': 9.366421637893341e-06, 'epoch': 4.96, 'step': 75}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.4824378967285156, 'learning_rate': 9.990849747086232e-06, 'epoch': 5.32, 'step': 80}
{'loss': 0.3972312927246094, 'learning_rate': 1.0615277856279122e-05, 'epoch': 5.64, 'step': 85}
{'loss': 0.3886848449707031, 'learning_rate': 1.123970596547201e-05, 'epoch': 5.96, 'step': 90}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…





HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=125.0, style=ProgressStyle(description_w…


{'eval_loss': 0.3620190262794495, 'eval_accuracy': 0.841, 'eval_f1': 0.8392315470171892, 'eval_precision': 0.8539094650205762, 'eval_recall': 0.8250497017892644, 'epoch': 6.192, 'step': 93}


[I 2020-09-10 21:03:23,084] Trial 2 finished with value: 3.35919071382703 and parameters: {'learning_rate': 0.0001248856218385779, 'num_train_epochs': 2, 'seed': 1, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 3.4748250048016076.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=7.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6897110939025879, 'learning_rate': 3.0750518755748716e-07, 'epoch': 0.32, 'step': 5}
{'loss': 0.6855953216552735, 'learning_rate': 6.150103751149743e-07, 'epoch': 0.64, 'step': 10}
{'loss': 0.6848952293395996, 'learning_rate': 9.225155626724613e-07, 'epoch': 0.96, 'step': 15}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7741003036499023, 'learning_rate': 1.2300207502299487e-06, 'epoch': 1.32, 'step': 20}
{'loss': 0.6812688827514648, 'learning_rate': 1.5375259377874358e-06, 'epoch': 1.6400000000000001, 'step': 25}
{'loss': 0.6787818908691406, 'learning_rate': 1.8450311253449227e-06, 'epoch': 1.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7609752655029297, 'learning_rate': 2.15253631290241e-06, 'epoch': 2.32, 'step': 35}
{'loss': 0.6716098785400391, 'learning_rate': 2.4600415004598973e-06, 'epoch': 2.64, 'step': 40}
{'loss': 0.6616451263427734, 'learning_rate': 2.7675466880173842e-06, 'epoch': 2.96, 'step': 45}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7401714324951172, 'learning_rate': 3.0750518755748715e-06, 'epoch': 3.32, 'step': 50}
{'loss': 0.6442947387695312, 'learning_rate': 3.3825570631323584e-06, 'epoch': 3.64, 'step': 55}
{'loss': 0.6307945251464844, 'learning_rate': 3.6900622506898453e-06, 'epoch': 3.96, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6811500549316406, 'learning_rate': 3.997567438247333e-06, 'epoch': 4.32, 'step': 65}
{'loss': 0.5765296936035156, 'learning_rate': 4.30507262580482e-06, 'epoch': 4.64, 'step': 70}
{'loss': 0.5530487060546875, 'learning_rate': 4.612577813362307e-06, 'epoch': 4.96, 'step': 75}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.5825027465820313, 'learning_rate': 4.920083000919795e-06, 'epoch': 5.32, 'step': 80}
{'loss': 0.48347930908203124, 'learning_rate': 5.2275881884772815e-06, 'epoch': 5.64, 'step': 85}
{'loss': 0.45449676513671877, 'learning_rate': 5.5350933760347684e-06, 'epoch': 5.96, 'step': 90}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…





HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=125.0, style=ProgressStyle(description_w…


{'eval_loss': 0.4220557609796524, 'eval_accuracy': 0.817, 'eval_f1': 0.8107549120992761, 'eval_precision': 0.8448275862068966, 'eval_recall': 0.7793240556660039, 'epoch': 6.192, 'step': 93}


[I 2020-09-10 21:30:23,011] Trial 3 finished with value: 3.2519065539721765 and parameters: {'learning_rate': 6.150103751149743e-05, 'num_train_epochs': 3, 'seed': 13, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 3.4748250048016076.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model fr

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.6972189903259277, 'learning_rate': 1.2527640692007789e-06, 'epoch': 0.16, 'step': 5}
{'loss': 0.6974266052246094, 'learning_rate': 2.5055281384015578e-06, 'epoch': 0.32, 'step': 10}
{'loss': 0.6909745216369629, 'learning_rate': 3.7582922076023364e-06, 'epoch': 0.48, 'step': 15}
{'loss': 0.690184211730957, 'learning_rate': 5.0110562768031155e-06, 'epoch': 0.64, 'step': 20}
{'loss': 0.683860969543457, 'learning_rate': 6.263820346003895e-06, 'epoch': 0.8, 'step': 25}
{'loss': 0.6733028411865234, 'learning_rate': 7.516584415204673e-06, 'epoch': 0.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.689487075805664, 'learning_rate': 8.769348484405453e-06, 'epoch': 1.1280000000000001, 'step': 35}
{'loss': 0.6423381805419922, 'learning_rate': 1.0022112553606231e-05, 'epoch': 1.288, 'step': 40}
{'loss': 0.5965122222900391, 'learning_rate': 1.127487662280701e-05, 'epoch': 1.448, 'step': 45}
{'loss': 0.5492351531982422, 'learning_rate': 1.252764069200779e-05, 'epoch': 1.608, 'step': 50}
{'loss': 0.5002609252929687, 'learning_rate': 1.3780404761208567e-05, 'epoch': 1.768, 'step': 55}
{'loss': 0.4364105224609375, 'learning_rate': 1.5033168830409346e-05, 'epoch': 1.928, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.46723251342773436, 'learning_rate': 1.6285932899610126e-05, 'epoch': 2.096, 'step': 65}
{'loss': 0.38009796142578123, 'learning_rate': 1.7538696968810906e-05, 'epoch': 2.2560000000000002, 'step': 70}
{'loss': 0.3963035583496094, 'learning_rate': 1.8791461038011682e-05, 'epoch': 2.416, 'step': 75}
{'loss': 0.37826385498046877, 'learning_rate': 2.0044225107212462e-05, 'epoch': 2.576, 'step': 80}
{'loss': 0.3208290100097656, 'learning_rate': 2.1296989176413242e-05, 'epoch': 2.7359999999999998, 'step': 85}
{'loss': 0.3103759765625, 'learning_rate': 2.254975324561402e-05, 'epoch': 2.896, 'step': 90}




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=125.0, style=ProgressStyle(description_w…


{'eval_loss': 0.329846893876791, 'eval_accuracy': 0.856, 'eval_f1': 0.8477801268498942, 'eval_precision': 0.9051918735891648, 'eval_recall': 0.7972166998011928, 'epoch': 2.992, 'step': 93}


[I 2020-09-10 21:44:37,769] Trial 4 finished with value: 3.4061887002402518 and parameters: {'learning_rate': 0.00025055281384015577, 'num_train_epochs': 2, 'seed': 23, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 3.4748250048016076.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model f

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.6871309757232666, 'learning_rate': 2.2119798260116285e-06, 'epoch': 0.16, 'step': 5}
{'loss': 0.6888874530792236, 'learning_rate': 4.423959652023257e-06, 'epoch': 0.32, 'step': 10}
{'loss': 0.682749080657959, 'learning_rate': 6.635939478034886e-06, 'epoch': 0.48, 'step': 15}
{'loss': 0.6734565734863281, 'learning_rate': 8.847919304046514e-06, 'epoch': 0.64, 'step': 20}
{'loss': 0.6489477157592773, 'learning_rate': 1.1059899130058144e-05, 'epoch': 0.8, 'step': 25}
{'loss': 0.6203708648681641, 'learning_rate': 1.3271878956069771e-05, 'epoch': 0.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.5888439178466797, 'learning_rate': 1.54838587820814e-05, 'epoch': 1.1280000000000001, 'step': 35}
{'loss': 0.5084144592285156, 'learning_rate': 1.7695838608093028e-05, 'epoch': 1.288, 'step': 40}
{'loss': 0.4600494384765625, 'learning_rate': 1.9907818434104655e-05, 'epoch': 1.448, 'step': 45}
{'loss': 0.4229282379150391, 'learning_rate': 2.211979826011629e-05, 'epoch': 1.608, 'step': 50}
{'loss': 0.4034912109375, 'learning_rate': 2.4331778086127916e-05, 'epoch': 1.768, 'step': 55}
{'loss': 0.4088134765625, 'learning_rate': 2.6543757912139542e-05, 'epoch': 1.928, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=250.0, style=ProgressStyle(description_wi…

{'loss': 0.319287109375, 'learning_rate': 2.8755737738151173e-05, 'epoch': 2.096, 'step': 65}
{'loss': 0.34438552856445315, 'learning_rate': 3.09677175641628e-05, 'epoch': 2.2560000000000002, 'step': 70}
{'loss': 0.2952232360839844, 'learning_rate': 3.3179697390174426e-05, 'epoch': 2.416, 'step': 75}
{'loss': 0.30804977416992185, 'learning_rate': 3.5391677216186057e-05, 'epoch': 2.576, 'step': 80}
{'loss': 0.27484817504882814, 'learning_rate': 3.760365704219769e-05, 'epoch': 2.7359999999999998, 'step': 85}
{'loss': 0.260968017578125, 'learning_rate': 3.981563686820931e-05, 'epoch': 2.896, 'step': 90}




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=125.0, style=ProgressStyle(description_w…


{'eval_loss': 0.28190431597828863, 'eval_accuracy': 0.882, 'eval_f1': 0.8824701195219123, 'eval_precision': 0.8842315369261478, 'eval_recall': 0.8807157057654076, 'epoch': 2.992, 'step': 93}


[I 2020-09-10 21:58:26,813] Trial 5 finished with value: 3.5294173622134677 and parameters: {'learning_rate': 0.0004423959652023257, 'num_train_epochs': 3, 'seed': 9, 'per_device_train_batch_size': 16}. Best is trial 5 with value: 3.5294173622134677.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model fro

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=7.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6900380611419678, 'learning_rate': 7.570488625500433e-07, 'epoch': 0.32, 'step': 5}
{'loss': 0.6877788066864013, 'learning_rate': 1.5140977251000866e-06, 'epoch': 0.64, 'step': 10}
{'loss': 0.6858269691467285, 'learning_rate': 2.27114658765013e-06, 'epoch': 0.96, 'step': 15}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7679986953735352, 'learning_rate': 3.0281954502001732e-06, 'epoch': 1.32, 'step': 20}
{'loss': 0.6798545837402343, 'learning_rate': 3.7852443127502167e-06, 'epoch': 1.6400000000000001, 'step': 25}
{'loss': 0.6712570190429688, 'learning_rate': 4.54229317530026e-06, 'epoch': 1.96, 'step': 30}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.7440628051757813, 'learning_rate': 5.2993420378503035e-06, 'epoch': 2.32, 'step': 35}
{'loss': 0.642086410522461, 'learning_rate': 6.0563909004003465e-06, 'epoch': 2.64, 'step': 40}
{'loss': 0.6196769714355469, 'learning_rate': 6.8134397629503895e-06, 'epoch': 2.96, 'step': 45}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.6505050659179688, 'learning_rate': 7.570488625500433e-06, 'epoch': 3.32, 'step': 50}
{'loss': 0.5418525695800781, 'learning_rate': 8.327537488050476e-06, 'epoch': 3.64, 'step': 55}
{'loss': 0.4885398864746094, 'learning_rate': 9.08458635060052e-06, 'epoch': 3.96, 'step': 60}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=125.0, style=ProgressStyle(description_wi…

{'loss': 0.5073417663574219, 'learning_rate': 9.841635213150562e-06, 'epoch': 4.32, 'step': 65}


KeyboardInterrupt: 

In [15]:
stop

NameError: name 'stop' is not defined

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
ft_model = "finetuned/state_trolls"
trainer.save_model(ft_model)
tokenizer.save_pretrained(ft_model)
