In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")["train"].train_test_split(test_size=0.05, train_size=0.1)

Found cached dataset imdb (/Users/chris-rotondo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
dataset["test"].to_pandas()

Unnamed: 0,text,label
0,"First off...with names like Fred Olen Ray, Bri...",0
1,John Huston's Wise Blood was a more horrifying...,0
2,The only reason i am bothering to comment on t...,0
3,".......Playing Kaddiddlehopper, Col San Fernan...",1
4,Well let me go say this because i love history...,1
...,...,...
1245,"This ""film"" is the culmination of everything t...",0
1246,"It's official, folks -- Hou Hsiao-Hsien doesn'...",0
1247,Astaire and Rogers at the height of their popu...,1
1248,Love is overwhelming... In all it's manifestat...,1


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(data):
    return tokenizer(data["text"], padding=True, truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [4]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [5]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    evaluation_strategy="steps", 
    eval_steps=51,             
    logging_dir="./imdb_logs",
    logging_steps=1,          
    save_steps=50,
    output_dir="./imdb_results",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
trainer.train(resume_from_checkpoint=True) 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=625, training_loss=0.05222286228984594, metrics={'train_runtime': 2312.5605, 'train_samples_per_second': 1.081, 'train_steps_per_second': 0.27, 'total_flos': 331168496640000.0, 'train_loss': 0.05222286228984594, 'epoch': 1.0})

In [7]:
model.save_pretrained("./imdb-sentiment")
tokenizer.save_pretrained("./imdb-sentiment_pos_neg")

('./sentiment_pos_neg/tokenizer_config.json',
 './sentiment_pos_neg/special_tokens_map.json',
 './sentiment_pos_neg/vocab.txt',
 './sentiment_pos_neg/added_tokens.json',
 './sentiment_pos_neg/tokenizer.json')