In [12]:
from datasets import load_dataset

base_url = './data/'

dataset = load_dataset('csv', data_files={'train': base_url+'train.csv','validation': base_url+'validation.csv','test': base_url+'test.csv'})

Using custom data configuration default-7ea5f784734a84b2
Reusing dataset csv (C:\Users\user\.cache\huggingface\datasets\csv\default-7ea5f784734a84b2\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 76
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 26
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 26
    })
})

In [14]:
import torch

# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce GTX 1650
Memory Usage:
Allocated: 0.4 GB
Cached:    0.5 GB


In [5]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

dataset_encoded = dataset.map(tokenize_function, batched=True, batch_size=None)
dataset_encoded



Loading cached processed dataset at C:\Users\user\.cache\huggingface\datasets\csv\default-7ea5f784734a84b2\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-a15c6952d960ec60.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\user\.cache\huggingface\datasets\csv\default-7ea5f784734a84b2\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-ffe7fb0690828850.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 76
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26
    })
})

In [15]:
print(dataset_encoded['train'][0])

{'text': 'I sent my very catholic mother a place where one of my friends got tested for ADHD. She freaked out because some of the workers said they were LGBT friendly.', 'label': 'POSITIVE', 'input_ids': [101, 146, 1850, 1139, 1304, 5855, 14084, 1596, 1534, 170, 1282, 1187, 1141, 1104, 1139, 2053, 1400, 7289, 1111, 5844, 23527, 119, 1153, 25818, 1149, 1272, 1199, 1104, 1103, 3239, 1163, 1152, 1127, 12105, 4931, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [16]:
from transformers import AutoModelForSequenceClassification

num_labels = 2
model = (AutoModelForSequenceClassification
         .from_pretrained(checkpoint, num_labels=num_labels)
         .to(device))

In [17]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{checkpoint}-finetuned"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim='adamw_torch'
                                  )

In [18]:
from datasets import load_metric
import numpy as np
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
from transformers import Trainer

torch.cuda.empty_cache()

trainer = Trainer(model=model,
                  compute_metrics=compute_metrics,
                  args=training_args, 
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train()

  0%|          | 0/20 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [14]:
trainer.evaluate()

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.5983176231384277,
 'eval_accuracy': 0.7307692307692307,
 'eval_runtime': 0.6,
 'eval_samples_per_second': 43.333,
 'eval_steps_per_second': 6.667,
 'epoch': 2.0}

In [18]:
trainer.save_model()
model_name

'bert-base-cased-finetuned'

In [27]:
from transformers import pipeline
classifier = pipeline('text-classification', model=model_name)

In [30]:
import textwrap
wrapper = textwrap.TextWrapper(width=80, break_long_words=False, break_on_hyphens=False)

In [31]:
sentence = "If you're from the US or Europe, you are in no position to talk about other country's LGBT+ policies"

In [32]:
c = classifier(sentence)
print('\nSentence:')
print(wrapper.fill(sentence))
print(f"\nThis sentence is classified with a {c[0]['label']} sentiment")


Sentence:
If you're from the US or Europe, you are in no position to talk about other
country's LGBT+ policies

This sentence is classified with a LABEL_1 sentiment


In [26]:
classifier('help')

[{'label': 'LABEL_1', 'score': 0.5868823528289795}]