In [18]:
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [19]:
# Show the complete tweet text in DataFrame previews instead of truncating it.
pd.options.display.max_colwidth = None

# Load the cleaned tweets; the CSV is decoded as Latin-1.
df = pd.read_csv(
    "../Data/Twitter/cleaned_data.csv",
    encoding="latin",
)
df

Unnamed: 0,label,text
0,0,I miss you I wish it was the 18th already!
1,0,i think i'm confused and worried - need a strong coffee and a smoke! will venture back to twitterverse later - is it all my fault? OMG
2,0,"unfortunately no shoot today, my friend is busy"
3,0,is going to work. I dont feel very well today.
4,0,My phone was dying and it beeped at 4 am. Couldn't get back to sleep. This sucks
...,...,...
19995,1,It's the little things in life which makes it so enjoyable
19996,1,girlllllllllls just wanna have fun!
19997,1,I wish that was true we shall do it later
19998,1,- soo beautiful demi returns to Chile!! we love you


In [21]:
# Sanity-check the schema: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   20000 non-null  int64 
 1   text    20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [22]:
# Display the DataFrame; the recorded preview shows only the first and last rows.
df

Unnamed: 0,label,text
0,0,I miss you I wish it was the 18th already!
1,0,i think i'm confused and worried - need a strong coffee and a smoke! will venture back to twitterverse later - is it all my fault? OMG
2,0,"unfortunately no shoot today, my friend is busy"
3,0,is going to work. I dont feel very well today.
4,0,My phone was dying and it beeped at 4 am. Couldn't get back to sleep. This sucks
...,...,...
19995,1,It's the little things in life which makes it so enjoyable
19996,1,girlllllllllls just wanna have fun!
19997,1,I wish that was true we shall do it later
19998,1,- soo beautiful demi returns to Chile!! we love you


In [23]:
# Single source of truth for the pretrained checkpoint name.
MODEL = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of dataset rows.

    Args:
        examples: Mapping with a "text" entry holding the raw tweet strings
            (a list of strings when called through ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer output for the batch. When used with ``Dataset.map``
        this adds the ``input_ids`` and ``attention_mask`` columns.
    """
    # NOTE: padding="max_length" without an explicit max_length pads every
    # example to the tokenizer's model maximum, which is far longer than a
    # tweet. Kept as-is because a Trainer without a padding collator needs
    # uniform-length inputs; consider a smaller max_length to speed training.
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Wrap the DataFrame in a Hugging Face Dataset and peek at the first row.
dataset = Dataset.from_pandas(df)
dataset[0]


{'label': 0, 'text': ' I miss you  I wish it was the 18th already!'}

In [24]:
# Tokenize the whole dataset; batched=True hands tokenize_function batches of rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [25]:
# Cast the integer `label` column to a ClassLabel feature, which
# `stratify_by_column` requires.
tokenized_datasets = tokenized_datasets.class_encode_column('label')

# Stratified 50/50 split. The fixed seed makes the shuffle, and therefore every
# downstream subset and metric, reproducible across kernel restarts.
# NOTE: `tokenized_datasets` is rebound from a Dataset to a DatasetDict here,
# so re-run the tokenization cell before re-running this one.
# NOTE(review): the recorded output of this cell shows an 18000/2000 split,
# which does not match test_size=0.5 -- restart and run all to refresh it.
tokenized_datasets = tokenized_datasets.train_test_split(
    test_size=0.5,
    stratify_by_column='label',
    seed=42,
)
tokenized_datasets

Stringifying the column:   0%|          | 0/20000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/20000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 18000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [28]:
# Subset sizes: the first N_TRAIN rows of the train split are used for
# training, the next N_EVAL rows for per-epoch evaluation, and at most
# MAX_TEST rows of the held-out test split for the final evaluation.
N_TRAIN, N_EVAL, MAX_TEST = 5000, 2000, 10000

train_split = tokenized_datasets["train"]
test_split = tokenized_datasets["test"]

# Fail early with a clear message instead of an IndexError inside select().
assert len(train_split) >= N_TRAIN + N_EVAL, (
    f"train split has {len(train_split)} rows; need at least {N_TRAIN + N_EVAL}"
)

train_dataset = train_split.select(range(N_TRAIN))
eval_dataset = train_split.select(range(N_TRAIN, N_TRAIN + N_EVAL))
# Clamp to the actual split size so a smaller test split (e.g. after changing
# test_size in the split cell) does not raise.
test_dataset = test_split.select(range(min(MAX_TEST, len(test_split))))

In [31]:
# Seed Python, NumPy and torch RNGs *before* building the model: the
# classification head is newly initialised in from_pretrained, and the Trainer
# only applies its own seed later, so without this the runs are not repeatable.
SEED = 42
set_seed(SEED)

# MODEL is the checkpoint name defined alongside the tokenizer, so the model
# and tokenizer cannot drift apart.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by the Trainer.

    Returns:
        The result of ``metric.compute`` on the arg-max class predictions
        versus the reference labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# NOTE: recent transformers releases deprecate `evaluation_strategy` in favour
# of `eval_strategy`; rename the keyword when upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",  # evaluate on eval_dataset after every epoch
    learning_rate=1e-6,
    num_train_epochs=5,
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/3125 [00:00<?, ?it/s]

{'loss': 0.6908, 'learning_rate': 8.399999999999999e-07, 'epoch': 0.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6591541767120361, 'eval_accuracy': 0.763, 'eval_f1': 0.7683284457478006, 'eval_precision': 0.7751479289940828, 'eval_recall': 0.7616279069767442, 'eval_runtime': 22.7888, 'eval_samples_per_second': 87.762, 'eval_steps_per_second': 10.97, 'epoch': 1.0}
{'loss': 0.5742, 'learning_rate': 6.800000000000001e-07, 'epoch': 1.6}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.43135279417037964, 'eval_accuracy': 0.8085, 'eval_f1': 0.8147073052733431, 'eval_precision': 0.8135265700483092, 'eval_recall': 0.8158914728682171, 'eval_runtime': 22.2473, 'eval_samples_per_second': 89.899, 'eval_steps_per_second': 11.237, 'epoch': 2.0}
{'loss': 0.4587, 'learning_rate': 5.2e-07, 'epoch': 2.4}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.4265088140964508, 'eval_accuracy': 0.816, 'eval_f1': 0.8247619047619048, 'eval_precision': 0.8108614232209738, 'eval_recall': 0.8391472868217055, 'eval_runtime': 22.3136, 'eval_samples_per_second': 89.631, 'eval_steps_per_second': 11.204, 'epoch': 3.0}
{'loss': 0.4392, 'learning_rate': 3.6e-07, 'epoch': 3.2}
{'loss': 0.4321, 'learning_rate': 2e-07, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.4245370626449585, 'eval_accuracy': 0.8165, 'eval_f1': 0.8194786030496803, 'eval_precision': 0.8321678321678322, 'eval_recall': 0.8071705426356589, 'eval_runtime': 22.4482, 'eval_samples_per_second': 89.094, 'eval_steps_per_second': 11.137, 'epoch': 4.0}
{'loss': 0.4031, 'learning_rate': 4e-08, 'epoch': 4.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.42253461480140686, 'eval_accuracy': 0.82, 'eval_f1': 0.8264223722275795, 'eval_precision': 0.8224568138195777, 'eval_recall': 0.8304263565891473, 'eval_runtime': 22.6222, 'eval_samples_per_second': 88.409, 'eval_steps_per_second': 11.051, 'epoch': 5.0}
{'train_runtime': 950.614, 'train_samples_per_second': 26.299, 'train_steps_per_second': 3.287, 'train_loss': 0.4979331640625, 'epoch': 5.0}


TrainOutput(global_step=3125, training_loss=0.4979331640625, metrics={'train_runtime': 950.614, 'train_samples_per_second': 26.299, 'train_steps_per_second': 3.287, 'train_loss': 0.4979331640625, 'epoch': 5.0})

In [32]:
# Final evaluation of the fine-tuned model on the held-out test subset.
trainer.evaluate(test_dataset)

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.3960396349430084,
 'eval_accuracy': 0.836,
 'eval_f1': 0.8390578999018645,
 'eval_precision': 0.8236994219653179,
 'eval_recall': 0.855,
 'eval_runtime': 19.9562,
 'eval_samples_per_second': 100.219,
 'eval_steps_per_second': 12.527,
 'epoch': 5.0}