In [1]:
from transformers import RobertaTokenizerFast, Trainer, TrainingArguments, RobertaForSequenceClassification
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report
import evaluate

In [2]:
# Locations of the three CSV splits, relative to this notebook.
TRAIN_DATA = '../Data/train.csv'
EVAL_DATA = '../Data/validate.csv'
TEST_DATA = '../Data/test.csv'

# Checkpoint name used for the tokenizer here and for the classifier later on.
MODEL = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL)

In [3]:
# Do not truncate long text cells when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# Every split is read the same way, so load them in one pass.
df_train, df_eval, df_test = (
    pd.read_csv(csv_path, encoding='latin')
    for csv_path in (TRAIN_DATA, EVAL_DATA, TEST_DATA)
)
df_train

Unnamed: 0,target,text
0,-1,my roomie called to inform me someone tried to break into our apartment when she was there today..awesome.
1,-1,i would of got the 16gb iphone but i didnt have the extra $100
2,-1,just stay home and boring day
3,1,been voting for the eu parliament and the heritage rules of the danish monarchy. then a spinning class. feel good about myself...
4,1,ooooh sbs2!! that's exciting and relevant to my media audiences research into psbs - look forward to checking it out
...,...,...
9995,1,my man has both sides..with me he's the sweetest... just dont f*ck with me cuz then theres troble.i'll take both sides please.
9996,-1,omg why is this weather so disgusting today???? looks like i'm going to have to pull out the rainboots
9997,-1,is watching the green mile... does not want john coffey to die
9998,1,am i one of the 1st 100 to tweet it? i'd really like to demo 2.0


In [4]:
def data_preprocess(df):
    """Return a copy of ``df`` ready for training.

    The ``target`` column is renamed to ``label`` and every ``-1`` in it is
    replaced by ``0``; all other values and columns are left as they are.

    The input frame is not modified. The previous implementation called
    ``replace(..., inplace=True)`` on ``df['target']``, which is a chained
    in-place update: it warns on recent pandas and has no effect at all once
    Copy-on-Write is enabled. Building a new frame avoids that and also makes
    the function safe to call again on its own output (a frame that already
    has ``label`` passes through unchanged).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``target`` column (or an already-renamed ``label``).

    Returns
    -------
    pandas.DataFrame
        New frame with a ``label`` column in place of ``target``.

    Raises
    ------
    KeyError
        If the frame has neither a ``target`` nor a ``label`` column.
    """
    return (
        df
        .rename(columns={'target': 'label'})
        # assign() keeps the column position and works on the renamed copy.
        .assign(label=lambda frame: frame['label'].replace({-1: 0}))
    )

# Run the same preprocessing over all three splits.
df_train, df_eval, df_test = map(data_preprocess, (df_train, df_eval, df_test))

df_train

Unnamed: 0,label,text
0,0,my roomie called to inform me someone tried to break into our apartment when she was there today..awesome.
1,0,i would of got the 16gb iphone but i didnt have the extra $100
2,0,just stay home and boring day
3,1,been voting for the eu parliament and the heritage rules of the danish monarchy. then a spinning class. feel good about myself...
4,1,ooooh sbs2!! that's exciting and relevant to my media audiences research into psbs - look forward to checking it out
...,...,...
9995,1,my man has both sides..with me he's the sweetest... just dont f*ck with me cuz then theres troble.i'll take both sides please.
9996,0,omg why is this weather so disgusting today???? looks like i'm going to have to pull out the rainboots
9997,0,is watching the green mile... does not want john coffey to die
9998,1,am i one of the 1st 100 to tweet it? i'd really like to demo 2.0


In [5]:
# Class balance of train, eval and test, printed one after another.
print(
    *(frame['label'].value_counts() for frame in (df_train, df_eval, df_test)),
    sep='\n',
)

0    5014
1    4986
Name: label, dtype: int64
0    1012
1     988
Name: label, dtype: int64
0    2521
1    2479
Name: label, dtype: int64


In [6]:
def tokenize_function(examples):
    """Tokenise the ``"text"`` field of a batch with the notebook's tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Wrap each pandas split in a `datasets.Dataset`.
dataset_train, dataset_eval, dataset_test = (
    Dataset.from_pandas(frame) for frame in (df_train, df_eval, df_test)
)
dataset_train[0]

{'label': 0,
 'text': 'my roomie called to inform me someone tried to break into our apartment when she was there today..awesome. '}

In [7]:
# Tokenise every split in batches.
train_dataset, eval_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_eval, dataset_test)
)
train_dataset

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

In [8]:
# Two-label sequence classifier loaded from the MODEL checkpoint.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
# Single metric object that reports accuracy, F1, precision and recall together.
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    arg-max over the last axis of the logits, and the result of
    ``metric.compute`` is returned as-is.
    """
    raw_scores, gold = eval_pred
    return metric.compute(
        references=gold,
        predictions=np.argmax(raw_scores, axis=-1),
    )



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [9]:
# Baseline run: learning_rate and per_device_train_batch_size are left at
# their defaults (0.00005 and 8); evaluation happens after every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()



  0%|          | 0/3750 [00:00<?, ?it/s]

{'loss': 0.5857, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 0.5268, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5374781489372253, 'eval_accuracy': 0.8095, 'eval_f1': 0.8212106992022525, 'eval_precision': 0.7655293088363955, 'eval_recall': 0.8856275303643725, 'eval_runtime': 20.4364, 'eval_samples_per_second': 97.864, 'eval_steps_per_second': 12.233, 'epoch': 1.0}
{'loss': 0.4431, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 0.4138, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.3957, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5077861547470093, 'eval_accuracy': 0.8225, 'eval_f1': 0.8059048660470202, 'eval_precision': 0.8763376932223543, 'eval_recall': 0.7459514170040485, 'eval_runtime': 20.392, 'eval_samples_per_second': 98.078, 'eval_steps_per_second': 12.26, 'epoch': 2.0}
{'loss': 0.3036, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.2939, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6526879668235779, 'eval_accuracy': 0.8275, 'eval_f1': 0.8244274809160305, 'eval_precision': 0.8290685772773797, 'eval_recall': 0.819838056680162, 'eval_runtime': 20.5543, 'eval_samples_per_second': 97.303, 'eval_steps_per_second': 12.163, 'epoch': 3.0}
{'train_runtime': 988.0309, 'train_samples_per_second': 30.363, 'train_steps_per_second': 3.795, 'train_loss': 0.4157192626953125, 'epoch': 3.0}


TrainOutput(global_step=3750, training_loss=0.4157192626953125, metrics={'train_runtime': 988.0309, 'train_samples_per_second': 30.363, 'train_steps_per_second': 3.795, 'train_loss': 0.4157192626953125, 'epoch': 3.0})

In [10]:
# Run the fine-tuned model over the tokenised test split.
predictions = trainer.predict(test_dataset=test_dataset)

  0%|          | 0/625 [00:00<?, ?it/s]

In [11]:
# Predicted class id per test example (arg-max over the logits).
preds = predictions.predictions.argmax(-1)
# Human-readable name for each predicted class id.
labels = pd.Series(preds).map({0: 'negative', 1: 'positive'})
# Confidence of the predicted class = largest softmax probability per row.
# The row maximum is subtracted before exponentiating so that large logits
# cannot overflow to inf (which would turn the score into NaN); the softmax
# value itself is unchanged by this shift.
_shifted = predictions.predictions - predictions.predictions.max(axis=-1, keepdims=True)
scores = (np.exp(_shifted) / np.exp(_shifted).sum(-1, keepdims=True)).max(1)

In [12]:
# One row per test example: predicted class id, its name, and its confidence.
df = pd.DataFrame(
    data=list(zip(preds, labels, scores)),
    columns=['pred', 'label', 'score'],
)
df

Unnamed: 0,pred,label,score
0,0,negative,0.980419
1,1,positive,0.990340
2,1,positive,0.990889
3,0,negative,0.974968
4,0,negative,0.993825
...,...,...,...
4995,1,positive,0.985679
4996,1,positive,0.977494
4997,1,positive,0.987779
4998,0,negative,0.990140


In [13]:
# Reference labels from the test split alongside the model's predictions.
y_true, y_pred = test_dataset['label'], preds

In [14]:
# Per-class precision / recall / F1 on the test split, to four decimals.
print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

              precision    recall  f1-score   support

    negative     0.8362    0.8564    0.8462      2521
    positive     0.8503    0.8294    0.8397      2479

    accuracy                         0.8430      5000
   macro avg     0.8432    0.8429    0.8429      5000
weighted avg     0.8432    0.8430    0.8430      5000



In [15]:
# Experiment: learning_rate = 1e-4, per-device train batch size = 8.
# NOTE: the output recorded with this cell shows the training loss staying
# near 0.69 and every test example being assigned to a single class.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/3750 [00:00<?, ?it/s]

{'loss': 0.7049, 'learning_rate': 8.666666666666667e-05, 'epoch': 0.4}
{'loss': 0.702, 'learning_rate': 7.333333333333333e-05, 'epoch': 0.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.7113222479820251, 'eval_accuracy': 0.494, 'eval_f1': 0.6613119143239625, 'eval_precision': 0.494, 'eval_recall': 1.0, 'eval_runtime': 20.376, 'eval_samples_per_second': 98.155, 'eval_steps_per_second': 12.269, 'epoch': 1.0}
{'loss': 0.6981, 'learning_rate': 6e-05, 'epoch': 1.2}
{'loss': 0.6968, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.6}
{'loss': 0.6972, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6961330771446228, 'eval_accuracy': 0.506, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 20.5924, 'eval_samples_per_second': 97.123, 'eval_steps_per_second': 12.14, 'epoch': 2.0}
{'loss': 0.6967, 'learning_rate': 2e-05, 'epoch': 2.4}
{'loss': 0.6953, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6931102275848389, 'eval_accuracy': 0.506, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 20.1896, 'eval_samples_per_second': 99.061, 'eval_steps_per_second': 12.383, 'epoch': 3.0}
{'train_runtime': 980.3228, 'train_samples_per_second': 30.602, 'train_steps_per_second': 3.825, 'train_loss': 0.6983877278645834, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.5042    1.0000    0.6704      2521
    positive     0.0000    0.0000    0.0000      2479

    accuracy                         0.5042      5000
   macro avg     0.2521    0.5000    0.3352      5000
weighted avg     0.2542    0.5042    0.3380      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Experiment: learning_rate = 1e-5, per-device train batch size = 8.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/3750 [00:00<?, ?it/s]

{'loss': 0.5159, 'learning_rate': 8.666666666666668e-06, 'epoch': 0.4}
{'loss': 0.4465, 'learning_rate': 7.333333333333333e-06, 'epoch': 0.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.44275006651878357, 'eval_accuracy': 0.829, 'eval_f1': 0.8388312912346844, 'eval_precision': 0.7848324514991182, 'eval_recall': 0.9008097165991903, 'eval_runtime': 19.9, 'eval_samples_per_second': 100.503, 'eval_steps_per_second': 12.563, 'epoch': 1.0}
{'loss': 0.3736, 'learning_rate': 6e-06, 'epoch': 1.2}
{'loss': 0.3674, 'learning_rate': 4.666666666666667e-06, 'epoch': 1.6}
{'loss': 0.3592, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.44951704144477844, 'eval_accuracy': 0.8495, 'eval_f1': 0.8413284132841329, 'eval_precision': 0.8778877887788779, 'eval_recall': 0.8076923076923077, 'eval_runtime': 19.96, 'eval_samples_per_second': 100.2, 'eval_steps_per_second': 12.525, 'epoch': 2.0}
{'loss': 0.289, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.4}
{'loss': 0.3173, 'learning_rate': 6.666666666666667e-07, 'epoch': 2.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6046129465103149, 'eval_accuracy': 0.8505, 'eval_f1': 0.8496732026143792, 'eval_precision': 0.8441558441558441, 'eval_recall': 0.8552631578947368, 'eval_runtime': 20.2137, 'eval_samples_per_second': 98.943, 'eval_steps_per_second': 12.368, 'epoch': 3.0}
{'train_runtime': 966.5447, 'train_samples_per_second': 31.038, 'train_steps_per_second': 3.88, 'train_loss': 0.3779412373860677, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.8643    0.8592    0.8617      2521
    positive     0.8577    0.8628    0.8602      2479

    accuracy                         0.8610      5000
   macro avg     0.8610    0.8610    0.8610      5000
weighted avg     0.8610    0.8610    0.8610      5000



In [17]:
# Experiment: learning_rate = 1e-4, per-device train batch size = 6.
# NOTE: the output recorded with this cell shows the training loss staying
# near 0.69 and every test example being assigned to a single class.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=6,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/5001 [00:00<?, ?it/s]

{'loss': 0.7154, 'learning_rate': 9.000199960007999e-05, 'epoch': 0.3}
{'loss': 0.7077, 'learning_rate': 8.000399920015998e-05, 'epoch': 0.6}
{'loss': 0.703, 'learning_rate': 7.000599880023996e-05, 'epoch': 0.9}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.7070051431655884, 'eval_accuracy': 0.494, 'eval_f1': 0.6613119143239625, 'eval_precision': 0.494, 'eval_recall': 1.0, 'eval_runtime': 19.8872, 'eval_samples_per_second': 100.567, 'eval_steps_per_second': 12.571, 'epoch': 1.0}
{'loss': 0.7003, 'learning_rate': 6.000799840031994e-05, 'epoch': 1.2}
{'loss': 0.6984, 'learning_rate': 5.000999800039993e-05, 'epoch': 1.5}
{'loss': 0.6964, 'learning_rate': 4.0011997600479906e-05, 'epoch': 1.8}


  0%|          | 0/250 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6999432444572449, 'eval_accuracy': 0.506, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 20.1018, 'eval_samples_per_second': 99.493, 'eval_steps_per_second': 12.437, 'epoch': 2.0}
{'loss': 0.699, 'learning_rate': 3.001399720055989e-05, 'epoch': 2.1}
{'loss': 0.6975, 'learning_rate': 2.001599680063987e-05, 'epoch': 2.4}
{'loss': 0.6958, 'learning_rate': 1.0017996400719856e-05, 'epoch': 2.7}
{'loss': 0.6934, 'learning_rate': 1.9996000799840033e-08, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6931371092796326, 'eval_accuracy': 0.506, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 19.8476, 'eval_samples_per_second': 100.768, 'eval_steps_per_second': 12.596, 'epoch': 3.0}
{'train_runtime': 1012.4347, 'train_samples_per_second': 29.632, 'train_steps_per_second': 4.94, 'train_loss': 0.7007007583025288, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.5042    1.0000    0.6704      2521
    positive     0.0000    0.0000    0.0000      2479

    accuracy                         0.5042      5000
   macro avg     0.2521    0.5000    0.3352      5000
weighted avg     0.2542    0.5042    0.3380      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Experiment: learning_rate = 5e-5, per-device train batch size = 6.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=6,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/5001 [00:00<?, ?it/s]

{'loss': 0.6155, 'learning_rate': 4.5000999800039995e-05, 'epoch': 0.3}
{'loss': 0.5577, 'learning_rate': 4.000199960007999e-05, 'epoch': 0.6}
{'loss': 0.5314, 'learning_rate': 3.500299940011998e-05, 'epoch': 0.9}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5472477078437805, 'eval_accuracy': 0.805, 'eval_f1': 0.8076923076923076, 'eval_precision': 0.7875, 'eval_recall': 0.8289473684210527, 'eval_runtime': 19.8557, 'eval_samples_per_second': 100.727, 'eval_steps_per_second': 12.591, 'epoch': 1.0}
{'loss': 0.4773, 'learning_rate': 3.000399920015997e-05, 'epoch': 1.2}
{'loss': 0.4925, 'learning_rate': 2.5004999000199963e-05, 'epoch': 1.5}
{'loss': 0.4787, 'learning_rate': 2.0005998800239953e-05, 'epoch': 1.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.568938136100769, 'eval_accuracy': 0.811, 'eval_f1': 0.7936681222707425, 'eval_precision': 0.8613744075829384, 'eval_recall': 0.7358299595141701, 'eval_runtime': 19.9863, 'eval_samples_per_second': 100.068, 'eval_steps_per_second': 12.509, 'epoch': 2.0}
{'loss': 0.4372, 'learning_rate': 1.5006998600279946e-05, 'epoch': 2.1}
{'loss': 0.3639, 'learning_rate': 1.0007998400319935e-05, 'epoch': 2.4}
{'loss': 0.3443, 'learning_rate': 5.008998200359928e-06, 'epoch': 2.7}
{'loss': 0.3547, 'learning_rate': 9.998000399920016e-09, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6158543229103088, 'eval_accuracy': 0.828, 'eval_f1': 0.8274824473420261, 'eval_precision': 0.820079522862823, 'eval_recall': 0.8350202429149798, 'eval_runtime': 19.9636, 'eval_samples_per_second': 100.182, 'eval_steps_per_second': 12.523, 'epoch': 3.0}
{'train_runtime': 1006.3097, 'train_samples_per_second': 29.812, 'train_steps_per_second': 4.97, 'train_loss': 0.46523013405302205, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.8371    0.8136    0.8252      2521
    positive     0.8157    0.8390    0.8272      2479

    accuracy                         0.8262      5000
   macro avg     0.8264    0.8263    0.8262      5000
weighted avg     0.8265    0.8262    0.8262      5000



In [19]:
# Experiment: learning_rate = 1e-5, per-device train batch size = 6.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=6,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/5001 [00:00<?, ?it/s]

{'loss': 0.5459, 'learning_rate': 9.000199960007999e-06, 'epoch': 0.3}
{'loss': 0.4827, 'learning_rate': 8.000399920015997e-06, 'epoch': 0.6}
{'loss': 0.4866, 'learning_rate': 7.000599880023996e-06, 'epoch': 0.9}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5203272700309753, 'eval_accuracy': 0.8325, 'eval_f1': 0.8363458720078164, 'eval_precision': 0.8083097261567517, 'eval_recall': 0.8663967611336032, 'eval_runtime': 20.0675, 'eval_samples_per_second': 99.663, 'eval_steps_per_second': 12.458, 'epoch': 1.0}
{'loss': 0.4255, 'learning_rate': 6.000799840031995e-06, 'epoch': 1.2}
{'loss': 0.4524, 'learning_rate': 5.000999800039993e-06, 'epoch': 1.5}
{'loss': 0.4333, 'learning_rate': 4.001199760047991e-06, 'epoch': 1.8}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6014479398727417, 'eval_accuracy': 0.8425, 'eval_f1': 0.8368720870015536, 'eval_precision': 0.8568398727465536, 'eval_recall': 0.8178137651821862, 'eval_runtime': 19.8515, 'eval_samples_per_second': 100.748, 'eval_steps_per_second': 12.594, 'epoch': 2.0}
{'loss': 0.3889, 'learning_rate': 3.0013997200559893e-06, 'epoch': 2.1}
{'loss': 0.3546, 'learning_rate': 2.001599680063987e-06, 'epoch': 2.4}
{'loss': 0.3699, 'learning_rate': 1.0017996400719856e-06, 'epoch': 2.7}
{'loss': 0.3523, 'learning_rate': 1.999600079984003e-09, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6682096123695374, 'eval_accuracy': 0.848, 'eval_f1': 0.8476953907815633, 'eval_precision': 0.8392857142857143, 'eval_recall': 0.8562753036437247, 'eval_runtime': 19.8547, 'eval_samples_per_second': 100.732, 'eval_steps_per_second': 12.591, 'epoch': 3.0}
{'train_runtime': 1011.6386, 'train_samples_per_second': 29.655, 'train_steps_per_second': 4.943, 'train_loss': 0.4291056510650621, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.8640    0.8544    0.8592      2521
    positive     0.8536    0.8633    0.8584      2479

    accuracy                         0.8588      5000
   macro avg     0.8588    0.8588    0.8588      5000
weighted avg     0.8589    0.8588    0.8588      5000



In [20]:
# Experiment: learning_rate = 1e-4, per-device train batch size = 10.
# NOTE: the output recorded with this cell shows the training loss staying
# near 0.69 and every test example being assigned to a single class.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=10,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 0.7063, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.5}
{'loss': 0.6994, 'learning_rate': 6.666666666666667e-05, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.7308576107025146, 'eval_accuracy': 0.494, 'eval_f1': 0.6613119143239625, 'eval_precision': 0.494, 'eval_recall': 1.0, 'eval_runtime': 32.9701, 'eval_samples_per_second': 60.661, 'eval_steps_per_second': 7.583, 'epoch': 1.0}
{'loss': 0.6973, 'learning_rate': 5e-05, 'epoch': 1.5}
{'loss': 0.6954, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6999887824058533, 'eval_accuracy': 0.506, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 24.4965, 'eval_samples_per_second': 81.644, 'eval_steps_per_second': 10.206, 'epoch': 2.0}
{'loss': 0.6967, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.5}
{'loss': 0.6947, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6935511231422424, 'eval_accuracy': 0.494, 'eval_f1': 0.6613119143239625, 'eval_precision': 0.494, 'eval_recall': 1.0, 'eval_runtime': 24.5111, 'eval_samples_per_second': 81.596, 'eval_steps_per_second': 10.199, 'epoch': 3.0}
{'train_runtime': 1302.4506, 'train_samples_per_second': 23.034, 'train_steps_per_second': 2.303, 'train_loss': 0.6983077392578125, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.0000    0.0000    0.0000      2521
    positive     0.4958    1.0000    0.6629      2479

    accuracy                         0.4958      5000
   macro avg     0.2479    0.5000    0.3315      5000
weighted avg     0.2458    0.4958    0.3287      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Experiment: learning_rate = 5e-5, per-device train batch size = 10.
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=2)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=10,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Evaluate the freshly trained model on the test split.
predictions = trainer.predict(test_dataset)
y_true = test_dataset['label']
y_pred = predictions.predictions.argmax(-1)

print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['negative', 'positive'],
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 0.5648, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}
{'loss': 0.5036, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6612009406089783, 'eval_accuracy': 0.778, 'eval_f1': 0.7881679389312978, 'eval_precision': 0.7454873646209387, 'eval_recall': 0.8360323886639676, 'eval_runtime': 32.0559, 'eval_samples_per_second': 62.391, 'eval_steps_per_second': 7.799, 'epoch': 1.0}
{'loss': 0.4193, 'learning_rate': 2.5e-05, 'epoch': 1.5}
{'loss': 0.384, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.447526216506958, 'eval_accuracy': 0.8285, 'eval_f1': 0.8191881918819188, 'eval_precision': 0.8547854785478548, 'eval_recall': 0.7864372469635628, 'eval_runtime': 32.2024, 'eval_samples_per_second': 62.107, 'eval_steps_per_second': 7.763, 'epoch': 2.0}
{'loss': 0.2817, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 0.2883, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5473266839981079, 'eval_accuracy': 0.8335, 'eval_f1': 0.8340807174887892, 'eval_precision': 0.8213935230618253, 'eval_recall': 0.847165991902834, 'eval_runtime': 34.5493, 'eval_samples_per_second': 57.888, 'eval_steps_per_second': 7.236, 'epoch': 3.0}
{'train_runtime': 1501.4088, 'train_samples_per_second': 19.981, 'train_steps_per_second': 1.998, 'train_loss': 0.4069336853027344, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.8533    0.8282    0.8406      2521
    positive     0.8304    0.8552    0.8426      2479

    accuracy                         0.8416      5000
   macro avg     0.8418    0.8417    0.8416      5000
weighted avg     0.8419    0.8416    0.8416      5000



In [22]:
# Experiment: fine-tune a freshly loaded MODEL checkpoint with learning rate
# 1e-5 and a per-device training batch size of 10, then score it on the test
# split.
# NOTE(review): this cell mirrors the other experiment cells with only the
# hyperparameters changed; a shared helper taking (learning_rate, batch_size)
# would remove the duplication.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,  # two output classes
)
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  # evaluate on eval_dataset after every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

# Convert the raw per-class scores for the test split into hard predictions
# by taking the highest-scoring class along the last axis.
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=-1)
y_true = test_dataset['label']

# target_names are matched to the sorted label values: 0 -> 'negative',
# 1 -> 'positive'.
print(
    classification_report(
        y_true,
        y_pred,
        target_names=['negative', 'positive'],
        digits=4,
    )
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 0.5219, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.5}
{'loss': 0.4067, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.4011552035808563, 'eval_accuracy': 0.8365, 'eval_f1': 0.8380386329866271, 'eval_precision': 0.8205625606207565, 'eval_recall': 0.8562753036437247, 'eval_runtime': 39.3018, 'eval_samples_per_second': 50.888, 'eval_steps_per_second': 6.361, 'epoch': 1.0}
{'loss': 0.3413, 'learning_rate': 5e-06, 'epoch': 1.5}
{'loss': 0.3321, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.4830947816371918, 'eval_accuracy': 0.8475, 'eval_f1': 0.8447837150127226, 'eval_precision': 0.849539406345957, 'eval_recall': 0.840080971659919, 'eval_runtime': 37.4334, 'eval_samples_per_second': 53.428, 'eval_steps_per_second': 6.679, 'epoch': 2.0}
{'loss': 0.2859, 'learning_rate': 1.6666666666666667e-06, 'epoch': 2.5}
{'loss': 0.3218, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5714234113693237, 'eval_accuracy': 0.8515, 'eval_f1': 0.8517224163754369, 'eval_precision': 0.8403940886699507, 'eval_recall': 0.8633603238866396, 'eval_runtime': 38.7315, 'eval_samples_per_second': 51.637, 'eval_steps_per_second': 6.455, 'epoch': 3.0}
{'train_runtime': 1596.4043, 'train_samples_per_second': 18.792, 'train_steps_per_second': 1.879, 'train_loss': 0.3682825419108073, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    negative     0.8648    0.8473    0.8559      2521
    positive     0.8478    0.8653    0.8565      2479

    accuracy                         0.8562      5000
   macro avg     0.8563    0.8563    0.8562      5000
weighted avg     0.8564    0.8562    0.8562      5000



In [24]:
# Summary of the hyperparameter sweep: one [learning rate, batch size, F1] row
# per configuration, grouped by batch size with the learning rates in
# descending order inside each group.
# NOTE(review): the scores are entered by hand. The batch-size-10 values match
# the macro-avg f1-score of the classification reports printed above; the
# batch-size-6 and -8 values presumably come from earlier runs -- verify.
f1_results = [
    [learning_rate, batch_size, f1]
    for batch_size, f1_per_learning_rate in (
        # batch size, F1 at lr = 1e-4, 5e-5, 1e-5
        (6, (0.3352, 0.8262, 0.8588)),
        (8, (0.3352, 0.8429, 0.861)),
        (10, (0.3315, 0.8416, 0.8562)),
    )
    for learning_rate, f1 in zip((0.0001, 0.00005, 0.00001), f1_per_learning_rate)
]
df_results = pd.DataFrame(
    f1_results,
    columns=['Learning Rate', 'Batch Size', 'F1 Score'],
)
df_results

Unnamed: 0,Learning Rate,Batch Size,F1 Score
0,0.0001,6,0.3352
1,5e-05,6,0.8262
2,1e-05,6,0.8588
3,0.0001,8,0.3352
4,5e-05,8,0.8429
5,1e-05,8,0.861
6,0.0001,10,0.3315
7,5e-05,10,0.8416
8,1e-05,10,0.8562
