In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback

In [2]:
import sys
import gc
sys.path.insert(1, '/kaggle/input/cs433tweets/')
from kaggle_secrets import UserSecretsClient
import helpers as hlp
import wandb

# Authenticate with Weights & Biases. The API key is read from the "wandb"
# entry of the Kaggle secrets client and handed straight to the login call,
# so it is never written in the notebook or kept in a separate variable.
user_secrets = UserSecretsClient()
wandb.login(key=user_secrets.get_secret("wandb"))

# Select the torch device once, then report what was found.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


There are 2 GPU(s) available.
Device name: Tesla T4


In [3]:
 # Load data and set labels
def _read_tweets(path, label):
    """Read `path` with pd.read_table into a single 'tweet' column and tag every row with `label`."""
    frame = pd.read_table(path, header=None, names=['tweet'], dtype=str, on_bad_lines='skip')
    frame['label'] = label
    return frame


# Despite the "complaint" wording in the variable names, the two inputs are
# train_neg.txt (label 0) and train_pos.txt (label 2).
data_complaint = _read_tweets("/kaggle/input/cs433tweets/train_neg.txt", 0)
data_non_complaint = _read_tweets("/kaggle/input/cs433tweets/train_pos.txt", 2)

# Stack both classes into one frame and renumber the index from 0
data = pd.concat((data_complaint, data_non_complaint)).reset_index(drop=True)

In [4]:
# Run the preprocessing helper from the `helpers` module (imported as `hlp`)
# over the frame and rebind `data` to whatever it returns.
# NOTE(review): the helper's source is not part of this notebook; presumably it
# cleans/normalises the tweet text for the transformer — confirm in helpers.py.
# Re-running this cell applies the helper to already-processed data.
data = hlp.transformer_processing(data)

100%|██████████| 196970/196970 [00:01<00:00, 172728.23it/s]
100%|██████████| 196970/196970 [00:01<00:00, 158681.13it/s]
100%|██████████| 196970/196970 [00:01<00:00, 119124.25it/s]
100%|██████████| 196970/196970 [00:00<00:00, 197941.80it/s]


In [5]:
# Define pretrained tokenizer and model.
# Both are loaded from the same checkpoint name so the tokenizer's vocabulary
# matches the model being fine-tuned.
# NOTE(review): the training labels are 0 and 2 and get_max compares positions
# 0 and 2 of the model output, which suggests the classification head has at
# least three outputs — confirm the label order on the model card.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Pull the texts and labels out as plain Python lists, then drop the frame
# so its memory can be reclaimed before tokenization.
X = data.tweet.values.tolist()
y = data.label.values.tolist()
del data
gc.collect()

# Hold out 10% of the examples for validation; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=2022)
del X, y
gc.collect()

# Encode both splits with identical tokenizer settings (padding enabled,
# truncation at 300 tokens).
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=300)
X_train_tokenized = tokenizer(X_train, **tokenizer_kwargs)
X_val_tokenized = tokenizer(X_val, **tokenizer_kwargs)

# The raw strings are no longer needed once they are encoded.
del X_train, X_val
gc.collect()

0

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Parameters
    ----------
    encodings : mapping
        Maps feature names to per-example sequences. Must contain
        "input_ids", which defines the dataset length.
    labels : sequence or None, optional
        One label per example. May be a list, NumPy array, tensor or any
        other indexable with a length. ``None`` (or an empty sequence)
        means the dataset is unlabelled.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`, plus "labels" when labels are present."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = self.labels
        # Explicit None/length check instead of `if labels:` — truthiness of a
        # multi-element NumPy array, tensor or pandas Series raises an error.
        # An empty sequence is still treated as "no labels", as before.
        if labels is not None and len(labels) > 0:
            item["labels"] = torch.tensor(labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" entry."""
        return len(self.encodings["input_ids"])

In [8]:
# Wrap the tokenized splits and their label lists in the Dataset class defined
# above so that each index yields a dict of tensors (including "labels").
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [9]:
def get_max(pred):
    """Choose between class 0 and class 2 for one row of scores.

    Only positions 0 and 2 of `pred` are compared; position 1 is ignored.
    Returns 0 when ``pred[0]`` is strictly greater than ``pred[2]``,
    otherwise 2 (so a tie resolves to 2).
    """
    return 0 if pred[0] > pred[2] else 2

In [10]:
def compute_metrics(p):
    """Compute accuracy for the two-class (0 vs 2) evaluation.

    Parameters
    ----------
    p : pair ``(pred, labels)``
        ``pred`` is treated as a 2-D array of per-class scores with one row
        per example; ``labels`` holds the reference class ids.

    Returns
    -------
    dict
        ``{"accuracy": value}`` where ``value`` is the result of
        ``accuracy_score`` on the derived predictions.
    """
    pred, labels = p
    scores = np.asarray(pred)

    # Same decision rule as get_max, applied to all rows at once instead of
    # calling a Python function per row: predict 0 when column 0 is strictly
    # greater than column 2, otherwise 2. Column 1 is ignored.
    pred = np.where(scores[:, 0] > scores[:, 2], 0, 2)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy}

In [11]:
# Training configuration, grouped by concern.
args = TrainingArguments(
    output_dir="output",
    seed=2022,
    # Schedule and batch sizes (batch sizes are per device).
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation on the validation set every 500 steps.
    evaluation_strategy="steps",
    eval_steps=500,
    # Checkpointing is switched off and the best checkpoint is not restored.
    # NOTE(review): with save_strategy 'no', save_total_limit is presumably
    # inert — confirm against the TrainingArguments documentation.
    save_strategy='no',
    save_total_limit=2,
    load_best_model_at_end=False,
)

# Bind the model, datasets and metric function to the configuration above.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [12]:
# Fine-tune the pre-trained model with the Trainer configured above, then save
# the model as it stands when training finishes.
# NOTE(review): save_model() is called without a path; presumably it writes to
# the output_dir given to TrainingArguments ("output") — confirm.
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 158859
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 9932
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mjdidio[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss,Accuracy
500,0.3421,0.308733,0.870213
1000,0.3172,0.302887,0.871233
1500,0.3136,0.293987,0.873612
2000,0.3004,0.302404,0.870383
2500,0.2939,0.32518,0.877408
3000,0.2186,0.310009,0.879617
3500,0.225,0.315728,0.880127
4000,0.2258,0.294903,0.881203
4500,0.2243,0.30021,0.883639
5000,0.2186,0.341623,0.88245


***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch size = 64
***** Running Evaluation *****
  Num examples = 17652
  Batch si

In [13]:
# Close the Weights & Biases run that logged the training above; the run
# history and summary tables are printed as this cell's output.
wandb.finish()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▂▃▁▅▆▆▇█▇▆▆▇▆▇▆▅▆▆
eval/loss,▂▁▁▁▂▂▂▁▁▃▄▄▃▃█▆▆▆▇
eval/runtime,█▁▂▃▂▇▃▁▂▂▂▄▃▃█▂▂▂▂
eval/samples_per_second,▁█▇▆▇▂▆█▇▇▇▅▆▆▁▇▇▇▇
eval/steps_per_second,▁█▇▆▇▂▆█▇▇▇▅▆▆▁▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,██▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▁▁
train/loss,█▇▇▇▇▅▅▅▅▅▃▃▂▂▂▁▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.87945
eval/loss,0.44552
eval/runtime,173.1595
eval/samples_per_second,101.941
eval/steps_per_second,1.594
train/epoch,4.0
train/global_step,9932.0
train/learning_rate,0.0
train/loss,0.089
train/total_flos,9.79639088129352e+16
