<a href="https://colab.research.google.com/github/codistro/Articles/blob/main/covid_tweet_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets

In [None]:
!cp /content/drive/MyDrive/kaggle.json kaggle.json
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!pip install -q kaggle
!chmod 600 ~/.kaggle/kaggle.json
!pip install --upgrade --force-reinstall --no-deps kaggle
!kaggle datasets download -d datatattle/covid-19-nlp-text-classification
!unzip -q covid-19-nlp-text-classification.zip

In [None]:
from transformers import AutoTokenizer
# Load the pretrained tokenizer published under the name 'bert-base-cased'.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Quick check: tokenize a sample sentence and show the result as the cell output.
tokenizer("Attention is all you need")

In [None]:

from datasets import load_dataset

# Map each split name to the CSV file extracted from the Kaggle archive.
csv_splits = {
    'train': 'Corona_NLP_train.csv',
    'test': 'Corona_NLP_test.csv',
}

# The files are read with an explicit ISO-8859-1 (Latin-1) encoding.
dataset = load_dataset('csv', data_files=csv_splits, encoding="ISO-8859-1")


In [None]:




# Integer class id for each sentiment string. Changing an id invalidates any
# model that was trained with this mapping.
_SENTIMENT_TO_ID = {
    'Positive': 0,
    'Negative': 1,
    'Neutral': 2,
    'Extremely Positive': 3,
    'Extremely Negative': 4,
}


def transform_labels(label):
    """Convert a row's ``Sentiment`` string into an integer class id.

    Args:
        label: A mapping for a single example that contains a ``'Sentiment'``
            key (the form ``dataset.map`` passes when ``batched`` is not set).

    Returns:
        ``{'labels': <int>}`` holding the class id of the row's sentiment.

    Raises:
        ValueError: If the sentiment is not one of the five known classes.
            Falling back to class 0 ('Positive') for such rows would silently
            corrupt the training labels, so the problem is reported instead.
    """
    sentiment = label['Sentiment']
    try:
        return {'labels': _SENTIMENT_TO_ID[sentiment]}
    except (KeyError, TypeError):
        # KeyError: unknown string / None; TypeError: unhashable value.
        raise ValueError(f"Unknown sentiment label: {sentiment!r}") from None

def tokenize_data(example):
    """Tokenize the ``OriginalTweet`` text of a batch of examples.

    Args:
        example: A batch from ``dataset.map(..., batched=True)`` whose
            ``'OriginalTweet'`` entry holds the tweet texts.

    Returns:
        The tokenizer output for the batch, padded to the tokenizer's maximum
        length. ``truncation=True`` cuts texts that exceed that maximum;
        without it, over-long inputs would stay longer than the padded rows
        and could not be batched or fed to the model.
    """
    return tokenizer(example['OriginalTweet'], padding='max_length', truncation=True)

# Raw CSV columns that are dropped once the labels have been encoded.
remove_columns = [
    'UserName',
    'ScreenName',
    'Location',
    'TweetAt',
    'OriginalTweet',
    'Sentiment',
]

# Tokenize in batches first, then encode the labels row by row and drop the
# raw columns in the same pass.
dataset = (
    dataset
    .map(tokenize_data, batched=True)
    .map(transform_labels, remove_columns=remove_columns)
)

from transformers import AutoModelForSequenceClassification
import torch

# Location of this notebook's fine-tuned weights (also the target when saving).
model_path = '/content/drive/MyDrive/weights/covid_tweet_classification.pt'

# Pretrained BERT encoder with a 5-way classification head, one output per
# sentiment class.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Resume from this notebook's own checkpoint when one exists. The weights are
# read from `model_path` (not from another project's file), and only a missing
# file is tolerated; any other failure, such as a corrupt or incompatible
# checkpoint, is raised instead of being silently ignored.
try:
    # Load onto the CPU so the checkpoint can be read on any runtime.
    state_dict = torch.load(model_path, map_location='cpu')
except FileNotFoundError:
    print(f"No checkpoint found at {model_path}; starting from the pretrained weights.")
else:
    model.load_state_dict(state_dict)
    print(f"Loaded fine-tuned weights from {model_path}.")

from transformers import Trainer, TrainingArguments

# Splits produced by the preprocessing steps above.
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Output goes to ./test_trainer; apart from the epoch count, every setting
# keeps its library default.
training_args = TrainingArguments("test_trainer", num_train_epochs=3)

# Trainer used for fine-tuning (no metric function is attached at this point).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [6]:
# Fine-tune the model on the training split for the configured number of epochs.
trainer.train()

***** Running training *****
  Num examples = 41157
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15435


Step,Training Loss
500,1.3932
1000,1.0804
1500,0.9575
2000,0.8856
2500,0.8201
3000,0.7628
3500,0.7256
4000,0.6931
4500,0.6739
5000,0.6392


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=15435, training_loss=0.5447625326113984, metrics={'train_runtime': 7239.5877, 'train_samples_per_second': 17.055, 'train_steps_per_second': 2.132, 'total_flos': 3.2487460168172544e+16, 'train_loss': 0.5447625326113984, 'epoch': 3.0})

In [7]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy for ``Trainer.evaluate``.

    Accuracy is calculated directly with NumPy rather than through
    ``datasets.load_metric``: that helper was deprecated and later removed
    from the ``datasets`` package, and it needed a download at run time, so
    an unpinned install would fail here only after training had finished.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the true class ids.

    Returns:
        ``{'accuracy': <float>}``, the fraction of predictions that equal the
        labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the highest score.
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# Build a second Trainer around the same `model` object, this time passing
# `compute_metrics` so that evaluation reports accuracy in addition to the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
# Evaluate on `eval_dataset`; the returned metrics dict is the cell output.
trainer.evaluate()

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8


{'eval_accuracy': 0.8359662980516062,
 'eval_loss': 0.6989739537239075,
 'eval_runtime': 75.3905,
 'eval_samples_per_second': 50.378,
 'eval_steps_per_second': 6.301}

In [8]:
# Save only the model's state dict (its weights) to `model_path`.
torch.save(model.state_dict(),model_path)

In [10]:
# Display the processed DatasetDict to check its features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids', 'labels'],
        num_rows: 41157
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids', 'labels'],
        num_rows: 3798
    })
})