In [None]:
!pip install transformers datasets

In [None]:
# Download the airline tweets CSV; -nc (no-clobber) skips the download when
# the file is already present in the working directory.
!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sn 
import matplotlib.pyplot as plt 

import torch 

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw tweets CSV into a DataFrame (kept as df_ so the raw data stays available).
df_ = pd.read_csv("AirlineTweets.csv")

In [None]:
# Preview the first rows of the raw frame.
df_.head()

In [None]:
# Keep only the sentiment label and the tweet text; work on a copy so that
# adding columns later does not touch the raw frame.
df = df_.loc[:, ['airline_sentiment', 'text']].copy()

In [None]:
# Preview the two-column working frame.
df.head()

In [None]:
# Plot how many tweets fall into each sentiment class.
df['airline_sentiment'].hist()
# Observation: the dataset is imbalanced across the sentiment classes.

In [None]:
# Integer class id for each sentiment name; reused later for the model's
# label2id / id2label mappings.
target_map = dict(positive=1, negative=0, neutral=2)

# Encode the string sentiment as its integer id in a new 'target' column.
df['target'] = df.airline_sentiment.map(target_map)

In [None]:
# The downstream dataset expects the text column to be called 'sentence' and
# the target column to be called 'label'. Rename by name (rather than
# assigning a positional list to .columns) so a change in column order cannot
# silently swap the two.
df2 = df[['text', 'target']].rename(columns={'text': 'sentence', 'target': 'label'})

# Write without the index column; index=False is the documented way to do so.
df2.to_csv('data.csv', index=False)

In [None]:
# Show the first lines of the exported CSV to check the header and rows.
!head data.csv

In [None]:
from datasets import load_dataset
# Load data.csv with the 'csv' loader; the rows are accessed under the
# 'train' key in the cells below.
raw_dataset = load_dataset('csv', data_files = 'data.csv')

In [None]:
# Display the loaded dataset object.
raw_dataset

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
split = raw_dataset['train'].train_test_split(test_size = 0.3, seed = 42) 

In [None]:
# Display the resulting train/test split.
split

In [None]:
# If you have multiple CSV files (paths can also be URLs):
# load_dataset('csv', data_files = ['file1.csv', 'file2.csv'])

In [None]:
# If you already have a train/test split:
# load_dataset("csv", data_files = {'train': ['train1.csv', 'train2.csv'], 'test': 'test.csv'})

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same value is reused for the
# tokenizer, the config and the model.
checkpoint = 'distilbert-base-cased'

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_fn(batch):
    """Tokenize the 'sentence' entries of a batch with truncation enabled.

    No padding is applied here; per the original note, padding is handled
    automatically by the trainer.

    Args:
        batch: mapping that exposes the texts under the 'sentence' key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
# Apply tokenize_fn to both splits; batched=True passes batches of rows to the
# function instead of one row at a time.
tokenized_datasets = split.map(tokenize_fn, batched = True)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig

In [None]:
# Load the checkpoint's configuration so the label mappings can be customised
# before the model is instantiated.
config = AutoConfig.from_pretrained(checkpoint) 

In [None]:
# Display the configuration as loaded from the checkpoint.
config

In [None]:
# Inspect the id -> label mapping before it is overridden.
config.id2label

In [None]:
# Inspect the label -> id mapping before it is overridden.
config.label2id

In [None]:
# Replace the config's label mappings with the sentiment classes:
# id2label is the inverse of target_map, label2id is target_map itself.
config.id2label = {class_id: name for name, class_id in target_map.items()}
config.label2id = target_map

In [None]:
# Instantiate the sequence-classification model from the pretrained checkpoint,
# using the config that carries the sentiment label mappings.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config= config) 

In [None]:
!pip install torchinfo

In [None]:
from torchinfo import summary

In [None]:
# Show torchinfo's summary of the model.
summary(model)
# number of params = 66M

In [None]:
import inspect

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy`. Pick whichever keyword the installed TrainingArguments
# accepts so this cell runs on both old and new versions.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='training_dir',          # checkpoints are written here
    save_strategy='epoch',              # save a checkpoint after every epoch
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    **{_eval_strategy_key: 'epoch'},    # evaluate after every epoch
)

In [None]:
def compute_metrics(logits_and_labels):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        logits_and_labels: a pair ``(logits, labels)``; the predicted class
            is the argmax of the logits over the last axis.

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    raw_scores, gold = logits_and_labels
    predicted = np.argmax(raw_scores, axis=-1)
    is_correct = predicted == gold
    metrics = {
        'accuracy': np.mean(is_correct),
        'f1': f1_score(gold, predicted, average='macro'),
    }
    return metrics

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` argument. Pick whichever keyword the installed
# Trainer accepts so this cell runs on both old and new versions.
_tokenizer_key = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,    # accuracy + macro F1 at each evaluation
    **{_tokenizer_key: tokenizer},      # the tokenize step leaves padding to the trainer
)

In [None]:
# Fine-tune the model with the settings from training_args.
# Author's observation: the validation loss shows overfitting.
trainer.train()

In [None]:
# List the checkpoints written to the output directory.
# Author's observation: 3 checkpoints are saved and the 2nd one has the
# lowest validation loss.
!ls training_dir

In [None]:
from transformers import pipeline

In [None]:
# Load the checkpoint saved after the second epoch as a text-classification
# pipeline. The step number in the directory name is specific to this run;
# adjust it to match the `!ls training_dir` output if the data or batch size
# changes.
# Run on the first GPU when CUDA is available, otherwise fall back to the CPU
# (device=-1) so the cell also works on CPU-only machines.
savedmodel = pipeline(
    'text-classification',
    model='training_dir/checkpoint-1282',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Smoke test: take the first test row's sentence, echo it, and classify it
# with the reloaded pipeline.
s = split['test'][0]['sentence']
print(s)
savedmodel(s)