<a href="https://colab.research.google.com/github/gupta24789/hugging-face/blob/main/04_sentiment_analysis_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Export both process-level settings in one call, before the ML libraries
# are imported in the next cell:
#   - CUDA_VISIBLE_DEVICES="1"    restricts CUDA to the device with index 1
#   - TOKENIZERS_PARALLELISM="0"  turns off the tokenizers library's parallelism
# NOTE(review): index 1 presumably targets a multi-GPU workstation; a host
# with a single GPU (index 0 only) would see no CUDA device - confirm.
os.environ.update({'CUDA_VISIBLE_DEVICES': "1", 'TOKENIZERS_PARALLELISM': "0"})

In [None]:
import evaluate
import numpy as np
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import load_dataset, Features, ClassLabel, Value
from transformers import AutoTokenizer, TrainingArguments,Trainer, AutoModelForSequenceClassification
from transformers.trainer_utils import get_last_checkpoint

# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader

## Load Dataset

In [None]:
## First pass: load the train/test CSV files without an explicit schema,
## so the column types that get inferred can be inspected below.
dataset = load_dataset("sg247/binary-classification", data_files= {"train": "train.csv", "test":"test.csv"})
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8004
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
## The label column must be of type ClassLabel - inspect the schema that was
## inferred on load to see what type it currently has.
dataset['train'].features

{'tweet': Value(dtype='string', id=None),
 'label': Value(dtype='float64', id=None)}

In [None]:
## The label must be a ClassLabel: reload both splits with an explicit schema
## (string tweet, two-class label whose class names are 0 and 1).
features = Features(
    {
        "tweet": Value("string"),
        "label": ClassLabel(num_classes=2, names=[0, 1]),
    }
)
dataset = load_dataset(
    "sg247/binary-classification",
    features=features,
    data_files={"train": "train.csv", "test": "test.csv"},
)
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8004
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
## Re-inspect the schema after reloading with the explicit `features`.
dataset['train'].features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1], id=None)}

## Remove NA from the data

In [None]:
def _has_text_and_label(row):
    """Return True when the row has a non-empty tweet and a label that is not None."""
    text = row['tweet']
    if text is None or row['label'] is None:
        return False
    return len(text) > 0


## Keep only rows with a tweet, a label, and a tweet string that is not empty.
dataset = dataset.filter(_has_text_and_label)
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
## Peek at the first training example (a dict with the `tweet` and `label` fields).
dataset['train'][0]

{'tweet': 'Want to say a huge thanks to @WarriorAssaultS @uktac @BolleSafety @Mechanix_Wear @Airtech_Studios @Hexmags #FF Thanks for the support :)',
 'label': 1}

## Tokenize Tweets

In [None]:
## Define the checkpoint name and tokenizer first so the function below only
## refers to names that already exist when the cell is read top to bottom.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_tweet(row):
    """Tokenize the ``tweet`` field of one example or of a batch of examples.

    Args:
        row: Mapping with a ``tweet`` entry. With ``Dataset.map(batched=True)``
            this is a batch, so ``row['tweet']`` is a list of strings; a single
            string works as well.

    Returns:
        The tokenizer output, with every sequence truncated or padded to
        exactly 50 tokens.
    """
    return tokenizer(row['tweet'], padding='max_length', truncation=True, max_length=50)


## batched=True hands the tokenizer whole batches instead of one row per call.
## Because every sequence is padded/truncated to the same fixed length, the
## resulting columns are the same as with row-by-row mapping - only faster.
tokenized_datasets = dataset.map(tokenize_tweet, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['tweet', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## Drop the raw `tweet` text column, then rename `label` to `labels`,
## leaving only the label and the tokenizer outputs.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns('tweet')
    .rename_columns({"label": "labels"})
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## create train and test dataset
## The training split is shuffled with a fixed seed (42); the test split is
## used unshuffled as the evaluation set.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"]

## Train

In [None]:
## Class-index <-> class-name lookup tables. The reverse table is derived from
## the forward one so the two mappings always agree.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: index for index, name in id2label.items()}

## Trainer Arguments

##### evaluation_strategy
    - "no": No evaluation is done during training.
    - "steps": Evaluation is done (and logged) every eval_steps.
    - "epoch": Evaluation is done at the end of each epoch.

##### logging_dir (str, optional)
    — TensorBoard log directory. Will default to *output_dir/runs/CURRENT_DATETIME_HOSTNAME*.

##### logging_strategy (str or IntervalStrategy, optional, defaults to "steps")
    — The logging strategy to adopt during training. Possible values are:
        - "no": No logging is done during training.
        - "epoch": Logging is done at the end of each epoch.
        - "steps": Logging is done every logging_steps.

##### run_name (str, optional)
    — A descriptor for the run. Typically used for wandb and mlflow logging.

In [None]:
## Combine accuracy, F1, precision and recall into a single metric object;
## it is used by `compute_metrics` below.
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Convert raw model outputs into the combined classification metrics.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array (one row of class scores per example) or a tuple
            whose first element is that logits array.

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions
        against ``labels``.
    """
    logits, labels = eval_pred
    # When a model returns extra outputs next to the logits, the predictions
    # arrive as a tuple; the logits are its first element. Taking argmax over
    # the tuple itself would fail or give meaningless class ids.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## Load the pretrained checkpoint with a 2-label sequence-classification head,
## passing the id<->label lookup tables along to the model.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label = id2label, label2id = label2id)


## Hyperparameters
NUM_EPOCHS = 2
LEARNING_RATE = 1e-5
TRAIN_BATCH_SIZE = 128
EVAL_BATCH_SIZE = 64

## Evaluate and checkpoint once per epoch; log every 50 steps.
training_args = TrainingArguments(
    # output locations
    output_dir="checkpoints_logs",
    logging_dir="logs",
    # evaluation / checkpoint / logging cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    # optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # do not force CPU execution
    use_cpu=False,
)


## Trainer config
## Wire together the model, the training arguments, the shuffled training
## split, the evaluation split and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

## Train Model
## Left as the last expression so the returned training summary is displayed.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6098,0.255135,0.9945,0.994508,0.993021,0.996
2,0.1939,0.019086,0.997,0.997003,0.996008,0.998


TrainOutput(global_step=126, training_loss=0.32458121861730305, metrics={'train_runtime': 88.1827, 'train_samples_per_second': 181.442, 'train_steps_per_second': 1.429, 'total_flos': 411111024000000.0, 'train_loss': 0.32458121861730305, 'epoch': 2.0})

## Load the model

In [None]:
## Resolve the most recent checkpoint under the training output directory
## instead of hard-coding a step number: the step count in the directory name
## changes with the dataset size, the batch size and the number of epochs.
checkpoint_dir = get_last_checkpoint("checkpoints_logs")
if checkpoint_dir is None:
    raise FileNotFoundError(
        "No checkpoint found under 'checkpoints_logs'; run the training cell first."
    )
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

## Inference

In [None]:
## Wrap the reloaded model and the tokenizer in a sentiment-analysis pipeline
## placed on the CUDA device.
classifier = pipeline('sentiment-analysis', model = model, tokenizer= tokenizer, device="cuda")

In [None]:
## Pick one random test example and print its true label next to the prediction.
data = dataset['test'].shuffle()[0]
tweet = data['tweet']
label = data['label']

if 1 == label:
    true_name = 'POSITIVE'
else:
    true_name = 'NEGATIVE'

print(f"Tweet : {tweet}")
print(f"True : {true_name}")
prediction = classifier(tweet)[0]
print(f"Pred : {prediction['label']}")

Tweet : @Youmeatyours yeah its horrible isn't it :( big hugs! &amp; it means a lot. X
True : NEGATIVE
Pred : NEGATIVE


## Tensorboard

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir logs


In [None]:
# from tensorboard import notebook
# notebook.list()
# notebook.display(port=6006, height=1000)

In [None]:
## start in terminal
## tensorboard --logdir logs