# Transformer-Based Sentiment Analysis

This notebook implements sentiment classification using a pretrained Transformer

The objective is to fine-tune a language model on drug review data to predict sentiment from textual input

In [115]:
import pandas as pd

In [116]:
import numpy as np
from datasets import Dataset
from transformers import(
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
import evaluate
from sklearn.metrics import classification_report, confusion_matrix

## Dataset

The dataset consists of drug reviews containing: 
Review text,
Numerical rating

which are converted into sentiment labels:
Positive,
Negative

(Neutral rating are excluded)

In [117]:
df = pd.read_csv("datasets/drug_review_test.csv")

In [118]:
df

Unnamed: 0.1,Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,0,163740,Mirtazapine,depression,"""i've tried a few antidepressants over the yea...",10.0,"February 28, 2012",22,68
1,1,206473,Mesalamine,"crohn's disease, maintenance","""my son has crohn's disease and has done very ...",8.0,"May 17, 2009",17,48
2,2,39293,Contrave,weight loss,"""contrave combines drugs that were used for al...",9.0,"March 5, 2017",35,143
3,3,97768,Cyclafem 1 / 35,birth control,"""i have been on this birth control for one cyc...",9.0,"October 22, 2015",4,149
4,4,208087,Zyclara,keratosis,"""4 days in on first 2 weeks. using on arms an...",4.0,"July 3, 2014",13,60
...,...,...,...,...,...,...,...,...,...
46103,6103,123432,Apri,birth control,"""i started taking apri about 7 months ago. my ...",9.0,"August 25, 2010",18,86
46104,6104,159999,Tamoxifen,"breast cancer, prevention","""i have taken tamoxifen for 5 years. side effe...",10.0,"September 13, 2014",43,97
46105,6105,140714,Escitalopram,anxiety,"""i've been taking lexapro (escitaploprgram) si...",9.0,"October 8, 2016",11,130
46106,6106,130945,Levonorgestrel,birth control,"""i'm married, 34 years old and i have no kids....",8.0,"November 15, 2010",7,149


In [119]:
df = df[["review","rating"]].dropna()

In [120]:
df

Unnamed: 0,review,rating
0,"""i've tried a few antidepressants over the yea...",10.0
1,"""my son has crohn's disease and has done very ...",8.0
2,"""contrave combines drugs that were used for al...",9.0
3,"""i have been on this birth control for one cyc...",9.0
4,"""4 days in on first 2 weeks. using on arms an...",4.0
...,...,...
46103,"""i started taking apri about 7 months ago. my ...",9.0
46104,"""i have taken tamoxifen for 5 years. side effe...",10.0
46105,"""i've been taking lexapro (escitaploprgram) si...",9.0
46106,"""i'm married, 34 years old and i have no kids....",8.0


## Label Preparation

In [121]:
def to_sentiment(r):
    if r<=4:
        return 0
    elif r>=7:
        return 1
    else:
        return None

In [122]:
df["label"] = df["rating"].apply(to_sentiment)

In [123]:
df.head(10)

Unnamed: 0,review,rating,label
0,"""i've tried a few antidepressants over the yea...",10.0,1.0
1,"""my son has crohn's disease and has done very ...",8.0,1.0
2,"""contrave combines drugs that were used for al...",9.0,1.0
3,"""i have been on this birth control for one cyc...",9.0,1.0
4,"""4 days in on first 2 weeks. using on arms an...",4.0,0.0
5,"""i've had the copper coil for about 3 months n...",6.0,
6,"""this has been great for me. i've been on it f...",9.0,1.0
7,"""ive been on methadone for over ten years and ...",7.0,1.0
8,"""i was on this pill for almost two years. it d...",2.0,0.0
9,"""holy hell is exactly how i feel. i had been t...",1.0,0.0


In [124]:
df = df.dropna(subset=["label"])

In [125]:
df["label"] = df["label"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)


In [126]:
df = df.rename(columns = {"review":"text"})
df = df[["text","label"]]

In [127]:
df["label"].value_counts()

label
1    30409
0    11445
Name: count, dtype: int64

In [128]:
df

Unnamed: 0,text,label
0,"""i've tried a few antidepressants over the yea...",1
1,"""my son has crohn's disease and has done very ...",1
2,"""contrave combines drugs that were used for al...",1
3,"""i have been on this birth control for one cyc...",1
4,"""4 days in on first 2 weeks. using on arms an...",0
...,...,...
46103,"""i started taking apri about 7 months ago. my ...",1
46104,"""i have taken tamoxifen for 5 years. side effe...",1
46105,"""i've been taking lexapro (escitaploprgram) si...",1
46106,"""i'm married, 34 years old and i have no kids....",1


In [129]:
ds = Dataset.from_pandas(df, preserve_index=False)

In [130]:
ds

Dataset({
    features: ['text', 'label'],
    num_rows: 41854
})

## Train-Test Split

the dataset is divided into training and evaluation sets

In [131]:
split = ds.train_test_split(test_size=0.2,seed=42)
train_ds = split["train"]
eval_ds = split["test"]

In [132]:
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 33483
})

In [133]:
eval_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 8371
})

## Tokenization

Text is tokenized using a pretrained tokenizer

Subword tokenization, Truncation of long sequences, dynamid padding using a data collator

In [134]:
checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=128)

In [135]:
train_ds = train_ds.map(tokenize, batched=True)
eval_ds = eval_ds.map(tokenize, batched = True)

Map: 100%|██████████| 33483/33483 [00:11<00:00, 2851.10 examples/s]
Map: 100%|██████████| 8371/8371 [00:03<00:00, 2240.02 examples/s]


In [136]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

## Model selection

Simpler model chosen for avoiding long training time

In [137]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels =2 #многу лош скор со 3 (неутрал)
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Sliming down database for faster training

In [138]:
train_ds = train_ds.shuffle(seed=42).select(range(min(8000, len(train_ds))))
eval_ds  = eval_ds.shuffle(seed=42).select(range(min(2000,  len(eval_ds))))

In [139]:
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    return{
        "accuracy":acc.compute(predictions=preds,references=labels)["accuracy"],
        "f1":f1.compute(predictions = preds, references=labels, average = "weighted")["f1"]
    }


## Training configuration

Training is performed using the Hugging Face Trainer API

In [145]:
args = TrainingArguments(
   output_dir="drug_review_sentiment",
   eval_strategy=  "epoch",
   save_strategy="no",
   learning_rate=2e-5,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=16,
   num_train_epochs=3,
   weight_decay=0.01,
   logging_steps=50,
   report_to = "none"
)

In [146]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
    )

  trainer = Trainer(


In [143]:
print(train_ds.column_names)

['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


### Training and evaluation

In [147]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5009,0.494769,0.7725,0.76446
2,0.4953,0.472093,0.7845,0.779123
3,0.4235,0.467192,0.788,0.783432


TrainOutput(global_step=3000, training_loss=0.4802721939086914, metrics={'train_runtime': 383.3955, 'train_samples_per_second': 62.599, 'train_steps_per_second': 7.825, 'total_flos': 7620321711840.0, 'train_loss': 0.4802721939086914, 'epoch': 3.0})

In [148]:
trainer.evaluate()

{'eval_loss': 0.467192143201828,
 'eval_accuracy': 0.788,
 'eval_f1': 0.7834318699957563,
 'eval_runtime': 34.1552,
 'eval_samples_per_second': 58.556,
 'eval_steps_per_second': 3.66,
 'epoch': 3.0}

In [149]:
pred_out = trainer.predict(eval_ds)
preds = np.argmax(pred_out.predictions, axis = -1)
labels = pred_out.label_ids

print(classification_report(labels, preds, target_names=["negative", "positive"]))
print(confusion_matrix(labels, preds))

              precision    recall  f1-score   support

    negative       0.61      0.54      0.57       529
    positive       0.84      0.88      0.86      1471

    accuracy                           0.79      2000
   macro avg       0.73      0.71      0.72      2000
weighted avg       0.78      0.79      0.78      2000

[[ 285  244]
 [ 180 1291]]
