In [None]:
#!pip3 install datasets

In [None]:
#!pip3 install transformers -U;

In [None]:
#!pip3 install accelerate -U

In [32]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

import torch
import tensorflow as tf

from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

In [None]:
# !git clone https://github.com/cse151a-DrugReviewAnalysis/DrugReviewAnalysis.git

In [2]:
# Local run: read the pre-processed, tab-separated train and test splits
# from the current working directory.
df_train, df_test = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in (
        "drugsTrain_processed_with_usefulCount.csv",
        "drugsTest_processed_with_usefulCount.csv",
    )
)

In [None]:
# Get dataset when working on colab
# df_train = pd.read_csv("DrugReviewAnalysis/Model 3/drugsTrain_processed_with_usefulCount.csv", sep="\t")
# df_test = pd.read_csv("DrugReviewAnalysis/Model 3/drugsTest_processed_with_usefulCount.csv", sep="\t")

In [3]:
# Keep a reproducible 70% random subsample of each split. reset_index() moves the
# shuffled original row labels into a new column and restores a 0..n-1 index.
# NOTE: this rebinds df_train/df_test, so re-running the cell subsamples the
# already-subsampled frames again; reload the CSVs before re-running it.
df_train, df_test = (
    frame.sample(frac=0.7, random_state=42).reset_index()
    for frame in (df_train, df_test)
)

In [4]:
# Sanity check: (rows, columns) of the subsampled train and test frames.
df_train.shape, df_test.shape

((111649, 8), (37240, 8))

In [5]:
# Carve a validation set out of the test frame: test_size=0.6 keeps 60% of the
# rows as the new held-out test set and leaves the remaining 40% for validation.
# random_state is fixed so the split is reproducible.
df_val, df_test_new = train_test_split(df_test, test_size=0.6, random_state=42)

In [6]:
# Wrap only the model input (review text) and the target (rating) of each split
# in a Hugging Face Dataset. The validation/test frames come out of
# train_test_split with shuffled row labels, so they are re-indexed first.
ds_train, ds_val, ds_test = (
    Dataset.from_pandas(frame[["processed_review", "rating"]])
    for frame in (df_train, df_val.reset_index(), df_test_new.reset_index())
)

In [7]:
# Bundle the three splits into one DatasetDict so they can be mapped together.
ds = DatasetDict({"train": ds_train, "validation": ds_val, "test": ds_test})

In [8]:
# Display the split names, their columns and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['processed_review', 'rating'],
        num_rows: 111649
    })
    validation: Dataset({
        features: ['processed_review', 'rating'],
        num_rows: 14896
    })
    test: Dataset({
        features: ['processed_review', 'rating'],
        num_rows: 22344
    })
})

In [9]:
# Load the tokenizer that matches the "bert-base-uncased" checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [10]:
def preprocess(samples):
  """Tokenize a batch of reviews and attach their ratings as labels.

  Args:
    samples: mapping with a "processed_review" entry (the review text) and a
      "rating" entry (the regression target), as passed in by `Dataset.map`.

  Returns:
    The tokenizer output with an extra "labels" entry holding the ratings.
  """
  batch = tokenizer(
      samples["processed_review"],
      padding="max_length",  # pad every review to exactly max_length tokens
      truncation=True,       # cut reviews longer than max_length
      max_length=128,
      return_attention_mask=True,
  )
  batch["labels"] = samples["rating"]
  return batch

In [11]:
# Tokenize every split in batches. remove_columns drops the original
# 'processed_review' and 'rating' columns, so only what preprocess() returns
# (tokenizer outputs plus "labels") remains.
encoded_dataset = ds.map(preprocess, batched=True, remove_columns=ds['train'].column_names)

Map: 100%|██████████| 111649/111649 [00:12<00:00, 8927.35 examples/s]
Map: 100%|██████████| 14896/14896 [00:01<00:00, 9394.87 examples/s]
Map: 100%|██████████| 22344/22344 [00:02<00:00, 8521.21 examples/s]


In [12]:
# Have the encoded datasets return PyTorch tensors when indexed.
encoded_dataset.set_format("torch")

In [13]:
# Pre-trained BERT encoder with a freshly initialised classification head.
# num_labels=1 gives a single output, used here to regress the rating
# (the logged eval_loss equals the eval MSE).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:

# Training configuration for fine-tuning BERT as a rating regressor.
args = TrainingArguments(
    # Relative path: checkpoints go to the same "./bert-regression" directory that
    # trainer.save_model(...) writes to, instead of the filesystem root.
    output_dir="./bert-regression", # for local
    # output_dir = "DrugReviewAnalysis/Model 3/bert-regression", # for colab
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    # NOTE(review): newer transformers releases rename this to `eval_strategy` --
    # confirm against the installed version before changing it.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    # After training, reload the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

In [36]:
def compute_metrics(eval_pred, min_rating=1.0, max_rating=10.0):
  """Compute regression and rounded-to-class metrics for a Trainer evaluation.

  Args:
    eval_pred: pair `(predictions, labels)` as supplied by `Trainer`;
      predictions are the raw regression outputs, labels the true ratings.
    min_rating: lowest valid rating used when snapping predictions to classes.
      The default of 1.0 assumes a 1-10 rating scale -- TODO confirm against the data.
    max_rating: highest valid rating used when snapping predictions to classes.

  Returns:
    dict with
      "mse": mean squared error of the raw predictions,
      "accuracy-approximation": accuracy after rounding predictions to the
        nearest rating and clamping them to [min_rating, max_rating],
      "f1-approximation": macro F1 of those rounded predictions.
  """
  predictions, labels = eval_pred
  # Flatten both arrays so a (n, 1) model output lines up element-wise with (n,) labels.
  predictions = np.asarray(predictions).reshape(-1)
  labels = np.asarray(labels).reshape(-1)

  mse = mean_squared_error(labels, predictions)

  # Snap to the nearest whole rating and clamp at BOTH ends of the scale, so an
  # out-of-range prediction is not scored as an impossible class such as 0 or 11.
  pred_modified = np.clip(np.round(predictions), min_rating, max_rating)

  acc = accuracy_score(labels, pred_modified)
  f1 = f1_score(labels, pred_modified, average="macro")
  return {"mse": mse, "accuracy-approximation": acc, "f1-approximation": f1}

In [17]:
# Wire the model, training configuration, encoded train/validation splits,
# tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],  # used for the per-epoch evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [18]:
# Fine-tune the model; per `args` this evaluates and checkpoints after every epoch.
trainer.train()

  6%|▌         | 500/8725 [03:10<52:00,  2.64it/s] 

{'loss': 7.5869, 'grad_norm': 64.28229522705078, 'learning_rate': 1.8853868194842408e-05, 'epoch': 0.29}


 11%|█▏        | 1000/8725 [06:22<48:54,  2.63it/s] 

{'loss': 3.6527, 'grad_norm': 44.7933349609375, 'learning_rate': 1.7707736389684814e-05, 'epoch': 0.57}


 17%|█▋        | 1500/8725 [09:34<46:03,  2.61it/s]  

{'loss': 3.2833, 'grad_norm': 35.52226638793945, 'learning_rate': 1.6561604584527223e-05, 'epoch': 0.86}


                                                     
 20%|██        | 1745/8725 [11:41<39:07,  2.97it/s]

{'eval_loss': 2.971496343612671, 'eval_mse': 2.971496343612671, 'eval_runtime': 32.1527, 'eval_samples_per_second': 463.289, 'eval_steps_per_second': 7.247, 'epoch': 1.0}


 23%|██▎       | 2000/8725 [13:22<42:45,  2.62it/s]   

{'loss': 2.7986, 'grad_norm': 22.046106338500977, 'learning_rate': 1.541547277936963e-05, 'epoch': 1.15}


 29%|██▊       | 2500/8725 [16:35<39:37,  2.62it/s]

{'loss': 2.4684, 'grad_norm': 26.621028900146484, 'learning_rate': 1.4269340974212036e-05, 'epoch': 1.43}


 34%|███▍      | 3000/8725 [19:47<36:20,  2.63it/s]

{'loss': 2.3732, 'grad_norm': 50.240272521972656, 'learning_rate': 1.3123209169054444e-05, 'epoch': 1.72}


                                                   
 40%|████      | 3490/8725 [23:28<28:48,  3.03it/s]

{'eval_loss': 2.536482810974121, 'eval_mse': 2.536482572555542, 'eval_runtime': 32.3152, 'eval_samples_per_second': 460.96, 'eval_steps_per_second': 7.21, 'epoch': 2.0}


 40%|████      | 3500/8725 [23:36<1:15:55,  1.15it/s] 

{'loss': 2.3447, 'grad_norm': 26.629484176635742, 'learning_rate': 1.197707736389685e-05, 'epoch': 2.01}


 46%|████▌     | 4000/8725 [26:49<30:06,  2.62it/s]  

{'loss': 1.7949, 'grad_norm': 29.986299514770508, 'learning_rate': 1.0830945558739256e-05, 'epoch': 2.29}


 52%|█████▏    | 4500/8725 [30:02<26:54,  2.62it/s]

{'loss': 1.7564, 'grad_norm': 94.7625503540039, 'learning_rate': 9.684813753581662e-06, 'epoch': 2.58}


 57%|█████▋    | 5000/8725 [33:15<23:43,  2.62it/s]

{'loss': 1.7481, 'grad_norm': 25.68700408935547, 'learning_rate': 8.53868194842407e-06, 'epoch': 2.87}


                                                   
 60%|██████    | 5235/8725 [35:19<19:14,  3.02it/s]

{'eval_loss': 2.444241523742676, 'eval_mse': 2.4442412853240967, 'eval_runtime': 32.2924, 'eval_samples_per_second': 461.284, 'eval_steps_per_second': 7.215, 'epoch': 3.0}


 63%|██████▎   | 5500/8725 [36:56<18:55,  2.84it/s]   

{'loss': 1.571, 'grad_norm': 48.091758728027344, 'learning_rate': 7.392550143266476e-06, 'epoch': 3.15}


 69%|██████▉   | 6000/8725 [39:53<17:51,  2.54it/s]

{'loss': 1.3905, 'grad_norm': 86.07281494140625, 'learning_rate': 6.246418338108883e-06, 'epoch': 3.44}


 74%|███████▍  | 6500/8725 [43:02<13:42,  2.71it/s]

{'loss': 1.3464, 'grad_norm': 16.908924102783203, 'learning_rate': 5.10028653295129e-06, 'epoch': 3.72}


                                                   
 80%|████████  | 6980/8725 [46:32<09:19,  3.12it/s]

{'eval_loss': 2.356973886489868, 'eval_mse': 2.356973886489868, 'eval_runtime': 31.1337, 'eval_samples_per_second': 478.452, 'eval_steps_per_second': 7.484, 'epoch': 4.0}


 80%|████████  | 7000/8725 [46:42<10:57,  2.63it/s]  

{'loss': 1.35, 'grad_norm': 15.731082916259766, 'learning_rate': 3.954154727793696e-06, 'epoch': 4.01}


 86%|████████▌ | 7500/8725 [49:47<07:33,  2.70it/s]

{'loss': 1.1806, 'grad_norm': 17.71377944946289, 'learning_rate': 2.8080229226361035e-06, 'epoch': 4.3}


 92%|█████████▏| 8000/8725 [52:53<04:29,  2.69it/s]

{'loss': 1.1386, 'grad_norm': 25.695953369140625, 'learning_rate': 1.66189111747851e-06, 'epoch': 4.58}


 97%|█████████▋| 8500/8725 [55:59<01:23,  2.70it/s]

{'loss': 1.1581, 'grad_norm': 30.806121826171875, 'learning_rate': 5.15759312320917e-07, 'epoch': 4.87}


                                                   
100%|██████████| 8725/8725 [57:54<00:00,  3.12it/s]

{'eval_loss': 2.3104214668273926, 'eval_mse': 2.3104214668273926, 'eval_runtime': 31.1474, 'eval_samples_per_second': 478.242, 'eval_steps_per_second': 7.481, 'epoch': 5.0}


100%|██████████| 8725/8725 [57:57<00:00,  2.51it/s]

{'train_runtime': 3477.146, 'train_samples_per_second': 160.547, 'train_steps_per_second': 2.509, 'train_loss': 2.26018678331785, 'epoch': 5.0}





TrainOutput(global_step=8725, training_loss=2.26018678331785, metrics={'train_runtime': 3477.146, 'train_samples_per_second': 160.547, 'train_steps_per_second': 2.509, 'train_loss': 2.26018678331785, 'epoch': 5.0})

In [19]:
# Evaluate on the validation split (the eval_dataset the Trainer was built with).
trainer.evaluate()

100%|██████████| 233/233 [00:32<00:00,  7.11it/s]


{'eval_loss': 2.3104214668273926,
 'eval_mse': 2.3104214668273926,
 'eval_runtime': 33.1678,
 'eval_samples_per_second': 449.111,
 'eval_steps_per_second': 7.025,
 'epoch': 5.0}

In [20]:
# Persist the fine-tuned model so it can be reloaded later with from_pretrained().
# Local run: save next to the notebook.
trainer.save_model("./bert-regression") 

# Colab run: save inside the cloned repository instead.
# trainer.save_model("DrugReviewAnalysis/Model 3/bert-regression")

In [None]:
# local
# model = AutoModelForSequenceClassification.from_pretrained("bert-regression")

# colab
# model = AutoModelForSequenceClassification.from_pretrained("DrugReviewAnalysis/Model 3/bert-regression")

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=encoded_dataset["train"],
#     eval_dataset=encoded_dataset["validation"],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

In [21]:
# Final evaluation on the held-out test split, which was not used during training.
trainer.evaluate(eval_dataset=encoded_dataset["test"])

100%|██████████| 350/350 [00:47<00:00,  7.42it/s]


{'eval_loss': 2.30594801902771,
 'eval_mse': 2.30594801902771,
 'eval_runtime': 47.3482,
 'eval_samples_per_second': 471.908,
 'eval_steps_per_second': 7.392,
 'epoch': 5.0}