In [1]:
#!pip3 install datasets

In [2]:
#!pip3 install transformers -U;

In [3]:
#!pip3 install accelerate -U

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

import torch
import tensorflow as tf

from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer




  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# !git clone https://github.com/cse151a-DrugReviewAnalysis/DrugReviewAnalysis.git

In [6]:
# Get dataset for working locally
df_train = pd.read_csv("drugsTrain_processed_with_usefulCount.csv", sep="\t")
df_test = pd.read_csv("drugsTest_processed_with_usefulCount.csv", sep="\t")

In [7]:
# Get dataset when working on colab
# df_train = pd.read_csv("DrugReviewAnalysis/Model 3/drugsTrain_processed_with_usefulCount.csv", sep="\t")
# df_test = pd.read_csv("DrugReviewAnalysis/Model 3/drugsTest_processed_with_usefulCount.csv", sep="\t")

In [8]:
df_train = df_train.sample(frac=0.7, random_state=42).reset_index()
df_test = df_test.sample(frac=0.7, random_state=42).reset_index()

In [9]:
df_train.shape, df_test.shape

((111649, 8), (37240, 8))

In [10]:
df_val, df_test_new = train_test_split(df_test, test_size=0.6, random_state=42)

In [11]:
ds_train = Dataset.from_pandas(df_train[["processed_review", "rating"]])
ds_val = Dataset.from_pandas(df_val.reset_index()[["processed_review", "rating"]])
ds_test = Dataset.from_pandas(df_test_new.reset_index()[["processed_review", "rating"]])

In [12]:
ds = DatasetDict()

ds["train"] = ds_train
ds["validation"] = ds_val
ds["test"] = ds_test

In [13]:
ds

DatasetDict({
    train: Dataset({
        features: ['processed_review', 'rating'],
        num_rows: 111649
    })
    validation: Dataset({
        features: ['processed_review', 'rating'],
        num_rows: 14896
    })
    test: Dataset({
        features: ['processed_review', 'rating'],
        num_rows: 22344
    })
})

In [14]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [15]:
def preprocess(samples):
  text = samples["processed_review"]
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_attention_mask=True)
  encoding["labels"] = samples["rating"]
  return encoding

In [16]:
encoded_dataset = ds.map(preprocess, batched=True, remove_columns=ds['train'].column_names)

Map: 100%|██████████| 111649/111649 [00:10<00:00, 10661.64 examples/s]
Map: 100%|██████████| 14896/14896 [00:01<00:00, 10042.23 examples/s]
Map: 100%|██████████| 22344/22344 [00:02<00:00, 10070.04 examples/s]


In [17]:
encoded_dataset.set_format("torch")

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:

args = TrainingArguments(
    output_dir="/bert-regression", # for local
    # output_dir = "DrugReviewAnalysis/Model 3/bert-regression", # for colab
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

In [20]:
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  mse = mean_squared_error(labels, predictions)
  pred_modified = np.where(predictions < 10.0, np.round(predictions), 10.0)
  print(labels)
  acc = accuracy_score(labels, pred_modified)
  f1 = f1_score(labels, pred_modified, average="macro")
  return {"mse": mse, "accuracy-approximation": acc, "f1-approximation": f1}

In [21]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
trainer.train()

  6%|▌         | 500/8725 [03:07<49:14,  2.78it/s] 

{'loss': 7.7243, 'grad_norm': 61.87055587768555, 'learning_rate': 1.8853868194842408e-05, 'epoch': 0.29}


 11%|█▏        | 1000/8725 [06:10<46:29,  2.77it/s] 

{'loss': 3.7083, 'grad_norm': 29.899028778076172, 'learning_rate': 1.7707736389684814e-05, 'epoch': 0.57}


 17%|█▋        | 1500/8725 [09:17<44:34,  2.70it/s]  

{'loss': 3.2844, 'grad_norm': 35.43949890136719, 'learning_rate': 1.6561604584527223e-05, 'epoch': 0.86}


 20%|██        | 1745/8725 [10:49<38:03,  3.06it/s]  

[ 1.  1.  8. ...  1. 10.  4.]


                                                   
 20%|██        | 1745/8725 [11:21<38:03,  3.06it/s]

{'eval_loss': 3.008509635925293, 'eval_mse': 3.0085091590881348, 'eval_accuracy-approximation': 0.38896348012889365, 'eval_f1-approximation': 0.26660534366508426, 'eval_runtime': 31.2566, 'eval_samples_per_second': 476.571, 'eval_steps_per_second': 7.454, 'epoch': 1.0}


 23%|██▎       | 2000/8725 [13:00<41:27,  2.70it/s]   

{'loss': 2.8227, 'grad_norm': 20.11186981201172, 'learning_rate': 1.541547277936963e-05, 'epoch': 1.15}


 29%|██▊       | 2500/8725 [16:08<38:34,  2.69it/s]

{'loss': 2.4635, 'grad_norm': 49.918357849121094, 'learning_rate': 1.4269340974212036e-05, 'epoch': 1.43}


 34%|███▍      | 3000/8725 [19:15<35:21,  2.70it/s]

{'loss': 2.3536, 'grad_norm': 39.54217529296875, 'learning_rate': 1.3123209169054444e-05, 'epoch': 1.72}


 40%|████      | 3490/8725 [22:18<27:54,  3.13it/s]

[ 1.  1.  8. ...  1. 10.  4.]


                                                   
 40%|████      | 3490/8725 [22:50<27:54,  3.13it/s]

{'eval_loss': 2.638831377029419, 'eval_mse': 2.638831377029419, 'eval_accuracy-approximation': 0.3900375939849624, 'eval_f1-approximation': 0.29015762566657033, 'eval_runtime': 31.3371, 'eval_samples_per_second': 475.346, 'eval_steps_per_second': 7.435, 'epoch': 2.0}


 40%|████      | 3500/8725 [22:56<1:13:01,  1.19it/s] 

{'loss': 2.362, 'grad_norm': 40.57605743408203, 'learning_rate': 1.197707736389685e-05, 'epoch': 2.01}


 46%|████▌     | 4000/8725 [26:04<29:12,  2.70it/s]  

{'loss': 1.7784, 'grad_norm': 27.82268714904785, 'learning_rate': 1.0830945558739256e-05, 'epoch': 2.29}


 52%|█████▏    | 4500/8725 [29:11<26:02,  2.70it/s]

{'loss': 1.7566, 'grad_norm': 103.58026123046875, 'learning_rate': 9.684813753581662e-06, 'epoch': 2.58}


 57%|█████▋    | 5000/8725 [32:19<22:59,  2.70it/s]

{'loss': 1.7271, 'grad_norm': 36.23128128051758, 'learning_rate': 8.53868194842407e-06, 'epoch': 2.87}


 60%|██████    | 5235/8725 [33:47<18:41,  3.11it/s]

[ 1.  1.  8. ...  1. 10.  4.]


                                                   
 60%|██████    | 5235/8725 [34:19<18:41,  3.11it/s]

{'eval_loss': 2.5246362686157227, 'eval_mse': 2.5246362686157227, 'eval_accuracy-approximation': 0.42165682062298604, 'eval_f1-approximation': 0.3006920797168823, 'eval_runtime': 31.3358, 'eval_samples_per_second': 475.367, 'eval_steps_per_second': 7.436, 'epoch': 3.0}


 63%|██████▎   | 5500/8725 [36:01<20:19,  2.65it/s]   

{'loss': 1.5807, 'grad_norm': 38.02908706665039, 'learning_rate': 7.392550143266476e-06, 'epoch': 3.15}


 69%|██████▉   | 6000/8725 [39:08<16:44,  2.71it/s]

{'loss': 1.3872, 'grad_norm': 32.11100769042969, 'learning_rate': 6.246418338108883e-06, 'epoch': 3.44}


 74%|███████▍  | 6500/8725 [42:15<13:44,  2.70it/s]

{'loss': 1.3701, 'grad_norm': 16.715539932250977, 'learning_rate': 5.10028653295129e-06, 'epoch': 3.72}


 80%|████████  | 6980/8725 [45:06<08:49,  3.30it/s]

[ 1.  1.  8. ...  1. 10.  4.]


                                                   
 80%|████████  | 6980/8725 [45:36<08:49,  3.30it/s]

{'eval_loss': 2.385425329208374, 'eval_mse': 2.385425329208374, 'eval_accuracy-approximation': 0.4240735767991407, 'eval_f1-approximation': 0.3137489367721407, 'eval_runtime': 29.5108, 'eval_samples_per_second': 504.764, 'eval_steps_per_second': 7.895, 'epoch': 4.0}


 80%|████████  | 7000/8725 [45:45<10:22,  2.77it/s]  

{'loss': 1.3479, 'grad_norm': 15.411006927490234, 'learning_rate': 3.954154727793696e-06, 'epoch': 4.01}


 86%|████████▌ | 7500/8725 [48:41<07:10,  2.85it/s]

{'loss': 1.1884, 'grad_norm': 23.026918411254883, 'learning_rate': 2.8080229226361035e-06, 'epoch': 4.3}


 92%|█████████▏| 8000/8725 [51:37<04:14,  2.85it/s]

{'loss': 1.1493, 'grad_norm': 22.879316329956055, 'learning_rate': 1.66189111747851e-06, 'epoch': 4.58}


 97%|█████████▋| 8500/8725 [54:33<01:19,  2.85it/s]

{'loss': 1.1356, 'grad_norm': 22.306392669677734, 'learning_rate': 5.15759312320917e-07, 'epoch': 4.87}


100%|██████████| 8725/8725 [55:52<00:00,  3.28it/s]

[ 1.  1.  8. ...  1. 10.  4.]


                                                   
100%|██████████| 8725/8725 [56:22<00:00,  3.28it/s]

{'eval_loss': 2.3601605892181396, 'eval_mse': 2.3601605892181396, 'eval_accuracy-approximation': 0.44716702470461867, 'eval_f1-approximation': 0.32351309376714515, 'eval_runtime': 29.602, 'eval_samples_per_second': 503.209, 'eval_steps_per_second': 7.871, 'epoch': 5.0}


100%|██████████| 8725/8725 [56:25<00:00,  2.58it/s]

{'train_runtime': 3385.0994, 'train_samples_per_second': 164.912, 'train_steps_per_second': 2.577, 'train_loss': 2.271469226389013, 'epoch': 5.0}





TrainOutput(global_step=8725, training_loss=2.271469226389013, metrics={'train_runtime': 3385.0994, 'train_samples_per_second': 164.912, 'train_steps_per_second': 2.577, 'train_loss': 2.271469226389013, 'epoch': 5.0})

In [23]:
trainer.evaluate()

100%|██████████| 233/233 [00:29<00:00,  7.98it/s]

[ 1.  1.  8. ...  1. 10.  4.]





{'eval_loss': 2.3601605892181396,
 'eval_mse': 2.3601605892181396,
 'eval_accuracy-approximation': 0.44716702470461867,
 'eval_f1-approximation': 0.32351309376714515,
 'eval_runtime': 29.5208,
 'eval_samples_per_second': 504.593,
 'eval_steps_per_second': 7.893,
 'epoch': 5.0}

In [24]:
# local
trainer.save_model("./bert-regression") 

# colab
# trainer.save_model("DrugReviewAnalysis/Model 3/bert-regression")

In [25]:
# local
# model = AutoModelForSequenceClassification.from_pretrained("bert-regression")

# colab
# model = AutoModelForSequenceClassification.from_pretrained("DrugReviewAnalysis/Model 3/bert-regression")

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=encoded_dataset["train"],
#     eval_dataset=encoded_dataset["validation"],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

In [26]:
trainer.evaluate(eval_dataset=encoded_dataset["test"])

  0%|          | 0/350 [00:00<?, ?it/s]

100%|██████████| 350/350 [00:43<00:00,  7.98it/s]

[10.  1.  1. ...  7.  6.  7.]





{'eval_loss': 2.3079092502593994,
 'eval_mse': 2.3079092502593994,
 'eval_accuracy-approximation': 0.4456229860365199,
 'eval_f1-approximation': 0.32198710079836285,
 'eval_runtime': 43.9864,
 'eval_samples_per_second': 507.975,
 'eval_steps_per_second': 7.957,
 'epoch': 5.0}