In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli/config.json
/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli/tokenizer.json
/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli/tokenizer_config.json
/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli/model.safetensors
/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli/special_tokens_map.json
/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli/vocab.txt
/kaggle/input/kuc-hackathon-winter-2018/drugsComTest_raw.csv
/kaggle/input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv


In [2]:
import pandas as pd
import numpy as np

In [5]:
!pip install transformers datasets torch vaderSentiment textblob scikit-learn --quiet

In [9]:
df_train = pd.read_csv('/kaggle/input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv')
df_test = pd.read_csv('/kaggle/input/kuc-hackathon-winter-2018/drugsComTest_raw.csv')

In [10]:
df_train['rating'] = df_train['rating'].astype(int)
df_test['rating'] = df_test['rating'].astype(int)
df = pd.concat([df_train, df_test], ignore_index='False')

In [11]:
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [12]:
df.shape

(215063, 7)

In [13]:
def map_rating_to_sentiment(rating):
    if rating in [1, 2]: return "very negative", 0
    if rating in [3, 4]: return "negative", 1
    if rating in [5, 6]: return "neutral", 2
    if rating in [7, 8]: return "positive", 3
    if rating in [9, 10]: return "very positive", 4

df[['sentiment', 'sentiment_label']] = df['rating'].apply(lambda x: pd.Series(map_rating_to_sentiment(x)))


In [14]:
target_size = 7000  #7000 per class 
balanced_df = pd.DataFrame()
extra = pd.DataFrame()
for label in ["very negative", "negative", "neutral", "positive", "very positive"]:
    class_df = df[df['sentiment'] == label]
    samples = class_df.sample(n=target_size, random_state=42)
    remain = class_df.drop(samples.index)
    balanced_df = pd.concat([balanced_df, samples]).reset_index(drop=True)
    extra = pd.concat([extra, remain]).reset_index(drop=True)


In [15]:
balanced_df.shape, extra.shape

(35000, 9)

In [17]:
extra.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sentiment,sentiment_label
0,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,very negative,0
1,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,7-Mar-17,5,very negative,0
2,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1,9-Aug-16,11,very negative,0
3,213649,Tioconazole,Vaginal Yeast Infection,"""Do not use the cream that comes with this. It...",1,17-Apr-17,7,very negative,0
4,125343,Dulcolax,Constipation,"""SO MUCH PAIN! \r\nIn the last 2 years I have ...",1,13-Feb-16,10,very negative,0


In [14]:
label2id = {'very negative': 0,
            'negative': 1,
            'neutral' : 2,
            'positive' : 3,
            'very positive' : 4}

id2label = { v:j for j,v in label2id.items()}

In [15]:
label2id , id2label

({'very negative': 0,
  'negative': 1,
  'neutral': 2,
  'positive': 3,
  'very positive': 4},
 {0: 'very negative',
  1: 'negative',
  2: 'neutral',
  3: 'positive',
  4: 'very positive'})

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_path = "/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=5,
                                                           hidden_dropout_prob=0.2,
                                                           attention_probs_dropout_prob=0.2,
                                                           label2id=label2id, 
                                                           id2label=id2label)

In [24]:
from sklearn.model_selection import train_test_split
train_df,test_df = train_test_split(balanced_df[['review', 'sentiment_label']], test_size = 0.30, 
                                      stratify= balanced_df['sentiment_label'], 
                                      random_state = 34)

In [27]:
train_df['sentiment_label'].value_counts()

sentiment_label
3    4900
1    4900
4    4900
0    4900
2    4900
Name: count, dtype: int64

In [28]:
train_df.shape

(24500, 2)

In [29]:
test_df['sentiment_label'].value_counts()

sentiment_label
0    2100
4    2100
2    2100
1    2100
3    2100
Name: count, dtype: int64

In [42]:
train_df

Unnamed: 0,review,sentiment_label
13192,"""I have an oily face and bad acne usually arou...",2
4879,"""I tried the 25mg dosage for atypical facial p...",0
23880,"""I think I am 21 years old. I feel like I can ...",4
10735,"""Day 18 and I am still using the 20mg pills. N...",2
10311,"""I&#039;ve been taking Vicodin or something li...",2
...,...,...
714,"""I have been on this medication for two days a...",0
6158,"""I did not get pregnant. But I can no longer ...",1
10015,"""I originally had implanon years ago when it c...",2
19270,"""Helped with anxiety but made me hungry all th...",3


In [33]:
from datasets import Dataset, Features, ClassLabel, Value


#rename sentiment_label to labels and drop index
train_df = train_df.rename(columns={"sentiment_label": "labels"}).reset_index(drop=True)
test_df = test_df.rename(columns={"sentiment_label": "labels"}).reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)



#tokenize
def tokenize(batch):
    return tokenizer(batch["review"], padding="max_length", truncation=True, max_length=512)
train_tokenized = train_dataset.map(tokenize, batched=True)
test_tokenized = test_dataset.map(tokenize, batched=True)


Map:   0%|          | 0/24500 [00:00<?, ? examples/s]

Map:   0%|          | 0/10500 [00:00<?, ? examples/s]

In [34]:
train_dataset

Dataset({
    features: ['review', 'labels'],
    num_rows: 24500
})

In [43]:
#remove __index_level_0__ if present
train_tokenized = train_tokenized.remove_columns(["__index_level_0__"]) if "__index_level_0__" in train_tokenized.column_names else train_tokenized
test_tokenized = test_tokenized.remove_columns(["__index_level_0__"]) if "__index_level_0__" in test_tokenized.column_names else test_tokenized


In [35]:
train_tokenized

Dataset({
    features: ['review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 24500
})

In [57]:
test_tokenized

Dataset({
    features: ['review', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4410
})

In [38]:
from transformers import Trainer, TrainingArguments ,  EarlyStoppingCallback
#training setup
def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, f1_score
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(labels, preds), "f1": f1_score(labels, preds, average="weighted")}

training_args = TrainingArguments(
    output_dir="./biobert_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Effective batch size 16
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)


In [39]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)]
)


In [40]:
trainer.train()

# Save fine-tuned model
trainer.save_model("./biobert_finetuned_final")
print("Fine-tuned model saved at ./biobert_finetuned_final")

Step,Training Loss,Validation Loss,Accuracy,F1
500,1.3352,1.283131,0.419429,0.410441
1000,1.2486,1.168108,0.486476,0.47863
1500,1.1661,1.147369,0.496571,0.490493
2000,1.048,1.151276,0.514476,0.507282
2500,1.0627,1.134495,0.519619,0.514498
3000,1.0621,1.110777,0.526476,0.522435
3500,1.0329,1.126569,0.528286,0.526041


Fine-tuned model saved at ./biobert_finetuned_final


In [60]:
trainer.train()

# Save fine-tuned model
trainer.save_model("./biobert_finetuned_final")
print("Fine-tuned model saved at ./biobert_finetuned_final")

Step,Training Loss,Validation Loss,Accuracy,F1
500,1.3638,1.344096,0.415193,0.385246
1000,1.293,1.270222,0.427211,0.418268
1500,1.2382,1.207799,0.463265,0.459643
2000,1.2029,1.171286,0.483673,0.487174
2500,1.2082,1.13752,0.504762,0.50682
3000,1.0417,1.150432,0.513832,0.517124
3500,1.0088,1.164561,0.51746,0.512148
4000,1.0108,1.111802,0.527664,0.531769
4500,0.9732,1.137503,0.532426,0.528458
5000,0.9944,1.130752,0.53424,0.528856


Fine-tuned model saved at ./biobert_finetuned_final


In [52]:

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import accuracy_score, f1_score
import torch

print("Dataset size:", len(extra))
print("Columns:", extra.columns.tolist())
print("Sample row:", extra.iloc[0])

# Load fine-tuned model and tokenizer
model_path = "/kaggle/working/biobert_finetuned_final"
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli")
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Prepare dataset from in-memory DataFrame
test_dataset = Dataset.from_pandas(extra[['review', 'sentiment']])

# Tokenize in batches
def tokenize(batch):
    return tokenizer(batch["review"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
test_tokenized = test_dataset.map(tokenize, batched=True, batch_size=1000)  # Batch size 1000 for memory

# Keep required columns and set format
columns_to_keep = ["input_ids", "attention_mask", "sentiment"]
test_tokenized = test_tokenized.remove_columns([col for col in test_tokenized.column_names if col not in columns_to_keep])
test_tokenized.set_format("torch", columns=columns_to_keep)

# Create prediction pipeline
device = 0 if torch.cuda.is_available() else -1  # Use P100 GPU
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device, batch_size=32)

# Predict on the full dataset
print("Running predictions on 180K samples...")
predictions = classifier(test_dataset["review"], truncation=True, max_length=512)

#Extract predicted text labels directly
predicted_sentiments = [pred["label"] for pred in predictions]  

# Save results
extra.to_csv("/kaggle/working/predictions_180k.csv", index=False)
print("Predictions saved to /kaggle/working/predictions_180k.csv")

Dataset size: 180063
Columns: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'sentiment', 'sentiment_label']
Sample row: uniqueID                                                      155963
drugName                                                      Cialis
condition                               Benign Prostatic Hyperplasia
review             "2nd day on 5mg started to work with rock hard...
rating                                                             2
date                                                       28-Nov-15
usefulCount                                                       43
sentiment                                              very negative
sentiment_label                                                    0
Name: 0, dtype: object


Map:   0%|          | 0/180063 [00:00<?, ? examples/s]

Device set to use cuda:0


Running predictions on 180K samples...


KeyError: 'predicted_label'

In [59]:
extra["predicted_sentiment"] = predicted_sentiments


# Compute metrics
accuracy = accuracy_score(extra["sentiment"], extra["predicted_sentiment"])
f1 = f1_score(extra["sentiment"], extra["predicted_sentiment"], average="weighted")
print(f"Accuracy on 180K dataset: {accuracy:.4f}")
print(f"F1 Score on 180K dataset: {f1:.4f}")


Accuracy on 180K dataset: 0.6216
F1 Score on 180K dataset: 0.6500


In [60]:
import os
import shutil
import json

# Verify model files (already done)
print("Model directory contents:", os.listdir("./biobert_finetuned_final"))

# Create dataset directory
dataset_dir = "/kaggle/working/biobert_finetuned_dataset"
os.makedirs(dataset_dir, exist_ok=True)

# Copy model files to dataset directory
shutil.copytree("./biobert_finetuned_final", dataset_dir + "/biobert_finetuned_final", dirs_exist_ok=True)
print("Dataset directory contents:", os.listdir(dataset_dir))

tokenizer.save_pretrained(dataset_dir + "/biobert_finetuned_final")
print("Updated dataset directory contents:", os.listdir(dataset_dir + "/biobert_finetuned_final"))





Model directory contents: ['training_args.bin', 'model.safetensors', 'config.json']
Dataset directory contents: ['biobert_finetuned_final']
Updated dataset directory contents: ['training_args.bin', 'vocab.txt', 'model.safetensors', 'tokenizer_config.json', 'tokenizer.json', 'special_tokens_map.json', 'config.json']
Metadata created at: /kaggle/working/biobert_finetuned_dataset/dataset-metadata.json
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 7, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 407, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/
Model uploa

In [68]:
# Create metadata
metadata = {
    "title": "BioBERT Fine-Tuned for Drug Reviews",
    "id": "drjollof/biobert-finetuned-drug-reviews", 
    "licenses": [{"name": "apache-2.0"}]
}
with open(f"{dataset_dir}/dataset-metadata.json", "w") as f:
    json.dump(metadata, f)
print("Metadata created at:", f"{dataset_dir}/dataset-metadata.json")

Metadata created at: /kaggle/working/biobert_finetuned_dataset/dataset-metadata.json


In [71]:
original_model_dir = "/kaggle/working/biobert_original_save/biobert_original"
os.makedirs(original_model_dir, exist_ok=True)
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/biobert-base_model-mnli/transformers/default/1/biobert-base_model-mnli")
model.save_pretrained(original_model_dir)
tokenizer.save_pretrained(original_model_dir)
print("Original model directory contents:", os.listdir(original_model_dir))


Original model directory contents: ['vocab.txt', 'model.safetensors', 'tokenizer_config.json', 'tokenizer.json', 'special_tokens_map.json', 'config.json']


In [72]:
original_metadata = {
    "title": "biobert-base-uncased-mnli",
    "id": "drjollof/biobert-original-model",
    "licenses": [{"name": "apache-2.0"}]
}
with open(f"{original_model_dir}/../dataset-metadata.json", "w") as f:
    json.dump(original_metadata, f)
print("Original metadata created at:", f"{original_model_dir}/../dataset-metadata.json")


Original metadata created at: /kaggle/working/biobert_original_save/biobert_original/../dataset-metadata.json


In [69]:
# Create dataset
!kaggle datasets create -p /kaggle/working/biobert_finetuned_dataset --dir-mode tar
print("Model uploaded as Kaggle Dataset! Check at: https://www.kaggle.com/datasets/drjollof/biobert-finetuned-drug-reviews")

Starting upload for file biobert_finetuned_final.tar
100%|█████████████████████████████████████████| 414M/414M [00:02<00:00, 146MB/s]
Upload successful: biobert_finetuned_final.tar (414MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/drjollof/biobert-finetuned-drug-reviews
Model uploaded as Kaggle Dataset! Check at: https://www.kaggle.com/datasets/drjollof/biobert-finetuned-drug-reviews


In [74]:
!kaggle datasets create -p /kaggle/working/biobert_original_save --dir-mode tar
print("Original model saved privately! Access at: https://www.kaggle.com/datasets/drjollof/biobert-base-uncased-mnli")


Starting upload for file biobert_original.tar
100%|█████████████████████████████████████████| 414M/414M [00:03<00:00, 122MB/s]
Upload successful: biobert_original.tar (414MB)
Dataset creation error: The requested title "biobert-base-uncased-mnli" is already in use by a dataset. Please choose another title.
Original model saved privately! Access at: https://www.kaggle.com/datasets/drjollof/biobert-base-uncased-mnli


In [66]:

!mkdir -p ~/.kaggle
!cp /kaggle/input/kaggle-json/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Verify it’s there
!ls -l ~/.kaggle/

total 4
-rw------- 1 root root 64 Mar 30 07:51 kaggle.json


# SET UP ALL MODELS PREDICTION

In [31]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [35]:
#load train and test data
tr = balanced_df[['review', 'sentiment']]
tes = extra[['review', 'sentiment']]

In [70]:
tr.shape , tes.shape

((35000, 2), (180063, 5))

In [42]:
sia = SentimentIntensityAnalyzer()

In [59]:
def vader_predict(text):
 score = sia.polarity_scores(text)["compound"]
 if score >= 0.75:
    return 'very positive'
 elif score >= 0.05 and score < 0.75:
    return 'positive'
 elif score > -0.05 and score < 0.05:
    return 'neutral'
 elif score <= -0.05 and score > -0.75:
    return 'negative'
 else:
    return 'very negative'

In [48]:
def textblob_predict(text):
 blob = TextBlob(text)
 score = blob.sentiment.polarity
 if score >= 0.75:
    return 'very positive'
 elif score >= 0.05 and score < 0.75:
    return 'positive'
 elif score > -0.05 and score < 0.05:
    return 'neutral'
 elif score <= -0.05 and score > -0.75:
    return 'negative'
 else:
    return 'very negative'

In [37]:
#Multinomial Naive Bayes
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(tr['review'])
y_train = tr['sentiment']
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
#mnb function
def mnb_predict(text):
    X_test = vectorizer.transform([text])
    return mnb.predict(X_test)[0]



In [67]:
#predict on Test Set
print("Running VADER predictions...")
tes["vader_predicted_sentiment"] = tes["review"].apply(vader_predict)

print("Running TextBlob predictions...")
tes["textblob_predicted_sentiment"] = tes["review"].apply(textblob_predict)

print("Running MNB predictions...")
tes["mnb_predicted_sentiment"] = tes["review"].apply(mnb_predict)


Running VADER predictions...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tes["vader_predicted_sentiment"] = tes["review"].apply(vader_predict)


Running TextBlob predictions...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tes["textblob_predicted_sentiment"] = tes["review"].apply(textblob_predict)


Running MNB predictions...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tes["mnb_predicted_sentiment"] = tes["review"].apply(mnb_predict)


In [72]:
save train and test dataset to csv
tr.to_csv("/kaggle/working/tr35k.csv", index=False)
tes.to_csv("/kaggle/working/tes180k.csv", index=False)


In [4]:
#load test dataset for biobert prediction
tes = pd.read_csv('/kaggle/working/tes180k.csv')

In [5]:
tes

Unnamed: 0,review,sentiment,vader_predicted_sentiment,textblob_predicted_sentiment,mnb_predicted_sentiment
0,"""2nd day on 5mg started to work with rock hard...",very negative,negative,neutral,neutral
1,"""He pulled out, but he cummed a bit in me. I t...",very negative,neutral,positive,very positive
2,""" I Ve had nothing but problems with the Kepp...",very negative,negative,positive,very negative
3,"""Do not use the cream that comes with this. It...",very negative,negative,neutral,very negative
4,"""SO MUCH PAIN! \r\nIn the last 2 years I have ...",very negative,very negative,negative,neutral
...,...,...,...,...,...
180058,"""This is my 3rd time taking this medicine. Fir...",very positive,very negative,negative,neutral
180059,"""I started taking Apri about 7 months ago. My ...",very positive,very positive,positive,negative
180060,"""I have taken Tamoxifen for 5 years. Side effe...",very positive,very negative,neutral,negative
180061,"""I&#039;ve been taking Lexapro (escitaploprgra...",very positive,very positive,positive,very positive


In [11]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [12]:
#load fine-tuned model and tokenizer
model_path = "/kaggle/working/biobert_finetuned_dataset/biobert_finetuned_final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

#prepare dataset
test_dataset = Dataset.from_pandas(tes[['review', 'sentiment']])

#tokenize in batches
def tokenize(batch):
    return tokenizer(batch["review"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
test_tokenized = test_dataset.map(tokenize, batched=True, batch_size=1000)  # Batch size 1000 for memory

#keep required columns and set format
columns_to_keep = ["input_ids", "attention_mask", "sentiment"]
test_tokenized = test_tokenized.remove_columns([col for col in test_tokenized.column_names if col not in columns_to_keep])
test_tokenized.set_format("torch", columns=columns_to_keep)

#create prediction pipeline
device = 0 if torch.cuda.is_available() else -1  #use GPU if available
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device, batch_size=32)

#predict on the test dataset
print("Running predictions on test samples...")
predictions = classifier(test_dataset["review"], truncation=True, max_length=512)

#extract predicted text labels and add to test data
predicted_sentiments = [pred["label"] for pred in predictions]  
tes["biobert_predicted_sentiment"] = predicted_sentiments


Map:   0%|          | 0/180063 [00:00<?, ? examples/s]

Device set to use cuda:0


Running predictions on 180K samples...


In [13]:
tes

Unnamed: 0,review,sentiment,vader_predicted_sentiment,textblob_predicted_sentiment,mnb_predicted_sentiment,biobert_predicted_sentiment
0,"""2nd day on 5mg started to work with rock hard...",very negative,negative,neutral,neutral,negative
1,"""He pulled out, but he cummed a bit in me. I t...",very negative,neutral,positive,very positive,negative
2,""" I Ve had nothing but problems with the Kepp...",very negative,negative,positive,very negative,negative
3,"""Do not use the cream that comes with this. It...",very negative,negative,neutral,very negative,very negative
4,"""SO MUCH PAIN! \r\nIn the last 2 years I have ...",very negative,very negative,negative,neutral,very negative
...,...,...,...,...,...,...
180058,"""This is my 3rd time taking this medicine. Fir...",very positive,very negative,negative,neutral,positive
180059,"""I started taking Apri about 7 months ago. My ...",very positive,very positive,positive,negative,very positive
180060,"""I have taken Tamoxifen for 5 years. Side effe...",very positive,very negative,neutral,negative,neutral
180061,"""I&#039;ve been taking Lexapro (escitaploprgra...",very positive,very positive,positive,very positive,very positive


# CHECKING MODELS METRICS  

In [14]:
from sklearn.metrics import accuracy_score , f1_score
#bioBERT metrics
biobert_accuracy = accuracy_score(tes["sentiment"], tes["biobert_predicted_sentiment"])
biobert_f1 = f1_score(tes["sentiment"], tes["biobert_predicted_sentiment"], average="weighted")
print(f"BioBERT Accuracy: {biobert_accuracy:.4f}")
print(f"BioBERT F1 Score: {biobert_f1:.4f}")

#VADER metrics
vader_accuracy = accuracy_score(tes["sentiment"], tes["vader_predicted_sentiment"])
vader_f1 = f1_score(tes["sentiment"], tes["vader_predicted_sentiment"], average="weighted")
print(f"VADER Accuracy: {vader_accuracy:.4f}")
print(f"VADER F1 Score: {vader_f1:.4f}")

#textBlob
textblob_accuracy = accuracy_score(tes["sentiment"], tes["textblob_predicted_sentiment"])
textblob_f1 = f1_score(tes["sentiment"], tes["textblob_predicted_sentiment"], average="weighted")
print(f"TextBlob Accuracy: {textblob_accuracy:.4f}")
print(f"TextBlob F1 Score: {textblob_f1:.4f}")

#multinomial naive bayes
mnb_accuracy = accuracy_score(tes["sentiment"], tes["mnb_predicted_sentiment"])
mnb_f1 = f1_score(tes["sentiment"], tes["mnb_predicted_sentiment"], average="weighted")
print(f"MNB Accuracy: {mnb_accuracy:.4f}")
print(f"MNB F1 Score: {mnb_f1:.4f}")


BioBERT Accuracy: 0.6216
BioBERT F1 Score: 0.6500
VADER Accuracy: 0.3050
VADER F1 Score: 0.3430
TextBlob Accuracy: 0.1426
TextBlob F1 Score: 0.0880
MNB Accuracy: 0.4685
MNB F1 Score: 0.5114


# **PREDICTION DISTRIBUTIONS**

In [18]:
print("BioBERT predictions:", tes['biobert_predicted_sentiment'].value_counts())
print('_'*50)
print("VADER predictions:", tes['vader_predicted_sentiment'].value_counts())
print('_'*50)
print("TextBlob predictions:", tes['textblob_predicted_sentiment'].value_counts())
print('_'*50)
print("MNB predictions:", tes['mnb_predicted_sentiment'].value_counts())
print('_'*50)
print("True labels:", tes['sentiment'].value_counts())

BioBERT predictions: biobert_predicted_sentiment
very positive    79708
positive         36902
very negative    26575
neutral          20423
negative         16455
Name: count, dtype: int64
__________________________________________________
VADER predictions: vader_predicted_sentiment
positive         48894
negative         44092
very positive    40916
very negative    38936
neutral           7225
Name: count, dtype: int64
__________________________________________________
TextBlob predictions: textblob_predicted_sentiment
positive         93386
neutral          42942
negative         40514
very positive     2508
very negative      713
Name: count, dtype: int64
__________________________________________________
MNB predictions: mnb_predicted_sentiment
very positive    64488
positive         36369
negative         29776
very negative    28592
neutral          20838
Name: count, dtype: int64
__________________________________________________
True labels: sentiment
very positive    97713


In [19]:
#save final test data with predictions to csv
tes.to_csv("/kaggle/working/tesFinal.csv", index=False)