In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_train = pd.read_parquet('/content/drive/MyDrive/Colab_Notebooks/project datasets/drug-review-train.parquet')

In [3]:
df_test = pd.read_parquet('/content/drive/MyDrive/Colab_Notebooks/project datasets/drug-review-test.parquet')

In [4]:
del df_test['Unnamed: 0']
del df_train['Unnamed: 0']

In [5]:
df_train['rating'] = df_train['rating'].astype(int)
df_test['rating'] = df_test['rating'].astype(int)

In [6]:
def sentiment(rating):
  if rating >= 9:
    return 'very positive'
  elif rating >= 7:
    return 'positive'
  elif rating >= 5:
    return 'neutral'
  elif rating >= 3:
    return 'negative'
  else:
    return 'very negative'

In [7]:
df = pd.concat([df_train, df_test], ignore_index='False')
df['sentiment']  = df['rating'].apply(sentiment)

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(df['sentiment'])

In [None]:
df['sentiment'].value_counts()

**CHECK FOR WORLD COUNTS ON THE REVIEW COLUMNS USING NLTK**


In [None]:
'''
from nltk import word_tokenize
nltk.download('punkt_tab')
word_count = []
for review in df['review']:
  word_count.append(len(word_tokenize(review)))
  word_count_series = pd.Series(word_count, name='word_count')
review_series = df.review
word_count_df = pd.concat([review_series, word_count_series], axis=1)
word_count_df[word_count_df['word_count'] > 508].count()
'''

# **VADER ANALYSIS**

In [9]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

- Very negative: <-0.75

- Negative: -0.6 to -0.2

- Neutral: -0.2 to 0.2

- Positive: 0.2 to 0.6

- Very positive: > 0.6



In [10]:
def vader_score2(score):
  if score >= 0.6:
    return 'very positive'
  elif score >= 0.2:
    return 'positive'
  elif score >= -0.2:
    return 'neutral'
  elif score >= -0.6:
    return 'negative'
  else:
    return 'very negative'

- *Very Positive*: Compound score ≥ 0.75
- *Positive*: Compound score ≥ 0.05 and < 0.75
- *Neutral*: Compound score > -0.05 and < 0.05
- *Negative*: Compound score ≤ -0.05 and > -0.75
- *Very Negative*: Compound score ≤ -0.75

In [11]:
def vader_score(score):
  if score >= 0.75:
    return 'very positive'
  elif score >= 0.05 and score < 0.75:
    return 'positive'
  elif score > -0.05 and score < 0.05:
    return 'neutral'
  elif score <= -0.05 and score > -0.75:
    return 'negative'
  else:
    return 'very negative'

In [12]:
sia = SentimentIntensityAnalyzer()
df['vader_sentiment_score'] = df['review'].apply(lambda x: sia.polarity_scores(x)['compound'])


In [13]:
df['vader_sentiment2'] = df['vader_sentiment_score'].apply(vader_score2)

In [None]:
df['vader_sentiment2'].value_counts()

In [None]:
df['vader_sentiment'].value_counts()

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(df['vader_sentiment2'])

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(df['vader_sentiment'])

In [None]:
check = df[df['sentiment'] != df['vader_sentiment']][['review', 'rating', 'vader_sentiment_score', 'vader_sentiment']]

# BIOBERT ANALYSIS

In [17]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [18]:
from transformers import AutoTokenizer , AutoModelForSequenceClassification
from datasets import Dataset


In [19]:
!unzip /content/drive/MyDrive/Colab_Notebooks/models/biobert-base_model-mnli.zip

Archive:  /content/drive/MyDrive/Colab_Notebooks/models/biobert-base_model-mnli.zip
   creating: biobert-base_model-mnli/
  inflating: biobert-base_model-mnli/config.json  
  inflating: biobert-base_model-mnli/model.safetensors  
  inflating: biobert-base_model-mnli/special_tokens_map.json  
  inflating: biobert-base_model-mnli/tokenizer.json  
  inflating: biobert-base_model-mnli/tokenizer_config.json  
  inflating: biobert-base_model-mnli/vocab.txt  


In [53]:
label2id = {'very negative': 0,
            'negative': 1,
            'neutral' : 2,
            'positive' : 3,
            'very positive' : 4}

In [62]:
id2label

{0: 'very negative',
 1: 'negative',
 2: 'neutral',
 3: 'positive',
 4: 'very positive'}

In [60]:
label2id

{'very negative': 0,
 'negative': 1,
 'neutral': 2,
 'positive': 3,
 'very positive': 4}

In [61]:
id2label = { v:j for j,v in label2id.items()}

In [87]:
tokenizer = AutoTokenizer.from_pretrained("/content/biobert-base_model-mnli")
model = AutoModelForSequenceClassification.from_pretrained("/content/biobert-base_model-mnli",
                                                           num_labels=5,
                                                           label2id = label2id,
                                                           id2label = id2label)

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])

In [52]:
df[['label', 'sentiment']]

Unnamed: 0,label,sentiment
0,4,very positive
1,2,positive
2,1,neutral
3,2,positive
4,4,very positive
...,...,...
215058,4,very positive
215059,4,very positive
215060,2,positive
215061,3,very negative


In [23]:
df.columns

Index(['drugName', 'condition', 'review', 'rating', 'date', 'usefulCount',
       'sentiment', 'vader_sentiment_score', 'vader_sentiment2', 'label'],
      dtype='object')

In [24]:
data = df[['review', 'label']]

In [25]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.78, random_state=42, stratify=data['label'])


In [None]:
#unstrata_train, unstrata_test = train_test_split(data, test_size=0.78, random_state=4)

In [26]:
train.shape

(47313, 2)

In [27]:
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,23036
3,8400
2,8270
1,4221
0,3386


In [28]:
tokenized_review = tokenizer(list(train['review']),
                      max_length = 512,
                      padding = True,
                      truncation = True,
                      return_tensors = 'pt')

In [29]:
tokenized_review

{'input_ids': tensor([[ 101,  107,  178,  ...,    0,    0,    0],
        [ 101,  107,  178,  ...,    0,    0,    0],
        [ 101,  107, 1122,  ...,    0,    0,    0],
        ...,
        [ 101,  107,  178,  ...,    0,    0,    0],
        [ 101,  107, 2052,  ...,    0,    0,    0],
        [ 101,  107,  178,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [30]:
data_dict = {'input_ids': tokenized_review['input_ids'],
              'attention_mask': tokenized_review['attention_mask'],
              'labels': list(train['label'])
                                           }

In [31]:
dataset = Dataset.from_dict(data_dict)

In [32]:
train_val = dataset.train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 37850
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9463
    })
})

In [33]:
small_train_dataset = train_val["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = train_val["test"].shuffle(seed=42).select(range(100))

In [81]:
train_dataset = train_val["train"]
eval_dataset = train_val["test"]

In [84]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 37850
})

In [65]:
# Metrics
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1': f1}

In [88]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./biobert_pretrained',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    metric_for_best_model="f1",
    report_to="none"
)

trainer = Trainer(model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = eval_dataset,
        compute_metrics = compute_metrics
                  )

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9251,0.897242,0.641868,0.614398


In [78]:
predictions = trainer.predict(small_eval_dataset)
#print(predictions.predictions)
print(predictions.label_ids)
print(predictions.metrics)

[2 4 0 2 3 3 2 4 4 4 0 4 1 4 4 4 3 4 4 1 0 2 3 4 4 2 2 4 2 2 4 4 4 3 4 4 2
 2 2 4 3 2 4 4 1 0 4 1 4 4 4 1 4 1 4 4 4 3 3 4 2 4 4 4 4 4 1 4 4 0 0 1 4 4
 2 4 3 4 4 3 3 4 4 4 3 4 4 2 1 4 4 4 2 1 4 4 4 4 3 4]
{'test_loss': 1.4770255088806152, 'test_accuracy': 0.55, 'test_f1': 0.3903225806451613, 'test_runtime': 2.8098, 'test_samples_per_second': 35.589, 'test_steps_per_second': 4.627}


In [36]:
from transformers import pipeline

In [73]:
pipe = pipeline('sentiment-analysis', model= '/content/test_trainer/checkpoint-39', tokenizer=tokenizer)

Device set to use cuda:0


In [44]:
test.reset_index()

Unnamed: 0,index,review,label
0,74308,"""I started taking 25mg Zoloft 3 weeks ago. The...",4
1,99596,"""This is my fourth month on Lexapro. I was hav...",4
2,79310,"""I had psoriasis for 3 years before I tried Ol...",4
3,5426,"""I absolutely love this! If I don&#039;t have ...",4
4,79149,"""I&#039;ve had implanon for about 3 years now ...",0
...,...,...,...
167745,14216,"""Six weeks after I had my baby girl I had the ...",4
167746,47874,"""I have the most severe follicilitis acne on m...",4
167747,91559,"""I&#039;m only 20 but I&#039;ve already develo...",4
167748,138621,"""Plan B definitely helps with not getting preg...",0


In [50]:
test.iloc[1]['review']

'"This is my fourth month on Lexapro. I was having debilitating anxiety and phobias that interfered greatly with my life and my Dr said was causing me to be depressed. I never wanted to take a medicine before because I was too scared (anxiety) but I&#039;m so glad my Dr talked me into it. Other people even notice a difference in me and say I just look better. I feel so much better. I&#039;m not scared all the time and having dissociative thoughts. My fear kept me in bed because I couldn&#039;t make a decision but now I get up and do all the things I love again and I&#039;m not so tired. The first few weeks were hell with side effects but I knew those would wear off and they did. You just have to stick it out. But then again, nobody knows how anyone will react."'

In [76]:
pipe('bad drug')

[{'label': 'very positive', 'score': 0.2631664574146271}]

**CHECK FOR WORD COUNT ON REVIEW COLUMN USING BIOBERT TOKENIZER**

In [None]:
'''
word_count_biobert = []
for review in df['review']:
  word_count_biobert.append(len(tokenizer.tokenize(review)))
word_count_biobert_series = pd.Series(word_count_biobert, name= "word_count_biobert")
word_count_biobert_df = pd.concat([review_series, word_count_biobert_series], axis=1)
word_count_biobert_df[word_count_biobert_df['word_count_biobert'] > 508].count()

'''

# FLAIR SENTIMENT ANALYSIS

In [None]:
'''
!pip install -q flair
from flair.models import TextClassifier
from flair.data import Sentence
'''

In [None]:
'''
classifier = TextClassifier.load('sentiment')
classifier.predict('i am here to watch you eat')
'''