In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the training split of the drug-review data from the mounted Drive folder.
df_train = pd.read_parquet(
    '/content/drive/MyDrive/Colab_Notebooks/project datasets/'
    'drug-review-train.parquet'
)

In [13]:
# Load the test split of the drug-review data from the mounted Drive folder.
df_test = pd.read_parquet(
    '/content/drive/MyDrive/Colab_Notebooks/project datasets/'
    'drug-review-test.parquet'
)

In [14]:
# Remove the 'Unnamed: 0' column (presumably a stray exported index) from both
# splits. The membership check keeps the cell re-runnable: a bare `del` raises
# KeyError the second time the cell is executed or when the column is absent.
for frame in (df_test, df_train):
  if 'Unnamed: 0' in frame.columns:
    del frame['Unnamed: 0']

In [15]:
# Store ratings as integers in both splits so the threshold comparisons in
# sentiment() work on whole-number ratings.
for frame in (df_train, df_test):
  frame['rating'] = frame['rating'].astype(int)

In [16]:
def sentiment(rating):
  """Bucket a numeric drug rating into one of five sentiment labels.

  Returns 'very positive' for ratings >= 9, 'positive' for >= 7,
  'neutral' for >= 5, 'negative' for >= 3 and 'very negative' for
  anything that meets none of those cut-offs.
  """
  # Cut-offs are checked from highest to lowest; the first one met wins.
  cutoffs = (
    (9, 'very positive'),
    (7, 'positive'),
    (5, 'neutral'),
    (3, 'negative'),
  )
  for floor, label in cutoffs:
    if rating >= floor:
      return label
  return 'very negative'

In [17]:
# Stack the train and test splits into a single frame with a fresh 0..n-1 index.
# ignore_index takes a boolean: the previous value, the string 'False', is
# truthy and therefore already behaved like True while reading as the opposite.
df = pd.concat([df_train, df_test], ignore_index=True)

# Derive the rating-based sentiment label for every review.
df['sentiment'] = df['rating'].apply(sentiment)

In [None]:
# Class distribution of the 'vader_sentiment' labels.
# NOTE(review): no earlier cell in this notebook creates df['vader_sentiment'],
# so on a fresh kernel this raises KeyError -- run it after the VADER section.
df['vader_sentiment'].value_counts()

In [None]:
# Class distribution of the 'vader_sentiment2' labels.
# NOTE(review): df['vader_sentiment2'] is only assigned in a later cell (the one
# applying vader_score2), so on a fresh kernel this raises KeyError here.
df['vader_sentiment2'].value_counts()

In [None]:
# Bar chart of the number of reviews in each rating-derived sentiment class.
plt.figure(figsize=(10,5))
# Name the column through x=/data=: seaborn 0.11 deprecated passing a Series
# positionally and from 0.12 the only positional argument is `data`, so the
# old call no longer means "count the values of this Series".
sns.countplot(x='sentiment', data=df)

In [None]:
# Bar chart of the number of reviews in each 'vader_sentiment2' class.
# NOTE(review): the column is assigned in a later cell (VADER section); run
# that section first on a fresh kernel.
plt.figure(figsize=(10,5))
# Name the column through x=/data=: seaborn 0.11 deprecated passing a Series
# positionally and from 0.12 the only positional argument is `data`.
sns.countplot(x='vader_sentiment2', data=df)



In [None]:
# Bar chart of the number of reviews in each 'vader_sentiment' class.
# NOTE(review): no earlier cell creates this column; it must exist in df
# before this cell is run.
plt.figure(figsize=(10,5))
# Name the column through x=/data=: seaborn 0.11 deprecated passing a Series
# positionally and from 0.12 the only positional argument is `data`.
sns.countplot(x='vader_sentiment', data=df)

In [None]:
# Number of reviews in each rating-derived sentiment class.
df['sentiment'].value_counts()

**CHECK FOR WORD COUNTS ON THE REVIEW COLUMN USING NLTK**


In [63]:
# Disabled experiment, kept as a bare string so none of it executes: counts
# NLTK word tokens per review and reports how many reviews exceed 508 tokens.
# NOTE(review): before re-enabling -- the snippet calls nltk.download but only
# imports word_tokenize (`import nltk` happens in a later cell), and the
# word_count_series assignment is indented inside the loop, so the Series is
# rebuilt on every iteration.
'''
from nltk import word_tokenize
nltk.download('punkt_tab')
word_count = []
for review in df['review']:
  word_count.append(len(word_tokenize(review)))
  word_count_series = pd.Series(word_count, name='word_count')
review_series = df.review
word_count_df = pd.concat([review_series, word_count_series], axis=1)
word_count_df[word_count_df['word_count'] > 508].count()
'''

# **VADER ANALYSIS**

In [9]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the 'vader_lexicon' resource for the analyzer imported above.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

- Very negative: < -0.6

- Negative: -0.6 to < -0.2

- Neutral: -0.2 to < 0.2

- Positive: 0.2 to < 0.6

- Very positive: ≥ 0.6



In [28]:
def vader_score2(score):
  """Label a VADER compound score using evenly spaced 0.4-wide bands.

  Returns 'very positive' for scores >= 0.6, 'positive' for >= 0.2,
  'neutral' for >= -0.2, 'negative' for >= -0.6 and 'very negative'
  for anything that meets none of those lower bounds.
  """
  # (lower bound, label) pairs, highest band first; the first match is used.
  bands = (
    (0.6, 'very positive'),
    (0.2, 'positive'),
    (-0.2, 'neutral'),
    (-0.6, 'negative'),
  )
  return next((name for lower, name in bands if score >= lower), 'very negative')

- *Very Positive*: Compound score ≥ 0.75
- *Positive*: Compound score ≥ 0.05 and < 0.75
- *Neutral*: Compound score > -0.05 and < 0.05
- *Negative*: Compound score ≤ -0.05 and > -0.75
- *Very Negative*: Compound score ≤ -0.75

In [25]:
def vader_score(score):
  """Label a VADER compound score using the +/-0.05 and +/-0.75 cut-offs.

  Returns 'very positive' for scores >= 0.75, 'positive' for
  0.05 <= score < 0.75, 'neutral' for -0.05 < score < 0.05,
  'negative' for -0.75 < score <= -0.05 and 'very negative' otherwise.
  """
  # Each check only runs when every higher band has already been ruled out,
  # so a single lower bound per band is enough.
  if score >= 0.75:
    return 'very positive'
  if score >= 0.05:
    return 'positive'
  if score > -0.05:
    return 'neutral'
  if score > -0.75:
    return 'negative'
  return 'very negative'

In [26]:
# Score every review with VADER and keep only the 'compound' value.
sia = SentimentIntensityAnalyzer()
df['vader_sentiment_score'] = df['review'].apply(
    lambda text: sia.polarity_scores(text)['compound']
)


In [29]:
# Turn the compound score into class labels under both threshold schemes.
# 'vader_sentiment' (vader_score cut-offs) is read by other cells, including
# the disagreement check, but was not assigned by any cell in the notebook, so
# a fresh run failed with KeyError; it is created here next to its sibling.
df['vader_sentiment'] = df['vader_sentiment_score'].apply(vader_score)
df['vader_sentiment2'] = df['vader_sentiment_score'].apply(vader_score2)

In [98]:
# Reviews whose rating-based label differs from the VADER label, with the
# columns needed to inspect the disagreement.
check = df.loc[
    df['sentiment'].ne(df['vader_sentiment']),
    ['review', 'rating', 'vader_sentiment_score', 'vader_sentiment'],
]

# BIOBERT ANALYSIS

In [None]:
!pip install datasets

In [64]:
from transformers import AutoTokenizer , AutoModelForSequenceClassification
from datasets import Dataset


In [3]:
!unzip /content/drive/MyDrive/Colab_Notebooks/models/biobert-base_model-mnli.zip

Archive:  /content/drive/MyDrive/Colab_Notebooks/models/biobert-base_model-mnli.zip
   creating: biobert-base_model-mnli/
  inflating: biobert-base_model-mnli/config.json  
  inflating: biobert-base_model-mnli/model.safetensors  
  inflating: biobert-base_model-mnli/special_tokens_map.json  
  inflating: biobert-base_model-mnli/tokenizer.json  
  inflating: biobert-base_model-mnli/tokenizer_config.json  
  inflating: biobert-base_model-mnli/vocab.txt  


In [61]:
# The reviews are labelled with five sentiment classes (label ids 0-4).
NUM_SENTIMENT_CLASSES = 5

tokenizer = AutoTokenizer.from_pretrained("/content/biobert-base_model-mnli")

# The checkpoint name indicates an MNLI fine-tune, which presumably ships a
# 3-way classification head (TODO confirm in config.json). Training it on label
# ids up to 4 would fail in the loss, so request a 5-way head explicitly;
# ignore_mismatched_sizes lets the encoder weights load while a head of a
# different size is re-initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/biobert-base_model-mnli",
    num_labels=NUM_SENTIMENT_CLASSES,
    ignore_mismatched_sizes=True,
)

In [53]:
from sklearn.preprocessing import LabelEncoder
# Encode the five sentiment strings as integer class ids in df['label'].
# NOTE(review): LabelEncoder presumably assigns ids in sorted (alphabetical)
# order of the strings, so the ids are not ordered by sentiment strength --
# use le.classes_ to map ids back to names.
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])

In [58]:
# Number of reviews per encoded class id.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,104713
3,38183
2,37593
1,19185
0,15389


In [56]:
# List the columns currently present in df.
df.columns

Index(['drugName', 'condition', 'review', 'rating', 'date', 'usefulCount',
       'sentiment', 'vader_sentiment_score', 'vader_sentiment',
       'vader_sentiment2', 'label'],
      dtype='object')

In [66]:
# Keep only the model input text and its integer class id.
data = df.loc[:, ['review', 'label']]

In [67]:
from sklearn.model_selection import train_test_split

# Stratified split on the class id: 22% of the rows go to `train` and the
# remaining 78% to `test`, with a fixed seed for repeatability.
train, test = train_test_split(
    data,
    test_size=0.78,
    stratify=data['label'],
    random_state=42,
)


In [73]:
#unstrata_train, unstrata_test = train_test_split(data, test_size=0.78, random_state=4)

In [72]:
# (rows, columns) of the training subset.
train.shape

(47313, 2)

In [89]:
# Per-class row counts in the training subset (check of the stratified split).
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,23036
3,8400
2,8270
1,4221
0,3386


In [76]:
# Tokenize all training reviews in one call: sequences longer than 512 tokens
# are truncated, shorter ones are padded, and the result is returned as
# PyTorch tensors.
tokenized_review = tokenizer(
    train['review'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors='pt',
)

In [81]:
# Display the tokenizer output (input_ids, token_type_ids, attention_mask).
tokenized_review

{'input_ids': tensor([[ 101,  107,  178,  ...,    0,    0,    0],
        [ 101,  107,  178,  ...,    0,    0,    0],
        [ 101,  107, 1122,  ...,    0,    0,    0],
        ...,
        [ 101,  107,  178,  ...,    0,    0,    0],
        [ 101,  107, 2052,  ...,    0,    0,    0],
        [ 101,  107,  178,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [91]:
# Collect the model inputs the trainer needs (token_type_ids are left out)
# together with the class ids under the 'labels' key.
data_dict = {
    field: tokenized_review[field]
    for field in ('input_ids', 'attention_mask')
}
data_dict['labels'] = list(train['label'])

In [93]:
# Wrap the tokenized inputs and labels in a Hugging Face Dataset.
dataset = Dataset.from_dict(data_dict)

In [96]:
# Hold out 20% of the tokenized training data for validation. The split
# shuffles the rows, so pass a fixed seed; without it the train/validation
# membership changes on every run and results cannot be reproduced.
train_val = dataset.train_test_split(test_size=0.2, seed=42)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 37850
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9463
    })
})

In [98]:
# Take a shuffled 50-example subset of each split for a quick trial run.
small_train_dataset, small_eval_dataset = (
    train_val[split].shuffle(seed=42).select(range(50))
    for split in ("train", "test")
)

In [102]:
from transformers import TrainingArguments, Trainer

# Checkpoints go to ./test_trainer, evaluation runs once per epoch, and
# report_to="none" is passed so no experiment-tracking integration is used.
training_args = TrainingArguments(
    output_dir="./test_trainer",
    eval_strategy="epoch",
    report_to="none",
)

# Trainer wired to the 50-example train / eval subsets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss


**CHECK FOR WORD COUNT ON REVIEW COLUMN USING BIOBERT TOKENIZER**

In [114]:
# Disabled experiment, kept as a bare string so none of it executes: counts
# BioBERT tokens per review and reports how many reviews exceed 508 tokens.
# NOTE(review): before re-enabling -- `review_series` is only assigned inside
# the other disabled word-count snippet, so it is undefined on a fresh kernel.
'''
word_count_biobert = []
for review in df['review']:
  word_count_biobert.append(len(tokenizer.tokenize(review)))
word_count_biobert_series = pd.Series(word_count_biobert, name= "word_count_biobert")
word_count_biobert_df = pd.concat([review_series, word_count_biobert_series], axis=1)
word_count_biobert_df[word_count_biobert_df['word_count_biobert'] > 508].count()

'''

# FLAIR SENTIMENT ANALYSIS

In [None]:
# Disabled: Flair installation and imports, kept as a bare string so nothing
# in this cell executes.
'''
!pip install -q flair
from flair.models import TextClassifier
from flair.data import Sentence
'''

In [None]:
# Disabled: loads Flair's 'sentiment' classifier and scores one sample string.
# NOTE(review): predict is given a plain str while the Sentence class imported
# in the previous disabled cell is never used -- presumably the text should be
# wrapped in Sentence(...) first; confirm against the Flair docs.
'''
classifier = TextClassifier.load('sentiment')
classifier.predict('i am here to watch you eat')
'''