### Download the data

In [40]:
data_dir = "/home/jovyan/Works/Practice/dataset/drugscom"

In [41]:
# Do it only for the first time

# !wget -P $data_dir "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"

In [42]:
# Do it only for the first time

# !unzip "$data_dir/drugsCom_raw.zip" -d "$data_dir"


In [43]:
# Load the data from saved directory
from datasets import load_dataset

data_files = {
    "train":  f"{data_dir}/drugsComTrain_raw.tsv",
    "test":f"{data_dir}/drugsComTest_raw.tsv"
}

drug_dataset = load_dataset("csv",data_files=data_files, delimiter="\t")
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [44]:
drug_dataset["train"][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

#### Data Preprocessing

In [45]:
# Filter the none types
def filter_nones(x):
    return x["condition"] is not None

In [46]:
drug_dataset = drug_dataset.filter(filter_nones)

In [47]:
# Lower the `condition` and `review` column
def lower_columns(example):
    return {
        "condition": example["condition"].lower(),
        "review": example["review"].lower()
    }

In [48]:
# Now apply the lower function
drug_dataset = drug_dataset.map(lower_columns)

In [49]:
drug_dataset["train"][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"it has no side effect, i take it in combination of bystolic 5 mg and fish oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [50]:
# We hvae noticed that there are some conditions that have
# unusual value that seems like `comment` so removing them
def remove_invalid_condition(example):
    return "users found this comment helpful" not in example["condition"]

In [51]:
drug_dataset =  drug_dataset.filter(remove_invalid_condition)

In [52]:
drug_dataset["train"][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"it has no side effect, i take it in combination of bystolic 5 mg and fish oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [53]:
# removing some special characters
import re

def clean_text(example):
    # Clean review text
    example["review"] = re.sub(r'[\r\n\t]', ' ', example["review"])
    # Clean condition text, if needed
    example["condition"] = re.sub(r'[\r\n\t]', ' ', example["condition"])
    return example

# Apply to all splits if using DatasetDict
drug_dataset = drug_dataset.map(clean_text)


In [54]:
drug_dataset["train"][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"it has no side effect, i take it in combination of bystolic 5 mg and fish oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [55]:
# unescap html from both the review and condition
import html
def unescpae_html(example):
    return{
        "review": html.unescape(example["review"]),
        "condition": html.unescape(example["condition"])
    }

In [56]:
drug_dataset = drug_dataset.map(unescpae_html)

In [57]:
# Remove double quote from review column
def clean_quotes(example):
    example["review"] = example["review"].strip('"').strip("")
    return example

In [58]:
drug_dataset = drug_dataset.map(clean_quotes)

In [59]:
drug_dataset["train"][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': 'it has no side effect, i take it in combination of bystolic 5 mg and fish oil',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

Some of the records have very short `review` columns which might not make sense. So we will keep `review`s that are more 20 word long

In [60]:
# Creating new column
def review_length(example):
    return {"review_length": len(example["review"].split())}

In [61]:
drug_dataset = drug_dataset.map(review_length)

In [62]:
drug_dataset["train"][0]

{'Unnamed: 0': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': 'it has no side effect, i take it in combination of bystolic 5 mg and fish oil',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [63]:
# Filter dataset based on review length
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 20)

In [64]:
# sort based on review length in reverse
drug_dataset =  drug_dataset.sort("review_length")

In [65]:
drug_dataset["train"][0]

{'Unnamed: 0': 222064,
 'drugName': 'Fluconazole',
 'condition': 'vaginal yeast infection',
 'review': 'quickest relief ever!  antibiotics always give me a yeast infection! vaginal creams are slow and yucky. fluconazole worked quickly and completely!',
 'rating': 10.0,
 'date': 'October 1, 2015',
 'usefulCount': 25,
 'review_length': 21}

#### Keeping only tow columns (review and condition)

In [66]:
drug_dataset = drug_dataset.remove_columns(["drugName","rating","Unnamed: 0","date","usefulCount","review_length"])

In [67]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['condition', 'review'],
        num_rows: 146424
    })
    test: Dataset({
        features: ['condition', 'review'],
        num_rows: 48815
    })
})

#### Balancing the training data

In [68]:
train_df = drug_dataset["train"].to_pandas()

In [69]:
train_df.head()

Unnamed: 0,condition,review
0,vaginal yeast infection,quickest relief ever! antibiotics always give...
1,birth control,i have been on nexplanon since 2-20-17 and hav...
2,surgical prophylaxis,within a few days after minor knee surgery i h...
3,nausea/vomiting,zofran saved my life; literally. my morning si...
4,birth control,works great! i will never use anything else. ...


In [70]:
import pandas as pd

# Function to apply balancing rules
def balance_condition(group):
    count = len(group)
    if 35 <= count <= 100:
        return group
    elif count > 100:
        return group.sample(100, random_state=42)  # Downsample to 100
    else:
        return pd.DataFrame() # Drop this group

In [71]:
# Apply balancing to all conditions
balanced_df = train_df.groupby("condition", group_keys=False).apply(balance_condition)

  balanced_df = train_df.groupby("condition", group_keys=False).apply(balance_condition)


In [72]:
(balanced_df["condition"].unique())

array(['abnormal uterine bleeding', 'acne', 'acute coronary syndrome',
       'adhd', 'alcohol dependence', 'alcohol withdrawal',
       'allergic reactions', 'allergic rhinitis', 'allergies', 'alopecia',
       "alzheimer's disease", 'amenorrhea', 'androgenetic alopecia',
       'anesthesia', 'angina', 'ankylosing spondylitis', 'anorexia',
       'anxiety', 'anxiety and stress', 'arrhythmia', 'asthma',
       'asthma, acute', 'asthma, maintenance', 'atigue',
       'atopic dermatitis', 'atrial fibrillation', 'atrophic vaginitis',
       'autism', 'back pain', 'bacterial infection',
       'bacterial skin infection', 'bacterial vaginitis',
       "barrett's esophagus", 'basal cell carcinoma',
       'benign essential trem', 'benign prostatic hyperplasia',
       'binge eating disorde', 'bipolar disorde', 'birth control',
       'bladder infection', 'borderline personality disorde',
       'bowel preparation', 'breast cance', 'breast cancer, metastatic',
       'bronchitis', 'bulimia', 

In [73]:
balanced_df.head(15)

Unnamed: 0,condition,review
33964,abnormal uterine bleeding,i started my shot october 2016 and started ble...
103638,abnormal uterine bleeding,took the shot for three years and it was horri...
38776,abnormal uterine bleeding,inserting mirena felt like getting an abortion...
48016,abnormal uterine bleeding,this is my third week on this pill . i have ta...
118652,abnormal uterine bleeding,i'm 38 and have been a heavy bleeder for most ...
126571,abnormal uterine bleeding,i am 39 years old my whole life i had normal p...
51071,abnormal uterine bleeding,this birth control is awesome as far as its ef...
143711,abnormal uterine bleeding,i am 52 years old and clearly have been throug...
138165,abnormal uterine bleeding,i have been on the depo shot since february 20...
15519,abnormal uterine bleeding,i find that i sweat a lot when taking this bir...


In [74]:
# Converting data frame to dataset

from datasets import Dataset
drug_dataset = Dataset.from_pandas(balanced_df)

In [75]:
# creating train and test split from this curated data
condition_dataset_clean = drug_dataset.train_test_split(train_size=0.8, seed=42)
condition_dataset_clean["validation"] = condition_dataset_clean.pop("test")

In [78]:
condition_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['condition', 'review', '__index_level_0__'],
        num_rows: 16189
    })
    validation: Dataset({
        features: ['condition', 'review', '__index_level_0__'],
        num_rows: 4048
    })
})

In [77]:
condition_dataset_clean.save_to_disk(f"{data_dir}/drug_review-condition")

Saving the dataset (1/1 shards): 100%|██████████| 16189/16189 [00:00<00:00, 173129.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4048/4048 [00:00<00:00, 155358.81 examples/s]
