## HF Datasets library

In [1]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2025-02-25 02:54:23--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [                <=> ]  41.00M  2.05MB/s    in 31s     

2025-02-25 02:54:58 (1.30 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [2]:
# Install the `datasets` library into the active environment via uv.
!uv pip install datasets

[2mUsing Python 3.12.9 environment at: /home/vscode/.venv[0m
[2mAudited [1m1 package[0m [2min 25ms[0m[0m


In [5]:
from datasets import load_dataset

# Map each split name to the TSV file extracted from the archive.
data_files = dict(
    train="drugsComTrain_raw.tsv",
    test="drugsComTest_raw.tsv",
)

# The files are tab-separated, so the generic CSV loader is used with a tab delimiter.
drug_dataset = load_dataset(
    "csv",
    data_files=data_files,
    delimiter="\t",
)

In [6]:
# Take a reproducible random sample of 1,000 training rows and preview the first three.
drug_sample = (
    drug_dataset["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [10]:
# Sanity check: "Unnamed: 0" must hold a distinct value for every row of every split.
for split_name, split_data in drug_dataset.items():
    distinct_ids = split_data.unique("Unnamed: 0")
    assert len(split_data) == len(distinct_ids)

In [11]:
# Give the anonymous index column a descriptive name in every split.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0",
    new_column_name="patient_id",
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [19]:
# Drop rows whose condition is missing; lowercase_condition() calls .lower() on this field.
drug_dataset = drug_dataset.filter(lambda row: row["condition"] is not None)


def lowercase_condition(example):
    """Return a column update holding the example's condition in lowercase."""
    condition_lower = example["condition"].lower()
    return {"condition": condition_lower}


# Normalise the condition labels in every split, then preview three training rows.
drug_dataset = drug_dataset.map(function=lowercase_condition)
drug_dataset["train"][:3]

Filter: 100%|██████████| 160398/160398 [00:01<00:00, 89969.10 examples/s]
Filter: 100%|██████████| 53471/53471 [00:00<00:00, 88085.23 examples/s]
Map: 100%|██████████| 160398/160398 [00:07<00:00, 22537.18 examples/s]
Map: 100%|██████████| 53471/53471 [00:02<00:00, 22717.46 examples/s]


{'patient_id': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['left ventricular dysfunction', 'adhd', 'birth control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [20]:
def compute_review_length(x):
    """Return the number of whitespace-separated words in the review as `review_length`."""
    words = x["review"].split()
    return {"review_length": len(words)}


# Add the review_length column to every split, then preview three training rows.
drug_dataset = drug_dataset.map(function=compute_review_length)
drug_dataset["train"][0:3]

Map: 100%|██████████| 160398/160398 [00:06<00:00, 26728.76 examples/s]
Map: 100%|██████████| 53471/53471 [00:01<00:00, 27933.04 examples/s]


{'patient_id': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['left ventricular dysfunction', 'adhd', 'birth control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [21]:
# Sort the training split by review_length and show the three shortest reviews.
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [23]:
# Keep only reviews longer than 30 whitespace-separated words, then report the
# remaining row count of each split.
drug_dataset = drug_dataset.filter(lambda row: row["review_length"] > 30)
print(drug_dataset.num_rows)

Filter: 100%|██████████| 160398/160398 [00:00<00:00, 248009.03 examples/s]
Filter: 100%|██████████| 53471/53471 [00:00<00:00, 270134.66 examples/s]


{'train': 138514, 'test': 46108}


In [25]:
import html

# Decode HTML character references (e.g. &#039;) in the reviews, one batch at a time.
drug_dataset = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch["review"]))},
    batched=True,
)

Map: 100%|██████████| 138514/138514 [00:00<00:00, 643086.87 examples/s]
Map: 100%|██████████| 46108/46108 [00:00<00:00, 759972.53 examples/s]


In [26]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the bert-base-cased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the `review` field of `examples` with truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [27]:
# Time the batched tokenization of every split.
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 138514/138514 [00:07<00:00, 18007.67 examples/s]
Map: 100%|██████████| 46108/46108 [00:02<00:00, 17508.12 examples/s]

CPU times: user 36.3 s, sys: 519 ms, total: 36.8 s
Wall time: 10.4 s





In [28]:
def tokenize_and_split(examples):
    """Tokenize reviews with a 128-token limit, asking the tokenizer to return
    the overflowing tokens as well instead of discarding them."""
    chunking_options = {
        "max_length": 128,
        "truncation": True,
        "return_overflowing_tokens": True,
    }
    return tokenizer(examples["review"], **chunking_options)

In [31]:
# Tokenize the first training example and show the token count of each chunk it produced.
result = tokenize_and_split(drug_dataset["train"][0])
list(map(len, result["input_ids"]))

[128, 49]

In [None]:
# tokenize_and_split can return more rows than it receives, so the original
# columns are dropped and only the tokenizer's output columns are kept.
tokenized_dataset = drug_dataset.map(
    function=tokenize_and_split,
    batched=True,
    remove_columns=drug_dataset["train"].column_names,
)

Map: 100%|██████████| 138514/138514 [00:09<00:00, 14933.40 examples/s]
Map: 100%|██████████| 46108/46108 [00:02<00:00, 15660.07 examples/s]

2 2





In [34]:
# Compare the row count after chunking with the original number of reviews.
print(tokenized_dataset["train"].num_rows, drug_dataset["train"].num_rows)

206772 138514


In [37]:
def tokenize_and_split_v2(examples):
    """Tokenize reviews into chunks and repeat the original columns per chunk.

    The tokenizer's `overflow_to_sample_mapping` gives, for every produced
    chunk, the index of the input example it came from. It is removed from the
    result and used to duplicate each original column value once per chunk, so
    every returned column has the same length as `input_ids`.
    """
    chunking_options = {
        "max_length": 128,
        "truncation": True,
        "return_overflowing_tokens": True,
    }
    result = tokenizer(examples["review"], **chunking_options)
    source_indices = result.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        # One entry per chunk, copied from the example the chunk originated from.
        result[column] = [column_values[idx] for idx in source_indices]
    return result

In [38]:
# Chunk every split while keeping the original columns aligned with the new rows.
tokenized_dataset = drug_dataset.map(
    function=tokenize_and_split_v2,
    batched=True,
)
tokenized_dataset

Map: 100%|██████████| 138514/138514 [00:09<00:00, 14493.28 examples/s]
Map: 100%|██████████| 46108/46108 [00:03<00:00, 14689.07 examples/s]


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

### Conversion to Pandas dataframe

In [39]:
# Switch the output format of every split to pandas.
drug_dataset.set_format(type="pandas")

In [41]:
# With the pandas format active, slicing the split displays as a DataFrame.
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [42]:
# set_format() only changes what __getitem__() returns, so slice the whole
# training split to materialise it as a DataFrame.
train_df = drug_dataset["train"][:]
train_df.head(5)

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89
3,35696,Buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,124
4,155963,Cialis,benign prostatic hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43,68


In [45]:
# Count reviews per condition, most frequent first.
# rename_axis()/reset_index(name=...) name both output columns explicitly, so the
# result is ["condition", "frequency"] regardless of how the installed pandas
# labels value_counts() output (the labelling changed in pandas 2.0; the previous
# rename map produced two "condition" columns on older versions).
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [46]:
from datasets import Dataset

# Wrap the frequency table in a Dataset.
freq_dataset = Dataset.from_pandas(df=frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [47]:
# Undo the earlier set_format("pandas") call before continuing with the dataset API.
drug_dataset.reset_format()

### Validation dataset

In [49]:
# Split the training data 80/20 with a fixed seed so the split is reproducible.
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

# The held-out 20% comes back under the key "test"; move it to "validation"
# first, so that the key "test" is free for the original test split.
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [50]:
# Persist the train/validation/test splits under the local path "drug-reviews".
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (1/1 shards): 100%|██████████| 110811/110811 [00:00<00:00, 162109.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27703/27703 [00:00<00:00, 168905.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46108/46108 [00:00<00:00, 574969.43 examples/s]


### Ideas
1. Use the techniques from Chapter 3 to train a classifier that can predict the patient condition based on the drug review.
2. Use the summarization pipeline from Chapter 1 to generate summaries of the reviews.

## Extremely large datasets

In [55]:
# Install zstandard, presumably needed to decompress the .jsonl.zst file loaded in the next cell.
!uv pip install zstandard

[2mUsing Python 3.12.9 environment at: /home/vscode/.venv[0m
[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K[2mResolved [1m1 package[0m [2min 289ms[0m[0m                                          [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m     0 B/4.72 MiB                      [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 14.91 KiB/4.72 MiB                    [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 30.91 KiB/4.72 MiB                    [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 46.91 KiB/4.72 MiB                    [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 62.91 KiB/4.72 MiB                    [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 78.91 KiB/4.72 MiB                    [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 94.91 KiB/4.72 MiB                    [1A
[2K[1A[37m⠙[0m [2mPre

In [1]:
from datasets import load_dataset, DownloadConfig

# Original hosting location, kept for reference only (not used):
# data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
# Copy of the same-named file hosted on the Hugging Face Hub; this is the URL actually loaded.
data_files = "https://huggingface.co/datasets/casinca/PUBMED_title_abstracts_2019_baseline/resolve/main/PUBMED_title_abstracts_2019_baseline.jsonl.zst"

# Load the JSON Lines file as the "train" split. With streaming=True the call
# returns an IterableDataset (see the cell output).
pubmed_dataset = load_dataset(
    "json",
    data_files=data_files,
    split="train",
    download_config=DownloadConfig(delete_extracted=True),
    streaming=True,
)
pubmed_dataset

  from .autonotebook import tqdm as notebook_tqdm


IterableDataset({
    features: Unknown,
    num_shards: 1
})

In [2]:
# Pull the first record from the streamed dataset to inspect its fields.
next(iter(pubmed_dataset))

{'meta': {'pmid': 11409574, 'language': 'eng'},
 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen published studies were found that i