In [13]:
from pathlib import Path
import glob
import os

In [12]:
from datasets import load_dataset, concatenate_datasets

In [2]:
from huggingface_hub import snapshot_download

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Local folder that receives the FineMed-SFT snapshot (relative to the notebook's working directory).
image_dir = Path("../data/finemed_sft/")
# Create the folder rather than asserting on it, so a fresh checkout can run
# top to bottom without a manual `mkdir` before the download cell.
image_dir.mkdir(parents=True, exist_ok=True)

In [10]:
# Fetch every file of the FineMed-SFT dataset repo into the local data folder.
# The options are collected first so they can be read (and tweaked) in one place.
download_options = {
    "repo_id": "hongzhouyu/FineMed-SFT",
    "repo_type": "dataset",
    "allow_patterns": "*",
    "local_dir": image_dir,
    "local_dir_use_symlinks": False,
}
# `image_dir` is rebound to whatever snapshot_download returns.
image_dir = snapshot_download(**download_options)

Fetching 39 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:43<00:00,  1.10s/it]


In [27]:
# Load every FineMed-SFT JSONL shard except the excluded ones, one Dataset per file.
base_path = "../data/finemed_sft"
excluded_files = ["SFTo1.jsonl"]

# glob yields files in arbitrary filesystem order; sorting fixes the order in which
# the shards are later concatenated, so a seeded shuffle downstream is reproducible.
all_files = sorted(glob.glob(os.path.join(base_path, "**/*.jsonl"), recursive=True))
# Test exclusions against the file name only, so a directory whose name contains an
# excluded pattern does not silently drop every file beneath it.
filtered_files = [
    f
    for f in all_files
    if not any(excl in os.path.basename(f) for excl in excluded_files)
]
# Fail early with a clear message instead of an obscure error at concatenation time.
assert filtered_files, f"No .jsonl files found under {base_path!r}"

stage1_datasets = []
for file in filtered_files:
    ds = load_dataset("json", data_files=file, split="train")
    stage1_datasets.append(ds)

Generating train split: 304858 examples [00:02, 149531.23 examples/s]
Generating train split: 1728 examples [00:00, 82956.66 examples/s]
Generating train split: 11477 examples [00:00, 89686.43 examples/s]
Generating train split: 3390 examples [00:00, 105014.81 examples/s]
Generating train split: 78665 examples [00:00, 132340.53 examples/s]
Generating train split: 2068 examples [00:00, 81572.24 examples/s]
Generating train split: 10152 examples [00:00, 97858.93 examples/s]
Generating train split: 7102 examples [00:00, 127593.37 examples/s]
Generating train split: 4766 examples [00:00, 122483.57 examples/s]
Generating train split: 4662 examples [00:00, 114893.53 examples/s]
Generating train split: 7559 examples [00:00, 132774.72 examples/s]
Generating train split: 8667 examples [00:00, 127457.97 examples/s]
Generating train split: 21227 examples [00:00, 140632.09 examples/s]
Generating train split: 4923 examples [00:00, 132492.50 examples/s]
Generating train split: 2128 examples [00:00, 

In [51]:
# Show the summary (features and row count) of the combined dataset.
# NOTE(review): `full_dataset` is assigned in a cell that appears further down the
# notebook; on a fresh top-to-bottom run this cell raises NameError. Move it below
# that assignment.
full_dataset

Dataset({
    features: ['text', 'instruction', 'complexity', 'quality', 'language', 'response', 'instruction_type'],
    num_rows: 714992
})

In [35]:
# Merge all loaded shards and shuffle with a fixed seed so the subset below is repeatable.
full_dataset = concatenate_datasets(dsets=stage1_datasets).shuffle(seed=42069)
# Keep at most 200,000 rows. Clamping to the dataset length avoids an out-of-range
# index error when fewer rows were loaded (e.g. after excluding more files).
stage1_dataset = full_dataset.select(range(min(200000, len(full_dataset))))

In [52]:
# Publishing step for the stage-1 subset, left disabled (commented out): it would make a
# 90/10 train/test split with a fixed seed and push the result to the Hub.
# stage1_dataset.train_test_split(test_size=0.1, seed=42069).push_to_hub("drmaniak/DermaCOT-SFT1")

In [54]:
# Location of the dermatology shard inside the downloaded FineMed-SFT snapshot.
# Kept under its own name so `derm` only ever refers to the loaded Dataset.
derm_path = Path("../data/finemed_sft/OtherDepartments/DermatologyandVenereology.jsonl")

# Load the shard as a single "train" split; the bare `derm` below displays its summary.
derm = load_dataset("json", data_files=str(derm_path), split="train")
derm

Dataset({
    features: ['text', 'instruction', 'complexity', 'quality', 'language', 'response', 'instruction_type'],
    num_rows: 10171
})

In [55]:
# Pull the SkinCap-Visual image/caption dataset from the Hub (all available splits).
scap = load_dataset(path="drmaniak/SkinCap-Visual")

Generating train split: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:01<00:00, 3868.22 examples/s]


In [56]:
# Show the splits, column names and row counts of the SkinCap dataset.
scap

DatasetDict({
    train: Dataset({
        features: ['image', 'skincap_file_path', 'disease', 'caption', 'remark', 'source', 'skin_tone', 'not_considered', 'filename', 'label'],
        num_rows: 4000
    })
})

In [57]:
# Inspect the first dermatology record to see what the text fields look like.
derm[0]

{'text': 'CBD Skin Care, Topicals And Lotions\nWith more research underway surrounding the endocannabinoid system and cannabinoids such as CBD and THC, many new methods for infusing products with cannabinoids are being developed. One of the most intriguing is in using them within skin care products.\nCannabis topicals come in the form of lotion, salves, balms, and oils, all of which have been infused with cannabinoids. They absorb into the skin directly upon application. Topicals have only grown in popularity over the years. In fact, THC and CBD has even been utilized in general cosmetic products and even personal lubricants.\nCBD Skin Care Lotion in Lexington\nApplying cannabinoids topically will allow them to absorb into the body quite quickly. The relief is also more focused, generally only affecting the specific area where the product was used. On the other hand, smoking or ingesting marijuana affects the entire body, and it can take up to two hours for any effects to be felt.\nCBD

In [58]:
# Column names and dtypes of the dermatology shard; this is the schema the
# SkinCap records are mapped onto.
derm.features

{'text': Value(dtype='string', id=None),
 'instruction': Value(dtype='string', id=None),
 'complexity': Value(dtype='int64', id=None),
 'quality': Value(dtype='int64', id=None),
 'language': Value(dtype='string', id=None),
 'response': Value(dtype='string', id=None),
 'instruction_type': Value(dtype='string', id=None)}

In [65]:
def process_skincap(example):
    """Convert one SkinCap-Visual record into the FineMed-SFT column layout.

    Args:
        example: Mapping for a single record. The keys ``skincap_file_path``,
            ``caption``, ``image`` and ``label`` are read.

    Returns:
        dict with the FineMed-SFT fields (``text``, ``instruction``,
        ``complexity``, ``quality``, ``language``, ``response``,
        ``instruction_type``) plus ``image`` and ``is_image``.
    """
    return {
        "text": f"Image: {example['skincap_file_path']}\nCaption: {example['caption']}",
        "instruction": "I've been experiencing some skin issues, can you have a look at this photo of my skin and tell me if anything's wrong?",
        "image": example["image"],
        # Fixed scores assigned to every SkinCap record.
        "complexity": 8,
        "quality": 8,
        # Spelled "language" to match the text dataset's column; the earlier
        # "langauge" key produced a separate, misspelled column instead.
        "language": "en",
        "response": f"{example['label']}",
        "instruction_type": "common",
        "is_image": True
    }

def process_derm(example):
    """Add the image-related columns to a text-only dermatology record.

    Args:
        example: Mapping for a single record; all of its fields are kept as-is.

    Returns:
        dict with the original fields plus ``image`` set to ``None`` and
        ``is_image`` set to ``False``, because no image is attached.
    """
    return {
        **example,
        "image": None,
        # These rows carry no image, so the flag must be False (it was True before).
        "is_image": False,
    }
    

In [69]:
# Bring both sources onto the shared column layout, one record at a time.
# Text-only dermatology rows get the image-related columns added.
derm_dataset = derm.map(function=process_derm)
# SkinCap rows get the text/instruction/response columns built from their caption and label.
scap_dataset = scap.map(function=process_skincap)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [07:34<00:00,  8.81 examples/s]


In [74]:
# Stage 2 = processed dermatology text rows followed by the processed SkinCap train split.
stage2_parts = [derm_dataset, scap_dataset["train"]]
stage2_dataset = concatenate_datasets(dsets=stage2_parts)

In [76]:
# Publish the combined stage-2 dataset to the Hub; the returned commit info is shown as the cell output.
stage2_dataset.push_to_hub(repo_id="drmaniak/DermaCOT-SFT2")

Uploading the dataset shards:   0%|                                                                                                                         | 0/3 [00:00<?, ?it/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4724/4724 [00:00<00:00, 90318.09 examples/s]

Creating parquet from Arrow format: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:00<00:00, 733.24ba/s]
Uploading the dataset shards:  33%|█████████████████████████████████████▋                                                                           | 1/3 [00:02<00:04,  2.02s/it]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4724/4724 [00:00<00:00, 73027.76 examples/s]

Creating parquet from Arrow format: 100%|██████████████████████████████████████████████████████████████

CommitInfo(commit_url='https://huggingface.co/datasets/drmaniak/DermaCOT-SFT2/commit/7b56d0ffbda52029b3375b852d9c91a8306b9dda', commit_message='Upload dataset', commit_description='', oid='7b56d0ffbda52029b3375b852d9c91a8306b9dda', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/drmaniak/DermaCOT-SFT2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='drmaniak/DermaCOT-SFT2'), pr_revision=None, pr_num=None)

In [73]:
# Inspect the first SkinCap record after mapping.
# NOTE(review): the displayed `filename` field contains an absolute local path;
# consider clearing this output (or dropping the column) before sharing.
scap_dataset["train"][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=798x618>,
 'skincap_file_path': '1.png',
 'disease': 'melanoma-in-situ',
 'caption': 'Irregular brown patches are present on the sole of the foot, which may indicate a melanocytic nevus or melanoma. It is recommended to undergo pathological or dermatoscopy examination for a more definitive diagnosis.',
 'remark': 'nan',
 'source': 'ddi',
 'skin_tone': '56.0',
 'not_considered': 0,
 'filename': '/home/fullldiesel/Software/learning/MLX/Coda/derma-cot/data/SkinCAP_images/1.png',
 'label': "The image depicts the sole of a human foot, which exhibits several irregular brown patches. These patches vary in size and shape, with some appearing more defined and others more diffuse. The skin surrounding these patches shows signs of dryness and slight scaling, which may indicate chronic irritation or friction. The overall texture of the foot's sole appears rough and uneven, with visible lines and creases typical of the plantar surface.

In [60]:
# Inspect the first raw SkinCap record (before mapping) for comparison.
# NOTE(review): the displayed `filename` field contains an absolute local path;
# consider clearing this output before sharing the notebook.
scap["train"][0]

{'image': <PIL.Image.Image image mode=RGBA size=798x618>,
 'skincap_file_path': '1.png',
 'disease': 'melanoma-in-situ',
 'caption': 'Irregular brown patches are present on the sole of the foot, which may indicate a melanocytic nevus or melanoma. It is recommended to undergo pathological or dermatoscopy examination for a more definitive diagnosis.',
 'remark': 'nan',
 'source': 'ddi',
 'skin_tone': '56.0',
 'not_considered': 0,
 'filename': '/home/fullldiesel/Software/learning/MLX/Coda/derma-cot/data/SkinCAP_images/1.png',
 'label': "The image depicts the sole of a human foot, which exhibits several irregular brown patches. These patches vary in size and shape, with some appearing more defined and others more diffuse. The skin surrounding these patches shows signs of dryness and slight scaling, which may indicate chronic irritation or friction. The overall texture of the foot's sole appears rough and uneven, with visible lines and creases typical of the plantar surface.\n\nGiven the cl