##### This notebook processes the synthetic rheology data (JSON labels and their corresponding plots) into a Hugging Face Dataset and DatasetDict

##### push_to_hub requires a prior Hugging Face CLI login  
Run `hf auth whoami` to confirm that you are logged in.

In [None]:
import json 
from pathlib import Path 
import math 
from huggingface_hub import notebook_login
from PIL import Image 
from datasets import Dataset, DatasetDict


##### Rename the image and label files on disk with a `sample_` prefix for uniformity and clarity

In [None]:
train_images = Path('data/rheo_sigmoid/train/images')
train_labels = Path('data/rheo_sigmoid/train/labels')
train_lables = train_labels  # alias: keeps the original (misspelled) name available
val_images = Path('data/rheo_sigmoid/val/images')
val_labels = Path('data/rheo_sigmoid/val/labels')

img_ext = '.png'
label_ext = '.json'

prefix = 'sample_'

# Directory -> extension of the files inside it that should carry the prefix.
data_dict = {val_images: img_ext,
             val_labels: label_ext,
             train_images: img_ext,
             train_labels: label_ext,
             }


def add_prefix_to_files(directory, ext, prefix):
    """Rename every ``*ext`` file in ``directory`` so its name starts with ``prefix``.

    Files whose name already starts with ``prefix`` are left untouched, so the
    cell can be re-run without producing ``sample_sample_...`` names.

    Args:
        directory: ``Path`` of the folder to process; a missing folder is a no-op.
        ext: File extension including the dot, e.g. ``'.png'``.
        prefix: String to prepend to each file name.

    Returns:
        List of the new ``Path`` objects, one per file that was renamed.
    """
    if not directory.is_dir():
        return []

    renamed = []
    # Snapshot the listing before renaming so that freshly renamed files can
    # never be fed back into the iteration.
    for f in sorted(directory.glob(f'*{ext}')):
        if not f.is_file() or f.name.startswith(prefix):
            continue
        new_file_name = f.with_name(prefix + f.name)
        f.rename(new_file_name)
        renamed.append(new_file_name)
    return renamed


for path_, ext in data_dict.items():
    add_prefix_to_files(path_, ext, prefix)


#### Create a JSONL file in which every line has the following structure
{ids: sample_123, 
image_path: Path, 
prompt: "You are....", 
target: json_label}

In [None]:

def generate_jsonl_inputs(img_path, label_path):
    """Collect sample ids, image paths and targets for one dataset split.

    Both directories are listed in sorted (by file stem) order so that the
    i-th image lines up with the i-th label; ``glob`` alone gives no ordering
    guarantee. This assumes an image and its label share the same file stem
    -- TODO confirm against the data generator.

    Args:
        img_path: ``Path`` of the directory holding the plot images. Every
            regular file in it is treated as an image.
        label_path: ``Path`` of the directory holding the ``*.json`` labels.
            Each label must provide ``figure_id`` and a ``materials`` list whose
            entries have ``label_in_legend``, ``Gp_plateau_Pa``, ``tau_y_Pa``
            and ``tau_f_Pa``.

    Returns:
        Tuple ``(sample_ids, imgs, targets)``:
        ``sample_ids`` is the ``figure_id`` of each label file, ``imgs`` the
        image paths as forward-slashed strings, and ``targets`` one list per
        label file containing a dict per material with the numeric values
        rounded down to integers. The flow-stress key is ``tau_f_Pa`` (it was
        previously misspelled ``tay_f_Pa``), so files generated with the old
        key need to be regenerated.
    """
    def by_stem(path):
        return path.stem

    img_files = sorted((f for f in img_path.glob('*') if f.is_file()), key=by_stem)
    label_files = sorted(label_path.glob('*.json'), key=by_stem)

    # as_posix() turns a Windows path object into a forward-slashed string.
    imgs = [f.as_posix() for f in img_files]

    sample_ids = []
    targets = []  # one list of per-material dicts for every label file

    for f in label_files:
        deserialized_data = json.loads(f.read_text(encoding='utf-8'))
        sample_ids.append(deserialized_data['figure_id'])

        targets.append([
            {'legend_entry': material['label_in_legend'],
             'Gp_plateau_Pa': math.floor(material['Gp_plateau_Pa']),
             'tau_y_Pa': math.floor(material['tau_y_Pa']),
             'tau_f_Pa': math.floor(material['tau_f_Pa'])}
            for material in deserialized_data['materials']
        ])

    return sample_ids, imgs, targets


def generate_jsonl_dict(sample_ids, imgs, prompt, targets):
    """Zip the per-sample inputs into a list of JSONL-ready records.

    Args:
        sample_ids: Sequence of sample identifiers.
        imgs: Sequence of image paths, aligned with ``sample_ids``.
        prompt: Either a single prompt string, used for every sample, or a
            sequence of prompt strings that is cycled over the samples (a
            one-element list therefore repeats the same prompt).
        targets: Sequence of targets, aligned with ``sample_ids``.

    Returns:
        List of dicts with the keys ``ids``, ``image_path``, ``prompt`` and
        ``target``, one per sample and in input order.

    Raises:
        ValueError: If the three aligned sequences differ in length, or if
            ``prompt`` is an empty sequence while there are samples.
    """
    if not (len(sample_ids) == len(imgs) == len(targets)):
        raise ValueError(
            'sample_ids, imgs and targets must have the same length, got '
            f'{len(sample_ids)}, {len(imgs)} and {len(targets)}'
        )

    # A bare string must not be indexed: prompt[i] would be a single character.
    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    if sample_ids and not prompts:
        raise ValueError('prompt must contain at least one prompt string')

    return [
        {'ids': sample_id,
         'image_path': img,
         'prompt': prompts[i % len(prompts)],
         'target': target}
        for i, (sample_id, img, target) in enumerate(zip(sample_ids, imgs, targets))
    ]


def generate_jsonl(output_file, data):
    """Write ``data`` to ``output_file`` in JSON Lines format.

    Args:
        output_file: Path (``str`` or ``Path``) of the file to create or
            overwrite.
        data: Iterable of JSON-serialisable items; each one becomes one line.
    """
    # Explicit UTF-8 keeps the file independent of the platform's locale
    # encoding and matches how the file is read back later in this notebook.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in data)



In [None]:
# Instruction attached to every sample. It is a one-element list because
# generate_jsonl_dict expands it to one prompt per sample.
prompt = ["You are a rheology assistant. What are the storage modulus, yield stress, and flow stress for each material in the rheology plot? Extract the rheological parameters and respond strictly in json."]

# Locations of the training split and of the train.jsonl file to produce.
train_dir = Path('data/rheo_sigmoid/train')
train_images = Path('data/rheo_sigmoid/train/images')
train_labels = Path('data/rheo_sigmoid/train/labels')
train_file = train_dir / 'train.jsonl'

# Gather the inputs, combine them into records, then write one record per line.
sample_ids, imgs, targets = generate_jsonl_inputs(train_images, train_labels)
train_data = generate_jsonl_dict(sample_ids, imgs, prompt, targets)
generate_jsonl(output_file=train_file, data=train_data)

In [None]:
# Locations of the validation split and of the val.jsonl file to produce.
val_dir = Path('data/rheo_sigmoid/val')
val_images = Path('data/rheo_sigmoid/val/images')
val_labels = Path('data/rheo_sigmoid/val/labels')
val_file = val_dir / 'val.jsonl'

# Gather the inputs, combine them into records, then write one record per line.
sample_ids, imgs, targets = generate_jsonl_inputs(val_images, val_labels)
val_data = generate_jsonl_dict(sample_ids, imgs, prompt, targets)
generate_jsonl(output_file=val_file, data=val_data)

I quickly realized that the JSONL files store my local image paths.  
Building the dataset with the images embedded and pushing it to the Hub lets it be loaded in Colab.  
This removes the dependency on local file paths.

In [None]:
# Plan for this stage:
#   1. read the train and val .jsonl files line by line
#   2. open the image behind each image_path and store the PIL image itself
#   3. turn the resulting in-memory lists into datasets with Dataset.from_list
#   4. push those datasets to the Hugging Face Hub

# Instruction stored in the 'prompt' field of every dataset row.
prompt = ["You are a rheology assistant. What are the storage modulus, yield stress, and flow stress for each material in the rheology plot? Extract the rheological parameters and respond strictly in json."]

# JSONL inputs, as plain string paths.
val_file = 'data/rheo_sigmoid/val/val.jsonl'
train_file = 'data/rheo_sigmoid/train/train.jsonl'

def read_json_lines(json_file):
    """Lazily yield one decoded JSON value per non-blank line of a JSONL file.

    Args:
        json_file: Path of a UTF-8 encoded JSON Lines file.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            # json.loads ignores the trailing newline of a populated line, but
            # an empty or whitespace-only line (e.g. a stray blank line at the
            # end of the file) would raise, so those are skipped.
            if line.strip():
                yield json.loads(line)


def generate_hf_datset(jsonl_file):
    """Build the in-memory list of rows for one split from its JSONL file.

    Args:
        jsonl_file: Path of a JSONL file whose records have the keys ``ids``,
            ``image_path`` and ``target``.

    Returns:
        List of dicts with the keys ``id``, ``image`` (an RGB PIL image loaded
        from ``image_path``), ``prompt`` and ``target`` (the record's target
        serialised to a JSON string).
    """
    data_struct = []

    for sample in read_json_lines(jsonl_file):
        # The context manager guarantees the file handle is released; convert()
        # returns a separate converted copy, so the image stays usable afterwards.
        with Image.open(sample['image_path']) as source:
            img = source.convert("RGB")

        data_struct.append({
            'id': sample['ids'],
            'image': img,
            # NOTE(review): this is the notebook-level `prompt` list, not the
            # record's own sample['prompt'] string, so every row stores a
            # one-element list. Left as is to keep the dataset schema
            # unchanged -- confirm whether a plain string is wanted.
            'prompt': prompt,
            'target': json.dumps(sample['target']),
        })

    return data_struct


# Load both splits into memory, training split first and validation second.
train_data_hf, val_data_hf = [
    generate_hf_datset(split_file) for split_file in (train_file, val_file)
]


In [None]:
# Wrap each in-memory list of rows in a Hugging Face Dataset via from_list().
train_data_hf_dict = Dataset.from_list(train_data_hf)
val_data_hf_dict = Dataset.from_list(val_data_hf)

# Register the two splits under the names they carry in the DatasetDict.
rheo_ds_dict = DatasetDict()
rheo_ds_dict['train'] = train_data_hf_dict
rheo_ds_dict['validation'] = val_data_hf_dict

# Uploading requires an authenticated Hugging Face session (huggingface-cli login).
rheo_ds_dict.push_to_hub("dchip95/rheology_dataset_pixels")
