### Push the dataset to the Hugging Face Hub

In [None]:
from huggingface_hub import HfApi
import os 

# Authenticate with the token read from the HF_TOKEN environment variable and
# upload the local dataset folder to the dataset repository.
api = HfApi(token=os.environ.get("HF_TOKEN"))
api.upload_folder(
    repo_type="dataset",
    repo_id="dchip95/synthetic-oscillatory-rheology-vlm",
    folder_path="data/rheo_sigmoid",
)


##### push the train and validation folders to the Hub for future loads and sharing

In [None]:
# create a dataset dict and then push to HF

##### check GPU VRAM and clear if in use

In [None]:
import torch 

# Only query CUDA memory counters when a CUDA device is actually available.
if torch.cuda.is_available():
    allocated_mb = torch.cuda.memory_allocated() // 1024 // 1024
    # Anything above 10 MB is treated as "in use" and triggers a cleanup.
    if allocated_mb > 10:
        reserved_mb = torch.cuda.memory_reserved() // 1024 // 1024
        print(f"{allocated_mb} MB currently allocated")
        print(f"{reserved_mb} MB currently reserved")
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

#### Import libraries and set torch device properties 

In [None]:
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM, AutoProcessor
from PIL import Image
import cv2
import os
import time
%matplotlib inline

# Alternative checkpoint kept for reference:
# model_id = "OpenGVLab/InternVL3-2B"
model_id = 'microsoft/Florence-2-base-ft'
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Florence-2 workflow used in this notebook:
#   1. load the processor
#   2. pass the raw inputs to the processor
#   3. the processor output becomes the model input
#   4. call generate() on the model object

# Request 4-bit weight loading for the model below.
bnb_config = BitsAndBytesConfig(load_in_4bit=True,)

# Load the checkpoint with the 4-bit config and switch it to eval mode.
# NOTE(review): the model stays in eval mode after this cell; any fine-tuning
# loop presumably has to call model.train() itself — confirm.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                quantization_config=bnb_config,
                                low_cpu_mem_usage=True, 
                                trust_remote_code=True).eval()

# NOTE(review): depending on the installed transformers / bitsandbytes
# versions, calling .to() on a bitsandbytes-quantized model may raise —
# confirm against the versions pinned for this notebook.
model.to(device)

# Processor that belongs to the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


#### Following https://huggingface.co/blog/finetune-florence2, freeze the vision encoder to make fine-tuning less expensive

In [None]:
# Freeze the vision encoder so its parameters receive no gradients.
# Autograd only looks at `requires_grad`; assigning any other attribute name
# (such as `is_trainable`) is accepted silently by Python but freezes nothing.
for param in model.vision_tower.parameters():
    param.requires_grad = False

#### Now we can begin the finetune process

#### Format the image and label paths with sample_ prefix for uniformity and clarity 

In [None]:
import json 
from pathlib import Path 

train_images = Path('data/rheo_sigmoid/train/images')
train_labels = Path('data/rheo_sigmoid/train/labels')
val_images = Path('data/rheo_sigmoid/val/images')
val_labels = Path('data/rheo_sigmoid/val/labels')

img_ext = '.png'
label_ext = '.json'

prefix = 'sample_'

# Maps every data directory to the extension of the files it contains.
data_dict = {val_images: img_ext,
             val_labels: label_ext,
             train_images: img_ext,
             train_labels: label_ext,
             }


def add_prefix_to_files(directory, ext, prefix):
    """Prepend *prefix* to every ``*ext`` file directly inside *directory*.

    Files whose name already starts with *prefix* are left untouched, so the
    operation can be repeated (e.g. when the notebook cell is re-run) without
    producing names such as ``sample_sample_1.png``.

    Args:
        directory: ``pathlib.Path`` of the folder to process.
        ext: File extension including the dot, e.g. ``'.png'``.
        prefix: String to put in front of each file name.

    Returns:
        List of the new paths of the files that were renamed.
    """
    renamed = []
    # sorted() materialises the listing before the first rename, so the
    # renames cannot influence the iteration.
    for f in sorted(directory.glob(f'*{ext}')):
        if not f.is_file() or f.name.startswith(prefix):
            continue
        target = f.with_name(prefix + f.name)
        f.rename(target)
        renamed.append(target)
    return renamed


for path_, ext in data_dict.items():
    add_prefix_to_files(path_, ext, prefix)


#### Create a JSONL file in which every line is one JSON object with the following structure
{ids: sample_123, 
image_path: Path, 
prompt: "You are....", 
target: json_label}

In [None]:
import math 
from pathlib import Path
import json 


def generate_jsonl_inputs(img_path, label_path):
    """Collect sample ids, image paths and per-material targets for one split.

    Both directory listings are sorted by file stem. Glob order is
    filesystem dependent, so without sorting the i-th image and the i-th
    label are not guaranteed to describe the same sample.
    NOTE(review): the pairing assumes an image and its label share the same
    file stem — confirm against the dataset layout.

    Args:
        img_path: ``pathlib.Path`` of the directory holding the images.
        label_path: ``pathlib.Path`` of the directory holding the ``.json``
            label files.

    Returns:
        Tuple ``(sample_ids, imgs, targets)``:
        ``sample_ids`` holds the ``figure_id`` of every label file, ``imgs``
        the forward-slashed path strings of every file in *img_path*, and
        ``targets`` one list per label file with a dict per material
        (``legend_entry`` plus the floored ``Gp_plateau_Pa``, ``tau_y_Pa``
        and ``tau_f_Pa`` values).
    """
    # as_posix() converts a Windows path object to a forward-slashed string.
    imgs = [
        f.as_posix()
        for f in sorted(img_path.glob('*'), key=lambda p: p.stem)
        if f.is_file()
    ]

    sample_ids = []
    targets = []

    for label_file in sorted(label_path.glob('*.json'), key=lambda p: p.stem):
        record = json.loads(label_file.read_text())
        sample_ids.append(record['figure_id'])
        targets.append([
            {
                'legend_entry': material['label_in_legend'],
                'Gp_plateau_Pa': math.floor(material['Gp_plateau_Pa']),
                'tau_y_Pa': math.floor(material['tau_y_Pa']),
                # Key previously misspelled as 'tay_f_Pa'.
                'tau_f_Pa': math.floor(material['tau_f_Pa']),
            }
            for material in record['materials']
        ])

    return sample_ids, imgs, targets


def generate_jsonl_dict(sample_ids, imgs, prompt, targets):
    """Build one record per sample for the JSONL manifest.

    Args:
        sample_ids: Sequence of sample identifiers; determines the number of
            records produced.
        imgs: Sequence of image paths, indexed in parallel with *sample_ids*.
        prompt: Either a single prompt string (used for every sample) or a
            list of prompts. A list is repeated ``len(sample_ids)`` times and
            indexed by sample position, so a one-element list yields the same
            prompt for every sample.
        targets: Sequence of targets, indexed in parallel with *sample_ids*.

    Returns:
        List of dicts with the keys ``'ids'``, ``'image_path'``, ``'prompt'``
        and ``'target'``.

    Raises:
        IndexError: If *imgs* or *targets* is shorter than *sample_ids*, or
            if *prompt* is an empty list while *sample_ids* is not empty.
    """
    # A bare string would be repeated and then indexed character by
    # character, handing each sample a single letter; wrap it in a list.
    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    prompt_verbiage = prompts * len(sample_ids)

    return [
        {
            'ids': sample_id,
            'image_path': imgs[i],
            'prompt': prompt_verbiage[i],
            'target': targets[i],
        }
        for i, sample_id in enumerate(sample_ids)
    ]


def generate_jsonl(output_file, data):
    """Write every item of *data* to *output_file* as one JSON document per line.

    Args:
        output_file: Path of the file to create or overwrite.
        data: Iterable of JSON-serialisable objects.
    """
    with open(output_file, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)



##### generate train.jsonl

In [None]:
# Single-element list; generate_jsonl_dict repeats it once per sample.
prompt = ["You are a rheology assistant. Extract the rheological parameters and respond strictly in json."]

# Training split: images/ and labels/ live under train_dir, and train.jsonl
# is written next to them.
train_dir = Path('data/rheo_sigmoid/train')
train_images = train_dir / 'images'
train_labels = train_dir / 'labels'
train_file = train_dir / 'train.jsonl'

sample_ids, imgs, targets = generate_jsonl_inputs(label_path=train_labels, img_path=train_images)
train_data = generate_jsonl_dict(sample_ids, imgs, prompt, targets)
generate_jsonl(train_file, train_data)

#### generate val.jsonl

In [None]:
# Validation split: images/ and labels/ live under val_dir, and val.jsonl is
# written next to them. Reuses the `prompt` list defined for the train split.
val_dir = Path('data/rheo_sigmoid/val')
val_images = val_dir / 'images'
val_labels = val_dir / 'labels'
val_file = val_dir / 'val.jsonl'

sample_ids, imgs, targets = generate_jsonl_inputs(label_path=val_labels, img_path=val_images)
val_data = generate_jsonl_dict(sample_ids, imgs, prompt, targets)
generate_jsonl(val_file, val_data)

#### setup pytorch data class and data loader

In [None]:
from torch.utils.data import Dataset 
import json 
from PIL import Image

class RheologyDataset(Dataset):
    """Dataset backed by a JSONL manifest with one sample per line.

    Every non-blank line must be a JSON object providing the keys
    ``'image_path'``, ``'prompt'`` and ``'target'``.
    """

    def __init__(self, data_path):
        """Read the whole manifest at *data_path* into memory.

        Args:
            data_path: Path of the ``.jsonl`` manifest file.
        """
        self.data = []
        with open(data_path, 'rb') as f:
            for line in f:
                line = line.strip()
                # Blank lines (e.g. a stray trailing newline) are not valid
                # JSON documents; skip them instead of failing the load.
                if line:
                    self.data.append(json.loads(line))

    def __len__(self):
        """Return the number of samples in the manifest."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, prompt, target)`` for the sample at *idx*.

        The image is loaded from the stored path and converted to RGB;
        prompt and target are returned exactly as stored in the manifest.
        """
        sample = self.data[idx]

        # 'image_path' is a path string, so the image has to be opened here.
        # convert() returns an independent image, which lets the context
        # manager release the underlying file right away.
        with Image.open(sample['image_path']) as raw:
            img = raw.convert("RGB")

        return img, sample['prompt'], sample['target']


In [None]:
import os 
from torch.utils.data import DataLoader
from tqdm import tqdm 
from transformers import AdamW, get_scheduler
from pathlib import Path

# Collate function that runs the Florence-2 processor over a whole batch,
# since samples differ in image size, prompt and target length.
def collate_fn(batch):
    """Turn a list of ``(image, prompt, target)`` samples into model inputs.

    Returns:
        Tuple ``(inputs, targets)``: the processor output moved to ``device``
        and the untouched targets of the batch as a tuple.
    """
    images, prompts, targets = zip(*batch)
    encoded = processor(
        images=list(images),
        text=list(prompts),
        padding=True,
        return_tensors="pt",
    )
    inputs = encoded.to(device)
    return inputs, targets

# Manifests written by the train.jsonl / val.jsonl cells.
train_file = Path('data/rheo_sigmoid/train/train.jsonl')
val_file = Path('data/rheo_sigmoid/val/val.jsonl')

train_dataset = RheologyDataset(train_file)
val_dataset = RheologyDataset(val_file)

# Shuffle only the training data. Validation runs in a fixed order so that
# evaluation batches are reproducible from run to run.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

