<h1>Prepare Watchman Dataset</h1>
This notebook uses the Watchman dataset. See the fine-tuning section of readme.txt for details.<br>
It loads the dataset into a pandas DataFrame and stores it in a .pkl file for later use in the fine_tune.ipynb notebook.<br>


In [1]:
import os
import sys
import base64
from pathlib import Path
import pandas as pd
from datetime import datetime

In [2]:
# Settings that control which part of the training data gets loaded
dataset = "../.data/dataset"  # root folder of the dataset
chans = ["porch"]  # channels whose data is loaded
objs = ["person"]  # objects whose data is loaded
model_name = "ollama-complex"  # model interface name to use for inference experimentation

# Description text per channel, keyed by channel name
c_desc = {"porch": "Porch"}

# Description text per object, keyed by object name
o_desc = {"person": "a person"}

In [3]:
# The training data is collected in a pandas DataFrame, one row per image.
# Columns:
#   img    - image data encoded as base64
#   c_desc - image channel description
#   o_desc - object description
#   res    - expected result for the prompt to detect the object on the image

# Start from an empty frame that already carries the expected columns
df = pd.DataFrame(
    columns=[
        "img",
        "c_desc",
        "o_desc",
        "res",
    ]
)

# Define the add_row function
def add_row(dataframe, s, c, o, res):
    """Append one sample to ``dataframe`` in place.

    Args:
        dataframe: frame to extend; the new row is stored under the label
            ``len(dataframe)``.
        s: path of the sample folder; ``image.jpg`` is read from it.
        c: channel name, used as a key into the module-level ``c_desc``.
        o: object name, used as a key into the module-level ``o_desc``.
        res: expected result stored in the ``res`` column.

    Returns:
        None. The frame is modified in place.
    """
    # Read the image first so a missing file surfaces before any lookup
    encoded = base64.b64encode(Path(f"{s}/image.jpg").read_bytes())
    record = {
        "img": encoded.decode(),
        "c_desc": c_desc[c],
        "o_desc": o_desc[o],
        "res": res,
    }
    # Assigning to a label that does not exist yet enlarges the frame by one row
    next_label = len(dataframe)
    dataframe.loc[next_label] = record

# Add all the images from the dataset.
# Sample folders are looked up under <dataset>/<channel>/<object>/. Inside a
# sample folder:
#   - an entry named "skip" excludes the sample,
#   - an entry named "no" marks it as a negative example (res = "no"),
#   - otherwise it is a positive example (res = "yes").
for c in chans:
    for o in objs:
        # Named samples_dir rather than dir so the builtin dir() is not shadowed
        samples_dir = f"{dataset}/{c}/{o}"
        # os.scandir yields entries in arbitrary order; sorting the paths keeps
        # the row order of the resulting frame identical from run to run.
        # The with-block closes the directory iterator explicitly.
        with os.scandir(samples_dir) as entries:
            subdirs = sorted(entry.path for entry in entries if entry.is_dir())
        for s in subdirs:
            if os.path.exists(f"{s}/skip"):
                continue
            label = "no" if os.path.exists(f"{s}/no") else "yes"
            add_row(df, s, c, o, label)


In [4]:
# Both description columns are stored in lowercase
for text_col in ("c_desc", "o_desc"):
    df[text_col] = df[text_col].str.lower()

# Expected results start with a capital letter; str.capitalize also
# lowercases every character after the first one
df["res"] = df["res"].str.capitalize()

In [None]:
# Preview the first three rows of the assembled frame
df.head(3)

In [None]:
# Show pandas' per-column summary of the assembled frame
df.describe()

In [None]:
# Put the current date and time (down to the second) into the file name so
# that separate runs write to separate files
pickle_file = f"../.data/data-{datetime.now():%Y-%m-%d_%H-%M-%S}.pkl"
print(f"Pickle file name: {pickle_file}")
# Persist the frame to the pickle file named above
df.to_pickle(pickle_file)