In [14]:
import shutil
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imageio.v3 as iio

import torchdata.datapipes as dp

from sklearn.preprocessing import LabelEncoder
from streaming import StreamingDataset, MDSWriter

from streaming.base.util import clean_stale_shared_memory


from tqdm import tqdm
from typing import Callable, Any

from dotenv import load_dotenv
load_dotenv();

In [45]:
# Earlier dataset locations, kept commented out for reference only.
#IMAGENET = Path("/run/media/sambhav/2A2E24A52E246BCF/Users/SambhavChandra/datasets/imagenet/") 
#IMAGENET_ZIP = "C://Users//SambhavChandra//dev//datasets//imagenet-object-localization-challenge.zip"

# Root of the extracted ImageNet archive, resolved relative to the user's home.
IMAGENET = Path.home() / "dev" / "datasets" / "imagenet"
# Image folders for the two splits read by this notebook.
TRAIN_DIR = IMAGENET / "ILSVRC" / "Data" / "CLS-LOC" / "train"
VAL_DIR = IMAGENET / "ILSVRC" / "Data" / "CLS-LOC" / "val"
# Output root for the MDS shards. Uses the same "Data" casing as the split
# folders: on a case-sensitive filesystem "DATA" would be a separate
# directory tree rather than a sibling of CLS-LOC.
SHARDS = IMAGENET / "ILSVRC" / "Data" / "Shards"

In [6]:
def label_from_path(path: Path) -> str:
    """Return the name of the directory that directly contains ``path``.

    The full directory name is returned (``.name``), not ``.stem``: ``.stem``
    drops everything after the last dot, which would truncate a class folder
    such as ``v1.0`` to ``v1``. For dot-free folder names the result is the
    same as before.
    """
    return path.parent.name

def reset_dir(dir_path: Path) -> None:
    """Leave ``dir_path`` as an existing, empty directory.

    An existing directory at that location is removed with all of its
    contents and then recreated; missing parent directories are created too.
    """
    # is_dir() is already False for a path that does not exist, so a
    # separate exists() check is not needed.
    if dir_path.is_dir():
        shutil.rmtree(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)

# The first column of labels.csv is used as the index; its values are the
# class names.
labels = pd.read_csv("labels.csv", index_col=0)
class_names = labels.index.tolist()

# Fit the encoder on the class names so that string labels can later be
# transformed into integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(class_names)


In [7]:
# Load the training index, order it by label, renumber the rows from zero and
# turn each "path" entry into a path underneath TRAIN_DIR.
train = (
    pd.read_csv("train.csv", index_col=0)
    .sort_values("label")
    .reset_index(drop=True)
    .assign(path=lambda frame: frame["path"].apply(lambda rel: TRAIN_DIR / rel))
)

In [8]:
# Load the validation index, order it by label, renumber the rows from zero
# and turn each "path" entry into a path underneath VAL_DIR.
val = (
    pd.read_csv("val.csv", index_col=0)
    .sort_values("label")
    .reset_index(drop=True)
    .assign(path=lambda frame: frame["path"].apply(lambda rel: VAL_DIR / rel))
)

In [7]:
# Write the first 1000 rows of `train` as MDS shards under SHARDS / "train".
df = train[:1000]
local_shards = SHARDS / "train"

# Column name -> MDS encoding for every sample written below.
dtypes = {"image": "bytes", "label": "int"}

# Encode every label in a single call instead of one transform() per row.
label_ints = label_encoder.transform(df["label"])

# Row indices whose image could not be read or re-encoded; inspect after the run.
failed_indices = []

# Start from an empty output directory so old shards never mix with new ones.
reset_dir(local_shards)
with MDSWriter(out = local_shards.as_posix(), columns = dtypes) as out:
    for (idx, example), label_int in tqdm(zip(df.iterrows(), label_ints), total=len(df)):
        try:
            # Decode the file, then re-encode it as JPEG bytes in memory.
            image = iio.imread(example.path, extension=".jpg")
            image_bytes = iio.imwrite("<bytes>", image, extension=".jpg")
        except Exception as err:
            # Skip this row entirely. Falling through would either raise
            # NameError (first row) or write the previous sample's bytes
            # under this row's label.
            print(f"skipping row {idx} ({example.path}): {err!r}")
            failed_indices.append(idx)
            continue

        sample = {
            "image": image_bytes,
            # Cast the numpy integer to a plain Python int before writing.
            "label": int(label_int)
        }
        out.write(sample)

100%|██████████| 1000/1000 [00:06<00:00, 162.84it/s]
