# GalaxyZoo

After the data have been downloaded for the website, we will show you how to setup a data pipeline in tensorflow using tf.data


In [1]:

import glob
import pandas as pd
from torchvision import transforms
from torch.utils.data import DataLoader

from utils import trim_file_list, img_label, CustomDataset

In [10]:
# Path to data and labels
img_dir = "data/images/images_training_rev1"
label_path = "data/labels.csv"
val_split = 0.2

In [14]:
# Get labels for all images by ID
# set_index means we can use loc for the exact ID rather than order in file
# only interested in subset of columns
labels_df = pd.read_csv(label_path).set_index('GalaxyID')[['Class1.1','Class1.2','Class1.3']]

In [16]:
# get list of files
files = glob.glob(f'{img_dir}/*')
files = [str(f) for f in files]
files = [f.replace("\\", "/") for f in files]
print(files[:5])

['data/images/images_training_rev1/100008.jpg', 'data/images/images_training_rev1/100023.jpg', 'data/images/images_training_rev1/100053.jpg', 'data/images/images_training_rev1/100078.jpg', 'data/images/images_training_rev1/100090.jpg']


In [17]:
# Trim files
files = trim_file_list(files, labels_df=labels_df)
print(files[:5])


['data/images/images_training_rev1/100008.jpg', 'data/images/images_training_rev1/100023.jpg', 'data/images/images_training_rev1/100053.jpg', 'data/images/images_training_rev1/100078.jpg', 'data/images/images_training_rev1/100090.jpg']


In [24]:
# Create tensorflow datset from list of IDs
labels = [list(img_label(f, labels_df=labels_df).values) for f in files]

In [25]:
labels

[[np.float64(0.383147), np.float64(0.616853), np.float64(0.0)],
 [np.float64(0.327001), np.float64(0.663777), np.float64(0.009222)],
 [np.float64(0.765717), np.float64(0.177352), np.float64(0.056931)],
 [np.float64(0.693377), np.float64(0.238564), np.float64(0.068059)],
 [np.float64(0.933839), np.float64(0.0), np.float64(0.066161)],
 [np.float64(0.738832), np.float64(0.238159), np.float64(0.023009)],
 [np.float64(0.462492), np.float64(0.456033), np.float64(0.081475)],
 [np.float64(0.687783), np.float64(0.288344), np.float64(0.023873)],
 [np.float64(0.021834), np.float64(0.976952), np.float64(0.001214)],
 [np.float64(0.269843), np.float64(0.730157), np.float64(0.0)],
 [np.float64(0.429378), np.float64(0.524901), np.float64(0.045721)],
 [np.float64(0.330462), np.float64(0.669145), np.float64(0.000393)],
 [np.float64(0.448521), np.float64(0.551479), np.float64(0.0)],
 [np.float64(0.467625), np.float64(0.53148), np.float64(0.000895)],
 [np.float64(0.164391), np.float64(0.800759), np.float6

In [26]:
# Example usage
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Downsample
    transforms.ToTensor()
])

# Create PyTorch dataset from list of files, apply image loading and decoding
image_ds = CustomDataset(files, labels, transform=transform)

In [23]:
# Crete dataloader
dataloader = DataLoader(image_ds, batch_size=32, shuffle=True)

In [22]:
# Demo of accessing data, can use train_ds directly in model.fit
for x,y in dataloader:
    break
print(x.shape)

torch.Size([32, 3, 128, 128])


This is how to create a dataloader/dataset from the images.
Remember to split it in to `train` and `val` can be used when training and validating the model