In [1]:
import pandas as pd
import numpy as np
import ast

import torch
from torch.utils.data import Dataset, DataLoader, Subset
from torch import nn, optim
from torchvision import datasets, utils, models, transforms
# from torchinfo import summary
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR, ExponentialLR
import torch.optim as optim
from torchvision.transforms import v2
import multiprocessing

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from PIL import Image
import os
from tqdm import tqdm
import altair as alt
from tqdm import tqdm 

alt.data_transformers.enable("vegafusion")
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
labels_extended = pd.read_csv('data/selected_gene_df.csv')

clean_possible_genes = labels_extended.columns.to_list()[7:]
print(f'Number of labels: {len(clean_possible_genes)}')

train_df, test_df = train_test_split(labels_extended, train_size=0.85, random_state=123)
print(f'Shape of trainset: {train_df.shape}')

Number of labels: 30
Shape of trainset: (41416, 37)


In [3]:
class PythonGeneDataset(Dataset):
    def __init__(self, labels_df, img_dir, indices=None, transform=None):
        self.labels_df = labels_df
        if indices is not None:
            self.labels_df = self.labels_df.iloc[indices]
        self.img_dir = img_dir
        self.transform = transform
    
    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, f"{self.labels_df.iloc[idx, 0]}.png")
        image = Image.open(img_name)
        labels = torch.tensor(self.labels_df.iloc[idx, 7:].astype('float32').values)
        
        if self.transform:
            image = self.transform(image)

        return image, labels

In [4]:
transform = v2.Compose([
    v2.ToImage(),
    v2.Resize((480, 480)),
    v2.ToDtype(torch.float32, scale=True)
    ])
full_dataset = PythonGeneDataset(labels_df=train_df, img_dir='data/img/', transform=transform)

total_size = len(full_dataset)
train_size = int(0.8 * total_size)
valid_size = total_size - train_size
train_indices, valid_indices = torch.utils.data.random_split(np.arange(total_size), [train_size, valid_size])

train_dataset = Subset(full_dataset, train_indices)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                          num_workers=multiprocessing.cpu_count(), pin_memory=True)


In [5]:
mean = 0.
std = 0.
total_images_count = 0

for images, _ in tqdm(train_loader, desc="Calculating Mean and Std", unit="batch"):
    batch_samples = images.size(0)
    images = images.view(batch_samples, images.size(1), -1)
    mean += images.mean(2).sum(0)
    std += images.var(2).sum(0)
    total_images_count += batch_samples

mean /= total_images_count
std = torch.sqrt(std / total_images_count)

print("Mean: ", mean)
print("Std: ", std)

Calculating Mean and Std: 100%|██████████| 518/518 [12:20<00:00,  1.43s/batch]


Mean:  tensor([0.6007, 0.5679, 0.5206])
Std:  tensor([0.2411, 0.2392, 0.2479])
