In [40]:
import msprime, pyslim
import tskit
import json
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import itertools
import math

import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import os
import gpustat
import pandas as pd
from torch.utils.data import DataLoader, random_split, TensorDataset
from torch import nn
from torchvision.io import read_image

import sklearn
import sklearn.model_selection

# Seed the shared NumPy generator so any stochastic step driven by `rng` gives
# the same result after "Restart Kernel -> Run All". The value 42 is the same
# seed this notebook passes to torch's generator for random_split.
SEED = 42
rng = np.random.default_rng(SEED)

In [50]:
# Pick the GPU with the smallest used/total memory ratio and expose only that
# device to CUDA through the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): CUDA_VISIBLE_DEVICES is generally only honoured if it is set
# before CUDA is initialised -- confirm nothing touches the GPU before this cell.
#
# The query result is bound to `gpu_stats` rather than `stats` so that the
# `from scipy import stats` import at the top of the notebook is not overwritten.
gpu_stats = gpustat.GPUStatCollection.new_query()

# Map GPU index -> fraction of memory in use, built in a single pass so each
# index stays paired with its own ratio.
memory_load = {
    int(gpu.entry['index']): float(gpu.entry['memory.used']) / float(gpu.entry['memory.total'])
    for gpu in gpu_stats
}

# min() returns the first minimum in iteration order, so ties resolve to the
# GPU that the query listed first.
bestGPU = min(memory_load, key=memory_load.get)
os.environ['CUDA_VISIBLE_DEVICES'] = str(bestGPU)

In [42]:
# Root directory of the simulated data set; labels.csv is read from here and
# the `path` column is later joined onto this folder to locate images.
base_folder = "spatial_sim_data"

# Column names are supplied explicitly, so pandas treats the first line of the
# file as data rather than as a header.
labels = pd.read_csv(
    "/".join((base_folder, "labels.csv")),
    names=["path", "N", "n", "nPO"],
)
print(labels)

                                        path      N    n  nPO
0      images/spatial_sim_parents_1000_0_500   1020  500   90
1      images/spatial_sim_parents_1000_1_500   1007  500  127
2      images/spatial_sim_parents_1000_2_500   1009  500  138
3      images/spatial_sim_parents_1000_3_500   1006  500  121
4      images/spatial_sim_parents_1000_4_500   1007  500  124
...                                      ...    ...  ...  ...
2425  images/spatial_sim_parents_9900_22_500  10028  500   67
2426  images/spatial_sim_parents_9900_23_500  10041  500   66
2427  images/spatial_sim_parents_9900_24_500  10005  500   69
2428  images/spatial_sim_parents_9900_25_500  10067  500   54
2429  images/spatial_sim_parents_9900_26_500  10041  500   52

[2430 rows x 4 columns]


In [43]:
# Sanity check: load the two images that belong to a single row of `labels`.
idx = 0

# Look the row up with `idx` (it was previously a hard-coded 0, which left
# `idx` unused) so that changing `idx` really selects a different simulation.
base_path = base_folder + "/" + labels['path'].iloc[idx]

# Each simulation has two PNGs that share a stem and differ only in suffix.
spaghetti_path = base_path + "_spaghetti.png"
samples_path = base_path + "_samples.png"
print(spaghetti_path)
print(samples_path)

# read_image returns each file as a tensor.
spaghetti = read_image(spaghetti_path)
samples = read_image(samples_path)

spatial_sim_data/images/spatial_sim_parents_1000_0_500_spaghetti.png
spatial_sim_data/images/spatial_sim_parents_1000_0_500_samples.png


In [44]:
# Show the shape of each loaded image.
for image in (spaghetti, samples):
    print(image.shape)

# Number of samples, should be 500
# (presumably every sampled individual is one pixel of value 255, hence the
# division -- TODO confirm against the image-generation code).
print(samples.sum() / 255)

# Join the two images along dimension 0 to form one two-channel tensor.
images = torch.cat([spaghetti, samples], dim=0)
print(images.shape)

torch.Size([1, 500, 500])
torch.Size([1, 500, 500])
tensor(500.)
torch.Size([2, 500, 500])


In [45]:
class KinDataset(Dataset):
    """Image-pair dataset driven by ``<base_folder>/labels.csv``.

    The CSV is read with the column names ``path``, ``N``, ``n`` and ``nPO``.
    For a given row, ``path`` is an image stem relative to ``base_folder``;
    the item is the ``_spaghetti.png`` and ``_samples.png`` images for that
    stem concatenated along dimension 0, paired with the row's ``N`` value as
    a one-element float tensor.
    """

    def __init__(self, base_folder):
        """Load the labels table from ``base_folder`` and remember the folder.

        Args:
            base_folder: Directory (as a string) containing ``labels.csv`` and
                the files referenced by its ``path`` column.
        """
        csv_path = "/".join((base_folder, "labels.csv"))
        self.labels = pd.read_csv(csv_path, names=["path", "N", "n", "nPO"])
        self.base_folder = base_folder

    def __len__(self):
        """Return the number of rows in the labels table."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_tensor, label)`` for row ``idx`` of the labels table.

        Args:
            idx: Positional row index into the labels table.

        Returns:
            A tuple of the two images concatenated along dimension 0
            (spaghetti first, samples second) and a float tensor holding the
            row's ``N`` value.
        """
        stem = "/".join((self.base_folder, self.labels["path"].iloc[idx]))
        # Read in a fixed order so channel 0 is always the spaghetti image.
        channels = [
            read_image(stem + suffix)
            for suffix in ("_spaghetti.png", "_samples.png")
        ]
        input_tensor = torch.cat(channels, 0)
        target = self.labels["N"].iloc[idx]
        label = torch.tensor([target]).float()
        return input_tensor, label
    
# Build the dataset from the notebook's `base_folder` setting instead of
# repeating the "spatial_sim_data" literal, so the data location is configured
# in exactly one place.
kin_dataset = KinDataset(base_folder)

In [46]:
# Spot-check the dataset: input shape and label of item 2, then the label of
# item 100.
sample_input, sample_label = kin_dataset[2]
print(sample_input.shape)
print(sample_label)

_, other_label = kin_dataset[100]
print(other_label)

torch.Size([2, 500, 500])
tensor([1009.])
tensor([1340.])


In [47]:
# Split sizes: half of the items go to training; the remainder is halved
# between validation and test, with test taking whatever is left after
# rounding so the three sizes always sum to n_total.
n_total = len(kin_dataset)
n_train = round(0.5 * n_total)
n_valid = round(0.5 * (n_total - n_train))
n_test = n_total - (n_train + n_valid)

In [48]:
# Partition the dataset with a fixed-seed generator so the same items land in
# the train / validation / test subsets on every run.
split_generator = torch.Generator().manual_seed(42)
train_kin, valid_kin, test_kin = random_split(
    kin_dataset,
    [n_train, n_valid, n_test],
    generator=split_generator,
)

batch_size = 64

# Only the training loader shuffles. Validation and test are iterated in a
# fixed order (previously they were shuffled too) so evaluation passes are
# repeatable and per-batch outputs can be compared between runs.
train_kin_dl = DataLoader(train_kin, batch_size=batch_size, shuffle=True)
valid_kin_dl = DataLoader(valid_kin, batch_size=batch_size, shuffle=False)
test_kin_dl = DataLoader(test_kin, batch_size=batch_size, shuffle=False)

# Report the subset sizes.
print(len(train_kin))
print(len(valid_kin))
print(len(test_kin))

1215
608
607
