In [16]:
import torch
import pickle
import numpy as np

from utils.training_utils import create_Seq2SeqDataset
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.transforms import Lambda

In [2]:
# Notebook-wide configuration: the seed constant and the torch compute device.
SEED = 17
# Use the GPU when torch reports one as available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Read the stored walk representation of the toy data set from disk.
folder = '3_populations'
walk_file = './data/toy_data/%s/walk_representation_16.npy'%folder
with open(walk_file, 'rb') as f:
    walk_representation = np.load(f)

In [3]:
def scale(X,scale=(.1,3)):
    """Multiply ``X`` by one random factor drawn uniformly from ``scale``.

    Parameters
    ----------
    X : object supporting multiplication with a one-element torch tensor
        The data to rescale.
    scale : sequence of two numbers
        Lower bound (index 0) and upper bound (index 1) of the factor.

    Returns
    -------
    ``X`` multiplied by the sampled factor; a single factor is drawn per
    call and applied to all of ``X``.
    """
    low, high = scale[0], scale[1]
    # torch.rand(1) is uniform on [0, 1); shift and stretch it onto [low, high).
    factor = low + (high - low) * torch.rand(1)
    return X * factor

# Wrap `scale` in a torchvision `Lambda` so it can be passed to the data set as its `transform`.
rand_scaling = Lambda(scale) 

In [5]:
# create Seq2Seq data set
MASKING_ELEMENT = 0
#true_labels = torch.Tensor([0]*300 + [1]*300 + [2]*300 + [3]*300 + [4]*300).to(torch.long) # when 5 population data is loaded
# Three populations with 400 samples each, labelled 0, 1 and 2 in storage order.
true_labels = torch.Tensor([0]*400 + [1]*400 + [2]*400).to(torch.long)

# Two views of the same data: one with the random scaling transform, one without.
transform = rand_scaling
SeqDS_scaled = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=transform)
SeqDS_nonscaled = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=None)

# Positions whose first feature is -inf are excluded from the mask
# (presumably padding of short walks -- TODO confirm against the walk sampler).
# `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
mask = walk_representation[:,:,:,0] != -np.inf
# Number of unmasked positions per walk.
seq_lengths = torch.Tensor(mask.sum(axis=2))

(1200, 256, 16, 3)
(1200, 256, 16, 3)


### Create scaled and unscaled iterators

In [6]:
# Split the toy data once with a fixed seed and store loaders for both data set variants.
np.random.seed(17)
BATCH_SIZE = 128
N, n_walks, walk_length, output_dim = walk_representation.shape

N_train = 750
N_val = 250

# Training indices are drawn first, validation indices from what is left;
# every sample that was never drawn ends up in the test set.
train_index = np.random.choice(range(N), size=N_train, replace=False)
remaining = set(range(N)) - set(train_index)
val_index = np.random.choice(list(remaining), size=N_val, replace=False)
test_index = list(remaining - set(val_index))

train_sampler = SubsetRandomSampler(train_index)
val_sampler = SubsetRandomSampler(val_index)
test_sampler = SubsetRandomSampler(test_index)

# --- unscaled data set: validation and test are each served as one full batch ---
prefix = ''
train_iterator = torch.utils.data.DataLoader(SeqDS_nonscaled, sampler=train_sampler,
                                             batch_size=BATCH_SIZE)
val_iterator = torch.utils.data.DataLoader(SeqDS_nonscaled, sampler=val_sampler,
                                           batch_size=len(val_index))
test_iterator = torch.utils.data.DataLoader(SeqDS_nonscaled, sampler=test_sampler,
                                            batch_size=len(test_index))

# save the iterator
for split, iterator in (('val', val_iterator), ('test', test_iterator), ('train', train_iterator)):
    with open('./data/toy_data/%s/iterator/%s%s_iterator.pkl'%(folder, prefix, split), 'wb') as f:
        pickle.dump(iterator, f)

# --- randomly scaled data set: all three splits use BATCH_SIZE ---
prefix = 'scaling_'
train_iterator = torch.utils.data.DataLoader(SeqDS_scaled, sampler=train_sampler,
                                             batch_size=BATCH_SIZE)
val_iterator = torch.utils.data.DataLoader(SeqDS_scaled, sampler=val_sampler,
                                           batch_size=BATCH_SIZE)
test_iterator = torch.utils.data.DataLoader(SeqDS_scaled, sampler=test_sampler,
                                            batch_size=BATCH_SIZE)

# save the iterator
for split, iterator in (('val', val_iterator), ('test', test_iterator), ('train', train_iterator)):
    with open('./data/toy_data/%s/iterator/%s%s_iterator.pkl'%(folder, prefix, split), 'wb') as f:
        pickle.dump(iterator, f)

### Create iterators with different numbers of training samples

In [None]:
# Same seeded split as for the full toy data; only the size of the training subset varies.
np.random.seed(17)
BATCH_SIZE = 128
N, n_walks, walk_length, output_dim = walk_representation.shape

N_train = 750
N_val = 250
train_index = np.random.choice(range(N), size=N_train, replace=False)
remaining = set(range(N)) - set(train_index)
val_index = np.random.choice(list(remaining), size=N_val, replace=False)
test_index = list(remaining - set(val_index))

val_sampler = SubsetRandomSampler(val_index)
test_sampler = SubsetRandomSampler(test_index)

# Validation and test loaders are rebuilt here but not written to disk by this cell.
val_iterator = torch.utils.data.DataLoader(SeqDS_nonscaled, sampler=val_sampler,
                                           batch_size=len(val_index))
test_iterator = torch.utils.data.DataLoader(SeqDS_nonscaled, sampler=test_sampler,
                                            batch_size=len(test_index))

# save the iterator
prefix = ''

# 150, 300, ..., 750: each training loader samples from the first n_samples training indices.
for n_samples in range(150, 751, 150):
    train_sampler = SubsetRandomSampler(train_index[:n_samples])
    train_iterator = torch.utils.data.DataLoader(SeqDS_nonscaled, sampler=train_sampler,
                                                 batch_size=BATCH_SIZE)

    target_file = './data/toy_data/%s/iterator/%strain_iterator_n%i.pkl'%(folder, prefix,n_samples)
    with open(target_file, 'wb') as f:
        pickle.dump(train_iterator, f)

### Create iterators with shuffled labels


In [15]:
# create Seq2Seq data set
MASKING_ELEMENT = 0
# Seed before permuting so the shuffled labels are the same on every run;
# previously the permutation depended on whatever RNG state earlier cells left behind.
np.random.seed(17)
# Same label counts as the true labelling, but randomly assigned to the samples.
true_labels = torch.Tensor(np.random.permutation([0]*400 + [1]*400 + [2]*400)).to(torch.long)
SeqDS = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=None)

# Positions whose first feature is -inf are excluded from the mask
# (presumably padding of short walks -- TODO confirm).
# `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
mask = walk_representation[:,:,:,0] != -np.inf
seq_lengths = torch.Tensor(mask.sum(axis=2))

(599, 256, 32, 3)


In [None]:
# Same seeded split as for the unshuffled toy data.
np.random.seed(17)
BATCH_SIZE = 128
N, n_walks, walk_length, output_dim = walk_representation.shape

N_train = 750
N_val = 250
train_index = np.random.choice(range(N), size=N_train, replace=False)
val_index = np.random.choice(list(set(range(N)) - set(train_index)), size=N_val, replace=False)
test_index = list(set(range(N)) - set(train_index) -set(val_index))

train_sampler = SubsetRandomSampler(train_index)
val_sampler = SubsetRandomSampler(val_index)
test_sampler = SubsetRandomSampler(test_index)

# The loaders must wrap `SeqDS`, the data set created with the permuted labels.
# They previously wrapped `SeqDS_nonscaled`, so the files saved under the
# 'shuffled_' prefix still carried the true labels.
train_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=BATCH_SIZE, 
                                             sampler=train_sampler)
val_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=len(val_index), 
                                           sampler=val_sampler)
test_iterator= torch.utils.data.DataLoader(SeqDS, batch_size=len(test_index), 
                                           sampler=test_sampler)

# save the iterator
prefix = 'shuffled_'
with open('./data/toy_data/%s/iterator/%sval_iterator.pkl'%(folder, prefix), 'wb') as f:
    pickle.dump(val_iterator, f)
    
with open('./data/toy_data/%s/iterator/%stest_iterator.pkl'%(folder, prefix), 'wb') as f:
    pickle.dump(test_iterator, f)

with open('./data/toy_data/%s/iterator/%strain_iterator.pkl'%(folder, prefix), 'wb') as f:
    pickle.dump(train_iterator, f)

# Create iterator for M1 EXC data

In [5]:
import pandas as pd

In [None]:
# Per-column non-null counts of the M1 excitatory meta data, grouped by 'm-type2'.
folder = 'M1_exc_data'
meta_data_file = './data/%s/meta_data_m_type_label.csv'%folder
meta_data = pd.read_csv(meta_data_file)
meta_data.groupby('m-type2').count()

In [None]:
# Bare string literal: running this cell only echoes the path, nothing is created or read.
'./data/M1_exc_data/iterator/'

In [None]:
folder = 'M1_exc_data'
MASKING_ELEMENT = 0
BATCH_SIZE = 128
N_train = 160
N_val = 55
no_label = -100
transform = None

meta_data = pd.read_csv('./data/%s/meta_data_m_type_label.csv'%folder)

# get true labels
# Map the type names on a new Series instead of assigning into `.values`:
# writing into that array either modifies `meta_data` behind the scenes or,
# when pandas hands out a read-only array, fails outright.
m_type_to_label = {'tufted': 0, 'untufted': 1, 'other': 2}  # 'other' could be mapped to no_label instead
label_series = meta_data['m-type2'].map(m_type_to_label)
if label_series.isna().any():
    # Fail loudly on missing or unknown type names rather than producing invalid labels.
    raise ValueError("unmapped 'm-type2' entries in meta data")
labels = label_series.values

true_labels = torch.Tensor(labels.astype(int)).to(torch.long)

part = 'm_labels'
# Only the load needs the input file; it is closed again before the iterators are written.
with open('./data/%s/walks/walk_representation.npy'%(folder), 'rb') as f:
    walk_representation = np.load(f)

# create SeqDS
SeqDS = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=transform)

# Positions whose first feature is -inf are excluded from the mask
# (presumably padding of short walks -- TODO confirm).
# `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
mask = walk_representation[:,:,:,0] != -np.inf
seq_lengths = torch.Tensor(mask.sum(axis=2))

# Seeded split: training indices first, validation from the rest, test = everything never drawn.
np.random.seed(17)
N, n_walks, walk_length, output_dim = walk_representation.shape

train_index = np.random.choice(range(N), size=N_train, replace=False)
val_index = np.random.choice(list(set(range(N)) - set(train_index)), size=N_val, replace=False)
test_index = list(set(range(N)) - set(train_index) -set(val_index))

train_sampler = SubsetRandomSampler(train_index)
val_sampler = SubsetRandomSampler(val_index)
test_sampler = SubsetRandomSampler(test_index)

train_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=BATCH_SIZE, 
                                             sampler=train_sampler)

# Validation and test are each served as one full batch.
val_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=len(val_index), 
                                           sampler=val_sampler)
test_iterator= torch.utils.data.DataLoader(SeqDS, batch_size=len(test_index), 
                                           sampler=test_sampler)

with open('./data/%s/iterator/%s/train_iterator.pkl'%(folder, part), 'wb') as f:
    pickle.dump(train_iterator, f)

with open('./data/%s/iterator/%s/val_iterator.pkl'%(folder, part), 'wb') as f:
    pickle.dump(val_iterator, f)

with open('./data/%s/iterator/%s/test_iterator.pkl'%(folder, part), 'wb') as f:
    pickle.dump(test_iterator, f)

### RNA label

In [None]:
import pandas as pd

In [None]:
folder = 'M1_exc_data'
MASKING_ELEMENT = 0
BATCH_SIZE = 128
N_train = 160
N_val = 55
no_label = -100
transform = None

meta_data = pd.read_csv('./data/%s/meta_data_m_type_label.csv'%folder)

# get true labels
# Map the family names on a new Series instead of assigning into `.values`:
# writing into that array either modifies `meta_data` behind the scenes (so later
# cells that inspect the 'RNA family' column see a mix of ints and strings) or,
# when pandas hands out a read-only array, fails outright.
rna_family_to_label = {'IT': 0, 'CT': 1, 'PT': 2, 'NP': no_label, 'low quality': no_label}
label_series = meta_data['RNA family'].map(rna_family_to_label)
if label_series.isna().any():
    # Fail loudly on missing or unknown family names rather than producing invalid labels.
    raise ValueError("unmapped 'RNA family' entries in meta data")
labels = label_series.values

true_labels = torch.Tensor(labels.astype(int)).to(torch.long)


with open('./data/%s/walks/walk_representation.npy'%folder, 'rb') as f:
    walk_representation = np.load(f)
    
# create SeqDS
SeqDS = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=transform)

# Positions whose first feature is -inf are excluded from the mask
# (presumably padding of short walks -- TODO confirm).
# `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
mask = walk_representation[:,:,:,0] != -np.inf
seq_lengths = torch.Tensor(mask.sum(axis=2))
    
# Seeded split: training indices first, validation from the rest, test = everything never drawn.
np.random.seed(17)
N, n_walks, walk_length, output_dim = walk_representation.shape

train_index = np.random.choice(range(N), size=N_train, replace=False)
val_index = np.random.choice(list(set(range(N)) - set(train_index)), size=N_val, replace=False)
test_index = list(set(range(N)) - set(train_index) -set(val_index))

train_sampler = SubsetRandomSampler(train_index)
val_sampler = SubsetRandomSampler(val_index)
test_sampler = SubsetRandomSampler(test_index)


train_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=BATCH_SIZE, 
                                             sampler=train_sampler)

# Validation and test are each served as one full batch.
val_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=len(val_index), 
                                           sampler=val_sampler)
test_iterator= torch.utils.data.DataLoader(SeqDS, batch_size=len(test_index), 
                                           sampler=test_sampler)

part = 'rna_labels'
with open('./data/%s/iterator/%s/train_iterator.pkl'%(folder, part), 'wb') as f:
    pickle.dump(train_iterator, f)

with open('./data/%s/iterator/%s/val_iterator.pkl'%(folder, part), 'wb') as f:
    pickle.dump(val_iterator, f)

with open('./data/%s/iterator/%s/test_iterator.pkl'%(folder, part), 'wb') as f:
    pickle.dump(test_iterator, f)

In [None]:
# Show the distinct entries of the 'RNA family' column of the loaded meta data.
np.unique(meta_data['RNA family'].values)

In [None]:
from utils.rw_utils import get_possible_paths, load_neurons

neurons = load_neurons('./data/M1_exc_data/neurons/', sort=False)

# Lengths of all paths that get_possible_paths returns, pooled over every neuron.
path_lengths = [len(p) for neuron in neurons for p in get_possible_paths(neuron)]

# Report the 5th and 95th percentile of the pooled path lengths.
lower = np.percentile(path_lengths,5)
upper = np.percentile(path_lengths,95)
print("[", lower, ",", upper, "]")

# Create iterator for M1 INH data

In [None]:

# Per-column non-null counts of the M1 inhibitory meta data, grouped by 'RNA family'.
meta_data_file = './data/M1_inh_data/meta_data.csv'
meta_data = pd.read_csv(meta_data_file)
meta_data.groupby('RNA family').count()

In [None]:
folder = 'M1_inh_data'
MASKING_ELEMENT = 0
N_train = 248
N_val = 62
no_label = -100
transform = None#rand_scaling

meta_data = pd.read_csv('./data/%s/meta_data.csv'%folder)

# get true labels
# Map the family names on a new Series instead of assigning into `.values`:
# writing into that array either modifies `meta_data` behind the scenes or,
# when pandas hands out a read-only array, fails outright.
# 'Sncg' is merged into the 'Vip' class; 'n.a.' gets the no_label marker.
rna_family_to_label = {'n.a.': no_label, 'Sst': 0, 'Pvalb': 1, 'Vip': 2, 'Sncg': 2, 'Lamp5': 3}
label_series = meta_data['RNA family'].map(rna_family_to_label)
if label_series.isna().any():
    # Fail loudly on missing or unknown family names rather than producing invalid labels.
    raise ValueError("unmapped 'RNA family' entries in meta data")
labels = label_series.values

true_labels = torch.Tensor(labels.astype(int)).to(torch.long)

for part, walk_length in [('axon', 32)]:
    with open('./data/%s/walks/%s/walk_representation_%i.npy'%(folder, part, walk_length), 'rb') as f:
        walk_representation = np.load(f)
    
    if walk_length == 32:
        BATCH_SIZE = 128
    elif walk_length == 64:
        BATCH_SIZE = 64
    else:
        # Do not silently reuse a BATCH_SIZE left over from another cell or iteration.
        raise ValueError('no batch size configured for walk length %i'%walk_length)
        
    # create SeqDS
    SeqDS = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=transform)

    # Positions whose first feature is -inf are excluded from the mask
    # (presumably padding of short walks -- TODO confirm).
    # `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
    mask = walk_representation[:,:,:,0] != -np.inf
    seq_lengths = torch.Tensor(mask.sum(axis=2))
    
    np.random.seed(17)
    # Unpack into a separate name: reusing `walk_length` here would overwrite the loop
    # variable, and the output files below must carry the same suffix as the input file.
    N, n_walks, loaded_walk_length, output_dim = walk_representation.shape

    train_index = np.random.choice(range(N), size=N_train, replace=False)
    val_index = np.random.choice(list(set(range(N)) - set(train_index)), size=N_val, replace=False)
    test_index = list(set(range(N)) - set(train_index) -set(val_index))

    train_sampler = SubsetRandomSampler(train_index)
    val_sampler = SubsetRandomSampler(val_index)
    test_sampler = SubsetRandomSampler(test_index)
    

    train_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=BATCH_SIZE, 
                                                 sampler=train_sampler)

    # Validation and test are each served as one full batch.
    val_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=len(val_index), 
                                               sampler=val_sampler)
    test_iterator= torch.utils.data.DataLoader(SeqDS, batch_size=len(test_index), 
                                               sampler=test_sampler)
    
    with open('./data/%s/iterator/%s/train_iterator_%i.pkl'%(folder, part, walk_length), 'wb') as f:
        pickle.dump(train_iterator, f)
    
    with open('./data/%s/iterator/%s/val_iterator_%i.pkl'%(folder, part, walk_length), 'wb') as f:
        pickle.dump(val_iterator, f)
    
    with open('./data/%s/iterator/%s/test_iterator_%i.pkl'%(folder, part, walk_length), 'wb') as f:
        pickle.dump(test_iterator, f)

In [None]:
from utils.rw_utils import get_possible_paths, load_neurons

neurons = load_neurons('./data/M1_inh_data/neurons/axon/', sort=False)

# Lengths of all paths that get_possible_paths returns, pooled over every neuron.
path_lengths = [len(p) for neuron in neurons for p in get_possible_paths(neuron)]

# Report the 5th and 95th percentile of the pooled path lengths.
lower = np.percentile(path_lengths,5)
upper = np.percentile(path_lengths,95)
print("[", lower, ",", upper, "]")

# Create iterator for Farrow data

In [5]:
import pandas as pd

In [16]:
# Settings and labels for the Farrow data set.
folder = 'Farrow_data'
MASKING_ELEMENT, no_label = 0, -100
N_train, N_val = 400, 99
transform = None

meta_data_file = './data/%s/meta_data_labeled_cells.csv'%folder
meta_data = pd.read_csv(meta_data_file, index_col=0)

# get true labels
# Shift the cluster ids down by one; anything that ends up below zero is marked as unlabeled.
shifted_clusters = meta_data['cluster'].values - 1
labels = np.where(shifted_clusters < 0, no_label, shifted_clusters)
true_labels = torch.Tensor(labels.astype(int)).to(torch.long)

In [17]:
# Distinct label values together with how often each one occurs.
np.unique(true_labels, return_counts=True)

(array([-100,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
          10,   11,   12,   13]),
 array([ 49, 121,  29,  11,  17,  44,  91,  20,  37,  19,  36,  21,  30,
         44,  30]))

In [18]:
BATCH_SIZE = 128
for c in ['soma_centered']:
    
    # The 'scaling_' variant must use the random scaling transform. It previously reused
    # the notebook-level `transform`, which is None for this data set, so both variants
    # were saved without any scaling. A separate loop name also keeps `transform` itself
    # from being rebound by the loop.
    for split_transform, prefix in [(None, ''), (rand_scaling, 'scaling_')]:
        with open('./data/%s/walks/%s/walk_representation.npy'%(folder, c), 'rb') as f:
            walk_representation = np.load(f)
            
        # create SeqDS
        SeqDS = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=split_transform)

        # Positions whose first feature is -inf are excluded from the mask
        # (presumably padding of short walks -- TODO confirm).
        # `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
        mask = walk_representation[:,:,:,0] != -np.inf
        seq_lengths = torch.Tensor(mask.sum(axis=2))

        # Re-seeding inside the loop gives both variants the identical split.
        np.random.seed(17)
        N, n_walks, walk_length, output_dim = walk_representation.shape

        train_index = np.random.choice(range(N), size=N_train, replace=False)
        val_index = np.random.choice(list(set(range(N)) - set(train_index)), size=N_val, replace=False)
        test_index = list(set(range(N)) - set(train_index) -set(val_index))

        train_sampler = SubsetRandomSampler(train_index)
        val_sampler = SubsetRandomSampler(val_index)
        test_sampler = SubsetRandomSampler(test_index)


        train_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=BATCH_SIZE, 
                                                     sampler=train_sampler)

        # Validation and test are each served as one full batch.
        val_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=len(val_index), 
                                                   sampler=val_sampler)
        test_iterator= torch.utils.data.DataLoader(SeqDS, batch_size=len(test_index), 
                                                   sampler=test_sampler)

        with open('./data/%s/iterator/%s/%strain_iterator.pkl'%(folder, c, prefix), 'wb') as f:
            pickle.dump(train_iterator, f)

        with open('./data/%s/iterator/%s/%sval_iterator.pkl'%(folder, c, prefix), 'wb') as f:
            pickle.dump(val_iterator, f)

        with open('./data/%s/iterator/%s/%stest_iterator.pkl'%(folder, c, prefix), 'wb') as f:
            pickle.dump(test_iterator, f)

In [17]:
# Read the stored urban walk representation; this replaces whatever `walk_representation` held before.
urban_walk_file = './data/urban_data/walks/soma_centered/walk_representation_16.npy'
with open(urban_walk_file, 'rb') as f:
    walk_representation = np.load(f)



In [19]:
import os
from utils.rw_utils import load_neurons

neuron_dir = './data/Farrow_data/neurons/soma_centered/'
neurons = load_neurons(neuron_dir, sort=False)

# The first os.walk entry describes the directory itself; its file names define the sort order.
root, _, files = list(os.walk(neuron_dir))[0]
sort_index = np.argsort(np.array(files))

# NOTE(review): assumes load_neurons(sort=False) returns the neurons in the same order
# in which os.walk lists the files -- confirm against utils.rw_utils.
neurons = np.array(neurons)[sort_index]

In [21]:
from utils.rw_utils import get_possible_paths

# Lengths of all paths that get_possible_paths returns, pooled over every neuron.
path_lengths = [len(p) for neuron in neurons for p in get_possible_paths(neuron)]

# Report the 5th and 95th percentile of the pooled path lengths.
lower = np.percentile(path_lengths,5)
upper = np.percentile(path_lengths,95)
print("[", lower, ",", upper, "]")

[ 5.0 , 22.0 ]


# Create iterator for urban data

In [19]:
import pandas as pd

# Settings and labels of the Farrow data set, loaded again here.
# NOTE(review): this sits in the urban section, yet it reads the Farrow meta data -- confirm that is intended.
folder = 'Farrow_data'
MASKING_ELEMENT, no_label = 0, -100
N_train, N_val = 400, 99
transform = None

meta_data_file = './data/%s/meta_data_labeled_cells.csv'%folder
meta_data = pd.read_csv(meta_data_file, index_col=0)

# get true labels
# Shift the cluster ids down by one; anything that ends up below zero is marked as unlabeled.
shifted_clusters = meta_data['cluster'].values - 1
labels = np.where(shifted_clusters < 0, no_label, shifted_clusters)
true_labels = torch.Tensor(labels.astype(int)).to(torch.long)

In [20]:
# Settings for the urban data set: data folder, split sizes and label/masking constants.
folder = 'urban_data'
MASKING_ELEMENT, no_label = 0, -100
N_train, N_val = 184, 37
transform = None


In [21]:
# Echo the currently active data folder.
folder

'urban_data'

In [22]:
import multiprocessing
import numpy as np

from utils.rw_utils import get_rw_representation, get_walk_representation

In [23]:
import os

# Directories for the urban walks, neurons and iterators.
path = "./data/urban_data/walks/soma_centered/"
path2 = "./data/urban_data/neurons/soma_centered/"
path3 = "./data/urban_data/iterator/soma_centered"

# Create each directory (including parents) unless it already exists.
for directory in (path, path2, path3):
    os.makedirs(directory, exist_ok=True)

In [24]:
BATCH_SIZE = 128
for c in ['soma_centered']:
    
    # The 'scaling_' variant must use the random scaling transform. It previously reused
    # the notebook-level `transform`, which is None for this data set, so both variants
    # were saved without any scaling. A separate loop name also keeps `transform` itself
    # from being rebound by the loop.
    for split_transform, prefix in [(None, ''), (rand_scaling, 'scaling_')]:
        with open('./data/%s/walks/%s/walk_representation_8.npy'%(folder, c), 'rb') as f:
            walk_representation = np.load(f)

        # create SeqDS
        # NOTE(review): `true_labels` is whatever an earlier cell left behind; in this
        # notebook that is the Farrow labelling, not labels for the urban cells -- confirm.
        SeqDS = create_Seq2SeqDataset(walk_representation, true_labels, MASKING_ELEMENT, transform=split_transform)

        # Positions whose first feature is -inf are excluded from the mask
        # (presumably padding of short walks -- TODO confirm).
        # `np.infty` was removed in NumPy 2.0, so the canonical `np.inf` is used here.
        mask = walk_representation[:,:,:,0] != -np.inf
        seq_lengths = torch.Tensor(mask.sum(axis=2))

        # Re-seeding inside the loop gives both variants the identical split.
        np.random.seed(17)
        N, n_walks, walk_length, output_dim = walk_representation.shape

        train_index = np.random.choice(range(N), size=N_train, replace=False)
        val_index = np.random.choice(list(set(range(N)) - set(train_index)), size=N_val, replace=False)
        test_index = list(set(range(N)) - set(train_index) -set(val_index))

        train_sampler = SubsetRandomSampler(train_index)
        val_sampler = SubsetRandomSampler(val_index)
        test_sampler = SubsetRandomSampler(test_index)


        train_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=BATCH_SIZE, 
                                                     sampler=train_sampler)

        # Validation and test are each served as one full batch.
        val_iterator = torch.utils.data.DataLoader(SeqDS, batch_size=len(val_index), 
                                                   sampler=val_sampler)
        test_iterator= torch.utils.data.DataLoader(SeqDS, batch_size=len(test_index), 
                                                   sampler=test_sampler)

        with open('./data/%s/iterator/%s/%strain_iterator.pkl'%(folder, c, prefix), 'wb') as f:
            pickle.dump(train_iterator, f)

        with open('./data/%s/iterator/%s/%sval_iterator.pkl'%(folder, c, prefix), 'wb') as f:
            pickle.dump(val_iterator, f)

        with open('./data/%s/iterator/%s/%stest_iterator.pkl'%(folder, c, prefix), 'wb') as f:
            pickle.dump(test_iterator, f)

(235, 256, 16, 3)
(235, 256, 16, 3)


In [14]:
# sample RANDOM walks

np.random.seed(17)

n_walks=256
N = len(neurons)
NUM_CORES = 15

# One output file per walk length.
# NOTE(review): neither `walk_length` nor `n_walks` is passed to get_rw_representation,
# so as far as this cell shows the files differ only by name -- confirm how the helper
# picks up the walk length.
# NOTE(review): the numpy seed is set in this process only; whether the pool workers
# produce reproducible walks depends on the helper -- confirm.
for walk_length in [8,16,32]:
    with multiprocessing.Pool(NUM_CORES) as pool:
        rw_list = pool.map(get_rw_representation, neurons)

    # Keep only the first element of each per-neuron result.
    RW_representation = np.array([result[0] for result in rw_list])

    target_file = path+'/walk_representation_%i.npy'%walk_length
    with open(target_file, 'wb') as f:
        np.save(f, RW_representation)

In [None]:
import os
from utils.rw_utils import load_neurons

neuron_dir = './data/urban_data/neurons/soma_centered/'
neurons = load_neurons(neuron_dir, sort=False)

# The first os.walk entry describes the directory itself; its file names define the sort order.
root, _, files = list(os.walk(neuron_dir))[0]
sort_index = np.argsort(np.array(files))

# NOTE(review): assumes load_neurons(sort=False) returns the neurons in the same order
# in which os.walk lists the files -- confirm against utils.rw_utils.
neurons = np.array(neurons)[sort_index]

In [12]:
# Root directory of the urban walk files.
PATH = './data/urban_data/walks/'
# Directory creation is disabled here:
# os.makedirs(PATH, exist_ok=True)