# Create data

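Builds training-ready tables from raw RNA count data for training neural nets: the single-cell and bulk expression matrices are standardized per gene (z-score), given an integer `code` class label, and saved as CSV files.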

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch import tensor
from torcheval.metrics import MulticlassAccuracy
import torchvision.transforms.functional as TF,torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from miniai.datasets import *
from miniai.conv import *
from miniai.learner import *
from miniai.activations import *
from miniai.init import *
from miniai.sgd import *
from miniai.augment import *
from miniai.xtras import *

In [2]:
from torch.nn import init
from functools import partial
from torch.optim import lr_scheduler
from torch import tensor,nn,optim

In [3]:
import os
# Directory that holds the input CSVs; the derived CSVs are written back into it as well.
path = Path('data')

In [4]:
# Inputs: per-cell metadata (cell_name, ZT) and the raw count matrix keyed by 'gene'.
meta_data = pd.read_csv(path/'meta_data.csv', low_memory=False)
raw_data = pd.read_csv(path/'RNA_count_mat.csv', low_memory=False)

# Transpose so that each row is a cell and each column is a gene.
data = raw_data.set_index('gene').T

# Z-score every gene across cells. A gene whose value is identical in every cell has
# std == 0, and dividing by it would turn the whole column into NaN; scaling those
# genes by 1 instead makes them standardize to 0.
data = (data - data.mean(axis=0)) / data.std(axis=0).replace(0, 1)

# Attach each cell's ZT label by barcode, then encode the label as an integer class id.
df = data.join(meta_data.set_index('cell_name'))
df['ZT'] = pd.Categorical(df['ZT'])
# NOTE(review): the codes follow the sorted order of the category labels, which for
# strings is a plain lexicographic sort — confirm this is the ordering the consumers
# of `code` expect.
df['code'] = df['ZT'].cat.codes
df.head()

Unnamed: 0,Meg3,Slc1a2,Rian,Cyp2c40,Cyp2c69,Sgcz,Glul,Ugt2b37,Tmeff2,B830012L14Rik,...,Mpp6,Ralgps1,Zfp580,Sept8,Cfap52,Arhgap27,F730043M19Rik,Bora,ZT,code
AAACAGCCAACCCTAA-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,1.122616,2.084872,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4
AAACCAACAGTAAAGC-6,-0.094012,-0.032904,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,-0.554376,-0.61979,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,2.058709,ZT18,4
AAACCGAAGGTCCTAG-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,0.451819,0.281764,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4
AAAGCACCATTGTCCT-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,0.451819,0.281764,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4
AAAGGACGTGAGGTGA-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,-0.889775,1.183318,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4


In [5]:
# Preview the metadata table: one row per cell with its barcode (cell_name) and ZT label.
meta_data.head()

Unnamed: 0,cell_name,ZT
0,AAACAGCCAACCCTAA-6,ZT18
1,AAACCAACAGTAAAGC-6,ZT18
2,AAACCGAAGGTCCTAG-6,ZT18
3,AAAGCACCATTGTCCT-6,ZT18
4,AAAGGACGTGAGGTGA-6,ZT18


In [6]:
# Persist the normalized table (gene columns plus ZT and code); the barcode index is
# written under the column header 'cell'.
df.to_csv(path/'RNA_norm.csv', index_label='cell')

In [7]:
# Read the saved file back as a sanity check. Only the first five rows are shown, so
# parse just those (nrows=5) instead of loading the entire matrix to call .head() on it.
pd.read_csv(path/'RNA_norm.csv', index_col='cell', low_memory=False, nrows=5)

Unnamed: 0_level_0,Meg3,Slc1a2,Rian,Cyp2c40,Cyp2c69,Sgcz,Glul,Ugt2b37,Tmeff2,B830012L14Rik,...,Mpp6,Ralgps1,Zfp580,Sept8,Cfap52,Arhgap27,F730043M19Rik,Bora,ZT,code
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAGCCAACCCTAA-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,1.122616,2.084872,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4
AAACCAACAGTAAAGC-6,-0.094012,-0.032904,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,-0.554376,-0.61979,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,2.058709,ZT18,4
AAACCGAAGGTCCTAG-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,0.451819,0.281764,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4
AAAGCACCATTGTCCT-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,0.451819,0.281764,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4
AAAGGACGTGAGGTGA-6,-0.094012,-0.251429,-0.084206,-0.118838,-0.084834,-0.183315,-0.255395,-0.062888,-0.092432,-0.073209,...,-0.889775,1.183318,-0.036734,-0.036734,-0.036734,-0.036734,-0.036734,-0.318222,ZT18,4


In [8]:
# Bulk RNA table: RNA.csv is keyed by 'ID'; transpose so that each row is a sample.
raw_bulk_data = pd.read_csv(path/'RNA.csv', low_memory=False)
bulk_data = raw_bulk_data.set_index('ID').T

# Z-score every column across samples. With this few samples a column can easily be
# constant (std == 0), which would otherwise become all-NaN; scale those by 1 so they
# standardize to 0.
bulk_data = (bulk_data - bulk_data.mean(axis=0)) / bulk_data.std(axis=0).replace(0, 1)

# Integer class id per sample, assigned by row position: ids 0-5, repeated twice.
# NOTE(review): this assumes exactly 12 samples ordered ZT2, ZT6, ZT10, ZT14, ZT18, ZT22
# for replicate _1 and then the same for _2 — confirm against the row labels of RNA.csv.
bulk_data['code'] = [0,1,2,3,4,5,0,1,2,3,4,5]

In [10]:
# Inspect the standardized bulk table together with its `code` column.
bulk_data

ID,2810459M11Rik,Abcb11,Acot7,Ahctf1,Ap2a2,Asb13,Asl,Chka,Clic3,Clpx,...,Ap1s3,Vamp4,Pigm,Pard6g,Gas2,Pot1b,Zfp759,Neo1,Sco1,code
ZT2_1,1.105385,1.734929,-1.000468,-0.675204,0.840018,0.483757,-1.549016,0.124098,-0.232362,0.848045,...,2.75289,-0.473227,1.929628,0.128575,-0.790649,-0.444183,0.143286,-0.561441,-0.562906,0
ZT6_1,0.140685,0.362813,0.216831,-1.176028,-0.819429,-0.259607,-0.637751,-1.136919,-0.522711,-1.026734,...,-0.78996,0.250906,-0.690604,0.128575,0.738327,-1.036427,-1.197874,1.199442,0.094325,1
ZT10_1,-0.816706,-1.059483,0.376602,-0.640493,-1.968277,-1.844144,0.759326,-1.019758,-0.540232,-1.16555,...,0.215444,0.250906,-0.243742,-1.651693,-0.302307,-0.049354,1.828333,-1.939524,-1.767828,2
ZT14_1,-0.746059,-0.951535,1.700414,0.762807,-0.123532,-0.425885,0.793645,-0.16572,-0.294937,-0.431148,...,0.550578,1.622947,0.487485,0.959366,0.023254,-0.049354,-0.647654,1.429123,1.372273,3
ZT18_1,0.25031,-0.396195,0.361386,1.328094,0.432362,0.625583,0.991523,0.580407,0.012932,1.159798,...,-0.406949,-1.19736,0.934346,-0.108794,1.197601,0.740305,0.074509,-0.17864,0.42294,4
ZT22_1,0.910496,0.60443,-1.008076,0.921484,0.992374,1.261354,0.020871,1.335783,0.158107,1.199736,...,0.215444,1.432386,-1.076529,-0.346163,0.488342,-1.036427,-0.303767,0.05104,1.372273,5
ZT2_2,0.93973,1.481083,-1.966699,-1.037186,0.601189,0.95814,-1.537576,0.053185,-0.302447,0.511477,...,-0.981465,0.098457,0.710916,0.00989,-1.569671,-0.246768,-0.23499,-1.097362,-0.380342,0
ZT6_2,-0.409876,-0.259573,-0.034237,-1.260325,-0.432362,-0.543258,-0.893558,-1.158501,-0.522711,-1.138019,...,-0.694207,0.212793,-0.182807,1.671473,1.302246,1.727378,-0.613265,1.352563,1.226222,1
ZT10_2,-2.22965,-1.175442,0.886346,-0.377684,-1.280615,-1.276841,0.149626,-1.099921,-0.490172,-1.110101,...,-0.023938,-0.625676,-0.446861,-0.464848,0.377884,0.740305,0.280841,-0.331761,0.386427,2
ZT14_2,-0.604764,-1.075085,0.916778,0.405783,-0.350007,-0.582383,1.326676,-0.289047,-0.40507,-0.656819,...,0.071815,-1.426034,-0.121871,-0.346163,-1.895232,-1.826085,0.108898,-0.2552,-0.343829,3


In [13]:
# Save the standardized bulk table (sample labels go under the header 'ID'), then read it
# straight back to confirm the round trip.
bulk_data.to_csv(path/'bulk_data.csv', index_label='ID')
pd.read_csv(path/'bulk_data.csv', index_col='ID', low_memory=False)

Unnamed: 0_level_0,2810459M11Rik,Abcb11,Acot7,Ahctf1,Ap2a2,Asb13,Asl,Chka,Clic3,Clpx,...,Ap1s3,Vamp4,Pigm,Pard6g,Gas2,Pot1b,Zfp759,Neo1,Sco1,code
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZT2_1,1.105385,1.734929,-1.000468,-0.675204,0.840018,0.483757,-1.549016,0.124098,-0.232362,0.848045,...,2.75289,-0.473227,1.929628,0.128575,-0.790649,-0.444183,0.143286,-0.561441,-0.562906,0
ZT6_1,0.140685,0.362813,0.216831,-1.176028,-0.819429,-0.259607,-0.637751,-1.136919,-0.522711,-1.026734,...,-0.78996,0.250906,-0.690604,0.128575,0.738327,-1.036427,-1.197874,1.199442,0.094325,1
ZT10_1,-0.816706,-1.059483,0.376602,-0.640493,-1.968277,-1.844144,0.759326,-1.019758,-0.540232,-1.16555,...,0.215444,0.250906,-0.243742,-1.651693,-0.302307,-0.049354,1.828333,-1.939524,-1.767828,2
ZT14_1,-0.746059,-0.951535,1.700414,0.762807,-0.123532,-0.425885,0.793645,-0.16572,-0.294937,-0.431148,...,0.550578,1.622947,0.487485,0.959366,0.023254,-0.049354,-0.647654,1.429123,1.372273,3
ZT18_1,0.25031,-0.396195,0.361386,1.328094,0.432362,0.625583,0.991523,0.580407,0.012932,1.159798,...,-0.406949,-1.19736,0.934346,-0.108794,1.197601,0.740305,0.074509,-0.17864,0.42294,4
ZT22_1,0.910496,0.60443,-1.008076,0.921484,0.992374,1.261354,0.020871,1.335783,0.158107,1.199736,...,0.215444,1.432386,-1.076529,-0.346163,0.488342,-1.036427,-0.303767,0.05104,1.372273,5
ZT2_2,0.93973,1.481083,-1.966699,-1.037186,0.601189,0.95814,-1.537576,0.053185,-0.302447,0.511477,...,-0.981465,0.098457,0.710916,0.00989,-1.569671,-0.246768,-0.23499,-1.097362,-0.380342,0
ZT6_2,-0.409876,-0.259573,-0.034237,-1.260325,-0.432362,-0.543258,-0.893558,-1.158501,-0.522711,-1.138019,...,-0.694207,0.212793,-0.182807,1.671473,1.302246,1.727378,-0.613265,1.352563,1.226222,1
ZT10_2,-2.22965,-1.175442,0.886346,-0.377684,-1.280615,-1.276841,0.149626,-1.099921,-0.490172,-1.110101,...,-0.023938,-0.625676,-0.446861,-0.464848,0.377884,0.740305,0.280841,-0.331761,0.386427,2
ZT14_2,-0.604764,-1.075085,0.916778,0.405783,-0.350007,-0.582383,1.326676,-0.289047,-0.40507,-0.656819,...,0.071815,-1.426034,-0.121871,-0.346163,-1.895232,-1.826085,0.108898,-0.2552,-0.343829,3


In [12]:
# Build a per-cell table laid out like the bulk data (same columns, same order), with one
# row per row of df. It starts out as standard-normal noise; the generator is seeded so
# the noise — and therefore the file saved from df2 — is identical on every run.
df2 = pd.DataFrame(
    np.random.default_rng(42).normal(size=(len(df), len(bulk_data.columns))),
    columns=bulk_data.columns,
)

# df has to be on a 0..n-1 index so it lines up with df2 row for row; the reset also moves
# the barcodes into a regular column. Resetting is not idempotent (a second reset would add
# yet another index column), so only do it while df is still indexed by barcode.
if not isinstance(df.index, pd.RangeIndex):
    df.reset_index(inplace=True)

# Overwrite the noise with df's non-NA values in every column the two tables share;
# columns that df does not have keep their noise.
df2.update(df)

In [16]:
# Label every row with the barcode of the cell whose values it received. df['index'] is the
# barcode column created when df's index was reset, so it is guaranteed to be in the same
# row order as the values copied into df2 — unlike meta_data, whose row order merely
# happens to match the count matrix.
df2['cell'] = df['index']

In [23]:
# Make the barcode the row index. Not re-runnable as written: once 'cell' has become the
# index it is no longer a column, so executing this cell a second time raises KeyError.
df2 = df2.set_index('cell')

In [24]:
# Preview the combined table, now indexed by cell barcode.
df2.head()

ID,2810459M11Rik,Abcb11,Acot7,Ahctf1,Ap2a2,Asb13,Asl,Chka,Clic3,Clpx,...,Ap1s3,Vamp4,Pigm,Pard6g,Gas2,Pot1b,Zfp759,Neo1,Sco1,code
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAGCCAACCCTAA-6,-0.540902,1.08408,0.463838,0.43668,-0.351168,-0.658594,-0.04389,1.686979,-1.087222,0.2346,...,-0.166754,-0.719685,0.263727,-0.245295,2.159961,-0.401256,0.501759,-1.400617,0.561399,4.0
AAACCAACAGTAAAGC-6,-0.452915,-0.872108,0.040194,0.738373,1.136242,-0.126162,-0.387448,-0.189915,-1.491271,-0.363165,...,-0.166754,1.487384,-0.839048,-0.615568,-0.784713,-0.401256,1.202853,-0.226444,-1.835237,4.0
AAACCGAAGGTCCTAG-6,1.085506,0.73887,-0.336613,-0.546819,-1.556767,1.386236,1.673901,0.1229,-0.548618,1.131247,...,10.777388,-0.075675,0.247874,0.280195,0.821473,-0.401256,0.940912,-0.728631,0.228285,4.0
AAAGCACCATTGTCCT-6,-0.346411,0.681335,0.677034,0.980449,-0.632364,1.077614,0.643226,-0.033508,-0.725366,1.729012,...,-0.166754,0.056551,-1.058772,-0.20877,0.821473,-0.401256,-1.485775,-0.475149,1.874629,4.0
AAAGGACGTGAGGTGA-6,0.557443,-0.641969,-0.087715,0.643695,-0.266904,0.42209,-0.387448,0.435716,0.828472,-0.512606,...,-0.166754,-1.023835,1.493227,2.312298,-0.784713,-0.401256,1.787818,-1.085683,-0.70983,4.0


In [26]:
# Save the combined table, then confirm the write by reading back only the first five rows
# (nrows=5) rather than parsing the whole file just to display its head.
df2.to_csv(path/'RNA_batched.csv', index_label='cell')
pd.read_csv(path/'RNA_batched.csv', index_col='cell', low_memory=False, nrows=5)

Unnamed: 0_level_0,2810459M11Rik,Abcb11,Acot7,Ahctf1,Ap2a2,Asb13,Asl,Chka,Clic3,Clpx,...,Ap1s3,Vamp4,Pigm,Pard6g,Gas2,Pot1b,Zfp759,Neo1,Sco1,code
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAGCCAACCCTAA-6,-0.540902,1.08408,0.463838,0.43668,-0.351168,-0.658594,-0.04389,1.686979,-1.087222,0.2346,...,-0.166754,-0.719685,0.263727,-0.245295,2.159961,-0.401256,0.501759,-1.400617,0.561399,4.0
AAACCAACAGTAAAGC-6,-0.452915,-0.872108,0.040194,0.738373,1.136242,-0.126162,-0.387448,-0.189915,-1.491271,-0.363165,...,-0.166754,1.487384,-0.839048,-0.615568,-0.784713,-0.401256,1.202853,-0.226444,-1.835237,4.0
AAACCGAAGGTCCTAG-6,1.085506,0.73887,-0.336613,-0.546819,-1.556767,1.386236,1.673901,0.1229,-0.548618,1.131247,...,10.777388,-0.075675,0.247874,0.280195,0.821473,-0.401256,0.940912,-0.728631,0.228285,4.0
AAAGCACCATTGTCCT-6,-0.346411,0.681335,0.677034,0.980449,-0.632364,1.077614,0.643226,-0.033508,-0.725366,1.729012,...,-0.166754,0.056551,-1.058772,-0.20877,0.821473,-0.401256,-1.485775,-0.475149,1.874629,4.0
AAAGGACGTGAGGTGA-6,0.557443,-0.641969,-0.087715,0.643695,-0.266904,0.42209,-0.387448,0.435716,0.828472,-0.512606,...,-0.166754,-1.023835,1.493227,2.312298,-0.784713,-0.401256,1.787818,-1.085683,-0.70983,4.0
