# Multiple datasets

Just understanding data.

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch import tensor
from torcheval.metrics import MulticlassAccuracy
import torchvision.transforms.functional as TF,torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from miniai.datasets import *
from miniai.conv import *
from miniai.learner import *
from miniai.activations import *
from miniai.init import *
from miniai.sgd import *
from miniai.augment import *
from miniai.xtras import *

In [8]:
path = Path('data/scRNA_datasets')
path.ls()

(#10) [Path('data/scRNA_datasets/robustness_score.txt'),Path('data/scRNA_datasets/README.txt'),Path('data/scRNA_datasets/jtk.results.txt'),Path('data/scRNA_datasets/.snakemake_timestamp'),Path('data/scRNA_datasets/SIM'),Path('data/scRNA_datasets/bootejtk.results.txt'),Path('data/scRNA_datasets/sample_metadata.txt'),Path('data/scRNA_datasets/tpm.by_sample.txt'),Path('data/scRNA_datasets/num_reads.by_sample.txt'),Path('data/scRNA_datasets/study_metadata.txt')]

Not sure why people use tsv..

In [9]:
pd.read_csv?

In [11]:
data = pd.read_csv(path/'tpm.by_sample.txt', sep='\t', low_memory=False)
data.head()

Unnamed: 0,Name,Symbol,GSM2046160,GSM2046184,GSM2046157,GSM2046183,GSM2046155,GSM2046159,GSM2046182,GSM2046180,...,GSM6610744,GSM6610712,GSM6610730,GSM6610746,GSM6610716,GSM6610738,GSM6610724,GSM6610752,GSM6610726,GSM6610740
0,ENSMUSG00000087193,Gm14820,0.047856,0.0,0.011696,0.038796,0.0,0.0,0.0,0.04064,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016459,0.0
1,ENSMUSG00000039545,Flicr,0.043026,0.017274,0.012144,0.035352,0.025837,0.022373,0.012163,0.022002,...,0.036193,0.154141,0.018557,0.233864,0.050934,0.035516,0.078964,0.043414,0.043835,0.048446
2,ENSMUSG00000085599,Gm13449,0.013747,0.012649,0.029259,0.012449,0.080033,0.026012,0.0,0.024225,...,0.217935,0.046051,0.055907,0.243226,0.059228,0.215311,0.200141,0.228735,0.284069,0.234701
3,ENSMUSG00000084908,C79798,0.0,0.0,0.0,0.0,0.0,0.008625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028452,0.0
4,ENSMUSG00000086484,Nron,0.017114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.026489,0.0,0.0,0.0,0.0,0.0,0.0


## Looking at missing values

Some symbols are missing.

In [24]:
data.isna().sum()

Name             0
Symbol        3572
GSM2046160       0
GSM2046184       0
GSM2046157       0
              ... 
GSM6610738       0
GSM6610724       0
GSM6610752       0
GSM6610726       0
GSM6610740       0
Length: 1098, dtype: int64

In [22]:
rows_with_nans = data.isna().any(axis=1)

# Print row indexes with NaN values
rows_with_nans[rows_with_nans].index

Index([  122,   123,   124,   138,   147,   189,   209,   247,   248,   287,
       ...
       40559, 40586, 40589, 40593, 40597, 40598, 40600, 40606, 40610, 40613],
      dtype='int64', length=3572)

In [25]:
# Number of NaN values in each row
num_nans = data.isnull().sum(axis=1)

# Print rows with NaN values
num_nans[num_nans > 0].index

Index([  122,   123,   124,   138,   147,   189,   209,   247,   248,   287,
       ...
       40559, 40586, 40589, 40593, 40597, 40598, 40600, 40606, 40610, 40613],
      dtype='int64', length=3572)

In [26]:
data[data.isna().any(axis=1)]

Unnamed: 0,Name,Symbol,GSM2046160,GSM2046184,GSM2046157,GSM2046183,GSM2046155,GSM2046159,GSM2046182,GSM2046180,...,GSM6610744,GSM6610712,GSM6610730,GSM6610746,GSM6610716,GSM6610738,GSM6610724,GSM6610752,GSM6610726,GSM6610740
122,ENSMUST00000149681,,0.110351,0.213971,0.167431,0.061828,0.320481,0.064312,0.046666,0.079966,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
123,ENSMUST00000143317,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038849,0.000000,0.000000,0.000000
124,ENSMUST00000149696,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.019611,...,0.000000,0.000000,0.000000,0.000000,0.054180,0.000000,0.000000,0.000000,0.037655,0.000000
138,ENSMUST00000181630,,0.710315,0.070144,0.383745,0.312489,0.000000,0.198669,0.226334,0.590934,...,0.154965,0.476987,0.477021,0.258894,0.000000,0.331951,1.156590,0.776468,1.038660,0.736428
147,ENSMUST00000160268,,0.000000,0.033423,0.079071,0.064323,0.000000,0.000000,0.000000,0.000000,...,0.204347,0.000000,0.148935,0.152106,0.000000,0.085997,0.134906,0.157936,0.165046,0.061408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40598,ENSMUST00000180216,,0.000000,0.000000,0.000000,0.000000,0.000000,0.007698,0.000000,0.000000,...,0.012915,0.100363,0.114297,0.000000,0.000000,0.018394,0.016534,0.000000,0.000000,0.000000
40600,ENSMUST00000165439,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
40606,ENSMUST00000177819,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.166627,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
40610,ENSMUST00000058336,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Delete missing symbols..

In [28]:
data = data.dropna(how='any')
data.isna().sum().sum()

0

In [29]:
data.head()

Unnamed: 0,Name,Symbol,GSM2046160,GSM2046184,GSM2046157,GSM2046183,GSM2046155,GSM2046159,GSM2046182,GSM2046180,...,GSM6610744,GSM6610712,GSM6610730,GSM6610746,GSM6610716,GSM6610738,GSM6610724,GSM6610752,GSM6610726,GSM6610740
0,ENSMUSG00000087193,Gm14820,0.047856,0.0,0.011696,0.038796,0.0,0.0,0.0,0.04064,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016459,0.0
1,ENSMUSG00000039545,Flicr,0.043026,0.017274,0.012144,0.035352,0.025837,0.022373,0.012163,0.022002,...,0.036193,0.154141,0.018557,0.233864,0.050934,0.035516,0.078964,0.043414,0.043835,0.048446
2,ENSMUSG00000085599,Gm13449,0.013747,0.012649,0.029259,0.012449,0.080033,0.026012,0.0,0.024225,...,0.217935,0.046051,0.055907,0.243226,0.059228,0.215311,0.200141,0.228735,0.284069,0.234701
3,ENSMUSG00000084908,C79798,0.0,0.0,0.0,0.0,0.0,0.008625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028452,0.0
4,ENSMUSG00000086484,Nron,0.017114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.026489,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40601,40602,40603,40604,40605,40607,40608,40609,40611,40612
Name,ENSMUSG00000087193,ENSMUSG00000039545,ENSMUSG00000085599,ENSMUSG00000084908,ENSMUSG00000086484,ENSMUSG00000085087,ENSMUSG00000085224,ENSMUSG00000085491,ENSMUSG00000086994,ENSMUSG00000086669,...,ENSMUSG00000024810,ENSMUSG00000083761,ENSMUSG00000092935,ENSMUSG00000073482,ENSMUSG00000092496,ENSMUSG00000083580,ENSMUSG00000085891,ENSMUSG00000063488,ENSMUSG00000025754,ENSMUSG00000092464
Symbol,Gm14820,Flicr,Gm13449,C79798,Nron,Gm13528,Gm13425,4930527E20Rik,Gm13402,AA645442,...,Il33,Pgam1-ps1,Gm22697,Gm10517,Vmn1r-ps84,Rpsa-ps3,Gm14634,Zkscan7,Agbl1,Gm8902
GSM2046160,0.047856,0.043026,0.013747,0.0,0.017114,0.184585,0.0,0.0,0.0,0.0,...,0.585986,0.04335,0.0,0.0,0.0,0.0,0.029685,2.69082,0.0,0.0
GSM2046184,0.0,0.017274,0.012649,0.0,0.0,0.225739,0.0,0.0,0.0,0.084449,...,0.682593,0.038878,0.0,0.0,0.0,0.017092,0.031688,2.20585,0.0,0.0
GSM2046157,0.011696,0.012144,0.029259,0.0,0.0,0.246204,0.0,0.0,0.0,0.0,...,0.521939,0.045233,0.0,0.1197,0.0,0.0,0.0,2.56285,0.0,0.015267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM6610738,0.0,0.035516,0.215311,0.0,0.0,0.0,0.0,0.0,0.130749,0.316231,...,0.794336,0.3189,0.0,0.0,0.0,0.0,0.023322,1.85593,0.0,0.0
GSM6610724,0.0,0.078964,0.200141,0.0,0.0,0.806385,0.0,0.0,0.0,0.0,...,0.752421,0.165695,0.0,0.110278,0.0,0.0,0.070138,1.70914,0.0,0.0
GSM6610752,0.0,0.043414,0.228735,0.0,0.0,0.447329,0.0,0.0,0.0,0.405717,...,0.796097,0.265712,0.0,0.0,0.0,0.0,0.054193,2.7101,0.0,0.0
GSM6610726,0.016459,0.043835,0.284069,0.028452,0.0,1.18465,0.0,0.0,0.0,0.829571,...,0.715682,0.034111,0.0,0.276602,0.0,0.0,0.047538,1.66776,0.0,0.0


In [34]:
data = data.T.drop(index="Name")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40601,40602,40603,40604,40605,40607,40608,40609,40611,40612
Symbol,Gm14820,Flicr,Gm13449,C79798,Nron,Gm13528,Gm13425,4930527E20Rik,Gm13402,AA645442,...,Il33,Pgam1-ps1,Gm22697,Gm10517,Vmn1r-ps84,Rpsa-ps3,Gm14634,Zkscan7,Agbl1,Gm8902
GSM2046160,0.047856,0.043026,0.013747,0.0,0.017114,0.184585,0.0,0.0,0.0,0.0,...,0.585986,0.04335,0.0,0.0,0.0,0.0,0.029685,2.69082,0.0,0.0
GSM2046184,0.0,0.017274,0.012649,0.0,0.0,0.225739,0.0,0.0,0.0,0.084449,...,0.682593,0.038878,0.0,0.0,0.0,0.017092,0.031688,2.20585,0.0,0.0
GSM2046157,0.011696,0.012144,0.029259,0.0,0.0,0.246204,0.0,0.0,0.0,0.0,...,0.521939,0.045233,0.0,0.1197,0.0,0.0,0.0,2.56285,0.0,0.015267
GSM2046183,0.038796,0.035352,0.012449,0.0,0.0,0.207048,0.0,0.0,0.0,0.0,...,0.404564,0.017379,0.0,0.170666,0.0,0.0,0.009918,2.47665,0.0,0.0


In [36]:
data.columns = data.loc['Symbol']

In [39]:
data = data.drop(index='Symbol')
data.head()

Symbol,Gm14820,Flicr,Gm13449,C79798,Nron,Gm13528,Gm13425,4930527E20Rik,Gm13402,AA645442,...,Il33,Pgam1-ps1,Gm22697,Gm10517,Vmn1r-ps84,Rpsa-ps3,Gm14634,Zkscan7,Agbl1,Gm8902
GSM2046160,0.047856,0.043026,0.013747,0.0,0.017114,0.184585,0.0,0.0,0.0,0.0,...,0.585986,0.04335,0.0,0.0,0.0,0.0,0.029685,2.69082,0.0,0.0
GSM2046184,0.0,0.017274,0.012649,0.0,0.0,0.225739,0.0,0.0,0.0,0.084449,...,0.682593,0.038878,0.0,0.0,0.0,0.017092,0.031688,2.20585,0.0,0.0
GSM2046157,0.011696,0.012144,0.029259,0.0,0.0,0.246204,0.0,0.0,0.0,0.0,...,0.521939,0.045233,0.0,0.1197,0.0,0.0,0.0,2.56285,0.0,0.015267
GSM2046183,0.038796,0.035352,0.012449,0.0,0.0,0.207048,0.0,0.0,0.0,0.0,...,0.404564,0.017379,0.0,0.170666,0.0,0.0,0.009918,2.47665,0.0,0.0
GSM2046155,0.0,0.025837,0.080033,0.0,0.0,0.157396,0.0,0.0,0.0,0.619891,...,0.36724,0.088547,0.0,0.134558,0.0,0.0,0.007446,2.71046,0.0,0.011883


## Meta data

In [12]:
meta_data = pd.read_csv(path/'sample_metadata.txt', sep='\t', low_memory=False)
meta_data

Unnamed: 0,sample,study,time,outlier
0,GSM2046160,Weger19A,22.0,False
1,GSM2046184,Weger19A,22.0,False
2,GSM2046157,Weger19A,10.0,False
3,GSM2046183,Weger19A,18.0,False
4,GSM2046155,Weger19A,2.0,False


In [43]:
data.shape, meta_data.shape

((1096, 37042), (1096, 4))

In [46]:
meta_data['time']

0       22.0
1       22.0
2       10.0
3       18.0
4        2.0
        ... 
1091    13.5
1092     7.5
1093    19.5
1094     7.5
1095    13.5
Name: time, Length: 1096, dtype: float64

In [47]:
data['code'] = meta_data['time']
data

Symbol,Gm14820,Flicr,Gm13449,C79798,Nron,Gm13528,Gm13425,4930527E20Rik,Gm13402,AA645442,...,Pgam1-ps1,Gm22697,Gm10517,Vmn1r-ps84,Rpsa-ps3,Gm14634,Zkscan7,Agbl1,Gm8902,code
GSM2046160,0.047856,0.043026,0.013747,0.0,0.017114,0.184585,0.0,0.0,0.0,0.0,...,0.04335,0.0,0.0,0.0,0.0,0.029685,2.69082,0.0,0.0,
GSM2046184,0.0,0.017274,0.012649,0.0,0.0,0.225739,0.0,0.0,0.0,0.084449,...,0.038878,0.0,0.0,0.0,0.017092,0.031688,2.20585,0.0,0.0,
GSM2046157,0.011696,0.012144,0.029259,0.0,0.0,0.246204,0.0,0.0,0.0,0.0,...,0.045233,0.0,0.1197,0.0,0.0,0.0,2.56285,0.0,0.015267,
GSM2046183,0.038796,0.035352,0.012449,0.0,0.0,0.207048,0.0,0.0,0.0,0.0,...,0.017379,0.0,0.170666,0.0,0.0,0.009918,2.47665,0.0,0.0,
GSM2046155,0.0,0.025837,0.080033,0.0,0.0,0.157396,0.0,0.0,0.0,0.619891,...,0.088547,0.0,0.134558,0.0,0.0,0.007446,2.71046,0.0,0.011883,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM6610738,0.0,0.035516,0.215311,0.0,0.0,0.0,0.0,0.0,0.130749,0.316231,...,0.3189,0.0,0.0,0.0,0.0,0.023322,1.85593,0.0,0.0,
GSM6610724,0.0,0.078964,0.200141,0.0,0.0,0.806385,0.0,0.0,0.0,0.0,...,0.165695,0.0,0.110278,0.0,0.0,0.070138,1.70914,0.0,0.0,
GSM6610752,0.0,0.043414,0.228735,0.0,0.0,0.447329,0.0,0.0,0.0,0.405717,...,0.265712,0.0,0.0,0.0,0.0,0.054193,2.7101,0.0,0.0,
GSM6610726,0.016459,0.043835,0.284069,0.028452,0.0,1.18465,0.0,0.0,0.0,0.829571,...,0.034111,0.0,0.276602,0.0,0.0,0.047538,1.66776,0.0,0.0,
