FYI: MNN method requires genes in rows, cells in columns

First data set: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81076
Second data set: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85241

How are we going to define our batches? 

-For the first data set, we can define batches by the organ donors --> donors 2, 3, 7, 10, 17 (5 different batches)
-For the second data set, we can define batches by the organ donors --> donors 28, 29, 30, and 31

-The cells studied for the organ donors in the second data set are consistent (8 libraries of cells for each donor), whereas the cells studied for the organ donors in the first data set are inconsistent (different cell types, different number of libraries of cells studied).

In [1]:
import pandas as pd
import sklearn # scikit learn package will be very helpful
import numpy as np

from sklearn.neighbors import NearestNeighbors  # for finding nearest neighbors between two data sets

In [2]:
# reading in the first pancreas data set

pancreas_data = pd.read_table("GSE81076_D2_3_7_10_17.txt")  
pancreas_data

Unnamed: 0.1,Unnamed: 0,D2ex_1,D2ex_2,D2ex_3,D2ex_4,D2ex_5,D2ex_6,D2ex_7,D2ex_8,D2ex_9,...,D17TGFB_87,D17TGFB_88,D17TGFB_89,D17TGFB_90,D17TGFB_91,D17TGFB_92,D17TGFB_93,D17TGFB_94,D17TGFB_95,D17TGFB_96
0,A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,A1BG__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.001958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,A1CF__chr10,0.000000,2.007853,1.001958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.001958,0.000000,0.000000,1.001958,1.001958,1.001958,1.001958,0.000000,0.0
3,A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.007853,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20143,ZYG11A__chr1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
20144,ZYG11B__chr1,0.000000,1.001958,0.000000,5.049473,1.001958,1.001958,1.001958,1.001958,0.000000,...,1.001958,1.001958,2.007853,0.000000,0.000000,1.001958,1.001958,1.001958,1.001958,0.0
20145,ZYX__chr7,1.001958,0.000000,3.017717,2.007853,0.000000,4.031579,0.000000,1.001958,2.007853,...,7.097484,4.031579,0.000000,1.001958,1.001958,0.000000,0.000000,0.000000,1.001958,0.0
20146,ZZEF1__chr17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.001958,0.000000,...,0.000000,0.000000,1.001958,0.000000,0.000000,1.001958,0.000000,0.000000,0.000000,0.0


In [3]:
# setting the gene names as the row names

pancreas_data = pancreas_data.rename(columns = {"Unnamed: 0":"Genes"}) 
pancreas_data.set_index(["Genes"], inplace = True)
pancreas_data

Unnamed: 0_level_0,D2ex_1,D2ex_2,D2ex_3,D2ex_4,D2ex_5,D2ex_6,D2ex_7,D2ex_8,D2ex_9,D2ex_10,...,D17TGFB_87,D17TGFB_88,D17TGFB_89,D17TGFB_90,D17TGFB_91,D17TGFB_92,D17TGFB_93,D17TGFB_94,D17TGFB_95,D17TGFB_96
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A1BG__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,1.001958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A1CF__chr10,0.000000,2.007853,1.001958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,1.001958,0.000000,0.000000,1.001958,1.001958,1.001958,1.001958,0.000000,0.0
A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.007853,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
ZYG11B__chr1,0.000000,1.001958,0.000000,5.049473,1.001958,1.001958,1.001958,1.001958,0.000000,0.0,...,1.001958,1.001958,2.007853,0.000000,0.000000,1.001958,1.001958,1.001958,1.001958,0.0
ZYX__chr7,1.001958,0.000000,3.017717,2.007853,0.000000,4.031579,0.000000,1.001958,2.007853,0.0,...,7.097484,4.031579,0.000000,1.001958,1.001958,0.000000,0.000000,0.000000,1.001958,0.0
ZZEF1__chr17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.001958,0.000000,0.0,...,0.000000,0.000000,1.001958,0.000000,0.000000,1.001958,0.000000,0.000000,0.000000,0.0


In [4]:
# for cosine normalization

from sklearn.preprocessing import Normalizer


In [5]:
# cosine normalization of first pancreas data set - gets rid of row names and column names though...

transformer = Normalizer().fit(pancreas_data)
pancreas_data_cnorm = pd.DataFrame(transformer.transform(pancreas_data))
pancreas_data_cnorm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1718,1719,1720,1721,1722,1723,1724,1725,1726,1727
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.033422,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.000000,0.066485,0.033177,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.033177,0.000000,0.000000,0.033177,0.033177,0.033177,0.033177,0.000000,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.121129,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
20144,0.000000,0.007053,0.000000,0.035542,0.007053,0.007053,0.007053,0.007053,0.000000,0.0,...,0.007053,0.007053,0.014133,0.000000,0.000000,0.007053,0.007053,0.007053,0.007053,0.0
20145,0.013294,0.000000,0.040040,0.026641,0.000000,0.053492,0.000000,0.013294,0.026641,0.0,...,0.094171,0.053492,0.000000,0.013294,0.013294,0.000000,0.000000,0.000000,0.013294,0.0
20146,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.042107,0.000000,0.0,...,0.000000,0.000000,0.042107,0.000000,0.000000,0.042107,0.000000,0.000000,0.000000,0.0


In [18]:
# function adapted from HW1 - cosine normalization 

def normalizing_cells_MaxAbsScaler(data):
    
    """
        input data: data frame with gene expression data  
            columns are the cells and rows are genes
        output data_normalized_data_values: normalized (scaled) data
        function:  dividing the value of each gene for each cell 
                by the maximum value of that cell.
    """
    import sklearn.preprocessing
    
    # creating the list of patients
    cells = data.columns.values
    
    #using Sklearn to scale the data 
    scaler = sklearn.preprocessing.MaxAbsScaler()
    data_scaled = scaler.fit_transform(data)
    
    #creating the dataframe, the output of sklearn MaxAbsScaler is an array
    data_normalized_data_values = pd.DataFrame(data_scaled, columns= cells, index = data.index)
      
    return data_normalized_data_values

In [19]:
# cosine normalization here keeps the gene and cell names in the rows and columns

normalized_pancreas = normalizing_cells_MaxAbsScaler(pancreas_data)
normalized_pancreas

Unnamed: 0_level_0,D2ex_1,D2ex_2,D2ex_3,D2ex_4,D2ex_5,D2ex_6,D2ex_7,D2ex_8,D2ex_9,D2ex_10,...,D17TGFB_87,D17TGFB_88,D17TGFB_89,D17TGFB_90,D17TGFB_91,D17TGFB_92,D17TGFB_93,D17TGFB_94,D17TGFB_95,D17TGFB_96
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
A1BG__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.005776,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
A1CF__chr10,0.000000,0.001993,0.000941,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.005776,0.000000,0.000000,0.00088,0.00088,0.000627,0.001313,0.000000,0.0
A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.002632,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
ZYG11B__chr1,0.000000,0.000994,0.000000,0.038319,0.012049,0.007604,0.008911,0.01077,0.00000,0.0,...,0.005340,0.005776,0.001257,0.000000,0.00000,0.00088,0.000627,0.001313,0.000941,0.0
ZYX__chr7,0.003588,0.000000,0.002834,0.015237,0.000000,0.030595,0.000000,0.01077,0.01070,0.0,...,0.037824,0.023240,0.000000,0.001595,0.00088,0.00000,0.000000,0.000000,0.000941,0.0
ZZEF1__chr17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.01077,0.00000,0.0,...,0.000000,0.000000,0.000627,0.000000,0.00000,0.00088,0.000000,0.000000,0.000000,0.0


In [15]:
# dropping missing values - looks like there aren't any here

normalized_pancreas.dropna(axis=1) 
normalized_pancreas

Unnamed: 0_level_0,D2ex_1,D2ex_2,D2ex_3,D2ex_4,D2ex_5,D2ex_6,D2ex_7,D2ex_8,D2ex_9,D2ex_10,...,D17TGFB_87,D17TGFB_88,D17TGFB_89,D17TGFB_90,D17TGFB_91,D17TGFB_92,D17TGFB_93,D17TGFB_94,D17TGFB_95,D17TGFB_96
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
A1BG__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.005776,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
A1CF__chr10,0.000000,0.001993,0.000941,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.005776,0.000000,0.000000,0.00088,0.00088,0.000627,0.001313,0.000000,0.0
A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.002632,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.0
ZYG11B__chr1,0.000000,0.000994,0.000000,0.038319,0.012049,0.007604,0.008911,0.01077,0.00000,0.0,...,0.005340,0.005776,0.001257,0.000000,0.00000,0.00088,0.000627,0.001313,0.000941,0.0
ZYX__chr7,0.003588,0.000000,0.002834,0.015237,0.000000,0.030595,0.000000,0.01077,0.01070,0.0,...,0.037824,0.023240,0.000000,0.001595,0.00088,0.00000,0.000000,0.000000,0.000941,0.0
ZZEF1__chr17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.01077,0.00000,0.0,...,0.000000,0.000000,0.000627,0.000000,0.00000,0.00088,0.000000,0.000000,0.000000,0.0


In [None]:
# calculating Euclidean distances for the first pancreas data set - how we determine which pairs of cells to calculate
# these distances for? Probably would have to make a for loop and calculate distances for every pair of cells and store
# those distances in a vector or data frame

from scipy.spatial import distance

d = distance.euclidean(p1, p2) # p1 and p2 would be pairs of cells here

In [6]:
# reading in the second pancreas data set - gene names are already the row names here

pancreas_data_2 = pd.read_table("GSE85241_cellsystems_dataset_4donors_updated.csv")  
pancreas_data_2

Unnamed: 0,D28-1_1,D28-1_2,D28-1_3,D28-1_4,D28-1_5,D28-1_6,D28-1_7,D28-1_8,D28-1_9,D28-1_10,...,D30-8_87,D30-8_88,D30-8_89,D30-8_90,D30-8_91,D30-8_92,D30-8_93,D30-8_94,D30-8_95,D30-8_96
A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
A1BG__chr19,0.000000,0.000000,1.001958,1.001958,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.007853,0.0,0.0
A1CF__chr10,6.071431,0.000000,2.007853,6.071431,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.001958,7.097484,3.017717,0.000000,3.017717,0.000000,0.000000,0.0,0.0
A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.007853,0.000000,1.001958,0.000000,0.000000,0.000000,0.0,0.0
A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.000000,0.000000,0.000000,1.001958,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
ZYG11B__chr1,2.007853,6.071431,1.001958,6.071431,3.017717,0.0,0.0,0.0,0.0,0.0,...,9.162012,2.007853,2.007853,1.001958,4.031579,2.007853,3.017717,9.162012,0.0,0.0
ZYX__chr7,0.000000,5.049473,0.000000,1.001958,0.000000,0.0,0.0,0.0,0.0,0.0,...,3.017717,0.000000,0.000000,0.000000,0.000000,1.001958,8.127667,5.049473,0.0,0.0
ZZEF1__chr17,0.000000,4.031579,1.001958,3.017717,0.000000,0.0,0.0,0.0,0.0,0.0,...,2.007853,0.000000,1.001958,0.000000,4.031579,0.000000,2.007853,3.017717,0.0,0.0


In [7]:
# cosine normalization for the second pancreas data set - gets rid of row names and column names though...

transformer = Normalizer().fit(pancreas_data_2)
pancreas_data_2_cnorm = pd.DataFrame(transformer.transform(pancreas_data_2))
pancreas_data_2_cnorm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,0.000000,0.000000,0.048765,0.048765,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.097721,0.0,0.0
2,0.030688,0.000000,0.010149,0.030688,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.005064,0.035874,0.015253,0.000000,0.015253,0.000000,0.000000,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.236062,0.000000,0.117800,0.000000,0.000000,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19135,0.000000,0.000000,0.000000,0.179605,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
19136,0.013151,0.039767,0.006563,0.039767,0.019765,0.0,0.0,0.0,0.0,0.0,...,0.060009,0.013151,0.013151,0.006563,0.026406,0.013151,0.019765,0.060009,0.0,0.0
19137,0.000000,0.045033,0.000000,0.008936,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.026913,0.000000,0.000000,0.000000,0.000000,0.008936,0.072486,0.045033,0.0,0.0
19138,0.000000,0.042288,0.010510,0.031653,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.021061,0.000000,0.010510,0.000000,0.042288,0.000000,0.021061,0.031653,0.0,0.0


In [21]:
# cosine normalization here keeps the gene and cell names in the rows and columns

normalized_pancreas_2 = normalizing_cells_MaxAbsScaler(pancreas_data_2)
normalized_pancreas_2

Unnamed: 0,D28-1_1,D28-1_2,D28-1_3,D28-1_4,D28-1_5,D28-1_6,D28-1_7,D28-1_8,D28-1_9,D28-1_10,...,D30-8_87,D30-8_88,D30-8_89,D30-8_90,D30-8_91,D30-8_92,D30-8_93,D30-8_94,D30-8_95,D30-8_96
A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
A1BG__chr19,0.000000,0.000000,0.000627,0.001043,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002492,0.0,0.0
A1CF__chr10,0.003802,0.000000,0.001257,0.006319,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000627,0.004444,0.002429,0.000000,0.001890,0.000000,0.000000,0.0,0.0
A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.001257,0.000000,0.001505,0.000000,0.000000,0.000000,0.0,0.0
A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.000000,0.000000,0.000000,0.001043,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
ZYG11B__chr1,0.001257,0.007314,0.000627,0.006319,0.011157,0.0,0.0,0.0,0.0,0.0,...,0.016433,0.001257,0.001257,0.000807,0.006055,0.001257,0.005413,0.011371,0.0,0.0
ZYX__chr7,0.000000,0.006083,0.000000,0.001043,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.005413,0.000000,0.000000,0.000000,0.000000,0.000627,0.014578,0.006267,0.0,0.0
ZZEF1__chr17,0.000000,0.004857,0.000627,0.003141,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.003601,0.000000,0.000627,0.000000,0.006055,0.000000,0.003601,0.003745,0.0,0.0


In [22]:
# dropping missing values - looks like there aren't any here

normalized_pancreas_2.dropna(axis=1) 
normalized_pancreas_2

Unnamed: 0,D28-1_1,D28-1_2,D28-1_3,D28-1_4,D28-1_5,D28-1_6,D28-1_7,D28-1_8,D28-1_9,D28-1_10,...,D30-8_87,D30-8_88,D30-8_89,D30-8_90,D30-8_91,D30-8_92,D30-8_93,D30-8_94,D30-8_95,D30-8_96
A1BG-AS1__chr19,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
A1BG__chr19,0.000000,0.000000,0.000627,0.001043,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002492,0.0,0.0
A1CF__chr10,0.003802,0.000000,0.001257,0.006319,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000627,0.004444,0.002429,0.000000,0.001890,0.000000,0.000000,0.0,0.0
A2M-AS1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.001257,0.000000,0.001505,0.000000,0.000000,0.000000,0.0,0.0
A2ML1__chr12,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A__chr1,0.000000,0.000000,0.000000,0.001043,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
ZYG11B__chr1,0.001257,0.007314,0.000627,0.006319,0.011157,0.0,0.0,0.0,0.0,0.0,...,0.016433,0.001257,0.001257,0.000807,0.006055,0.001257,0.005413,0.011371,0.0,0.0
ZYX__chr7,0.000000,0.006083,0.000000,0.001043,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.005413,0.000000,0.000000,0.000000,0.000000,0.000627,0.014578,0.006267,0.0,0.0
ZZEF1__chr17,0.000000,0.004857,0.000627,0.003141,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.003601,0.000000,0.000627,0.000000,0.006055,0.000000,0.003601,0.003745,0.0,0.0


In [None]:
# calculating Euclidean distances for the second pancreas data set - how we determine which pairs of cells to calculate
# these distances for? Probably would have to make a for loop and calculate distances for every pair of cells and store
# those distances in a vector or data frame

d = distance.euclidean(p1, p2) # p1 and p2 would be pairs of cells here

In [8]:
pancreas_data_3 = pd.read_table("GSE86469_GEO.islet.single.cell.processed.data.RSEM.raw.expected.counts.csv") 
pancreas_data_3

# only column here --> ??

Unnamed: 0,",""10th_C10_S104"",""10th_C11_S96"",""10th_C13_S61"",""10th_C14_S53"",""10th_C16_S105"",""10th_C17_S97"",""10th_C19_S62"",""10th_C1_S59"",""10th_C20_S54"",""10th_C23_S98"",""10th_C24_S90"",""10th_C28_S91"",""10th_C30_S107"",""10th_C31_S48"",""10th_C32_S56"",""10th_C33_S64"",""10th_C34_S92"",""10th_C37_S49"",""10th_C41_S101"",""10th_C42_S109"",""10th_C43_S50"",""10th_C45_S66"",""10th_C46_S94"",""10th_C4_S103"",""10th_C50_S73"",""10th_C51_S80"",""10th_C52_S110"",""10th_C53_S118"",""10th_C54_S124"",""10th_C56_S74"",""10th_C57_S81"",""10th_C58_S111"",""10th_C5_S95"",""10th_C61_S68"",""10th_C63_S82"",""10th_C64_S112"",""10th_C66_S126"",""10th_C68_S75"",""10th_C6_S88"",""10th_C73_S84"",""10th_C74_S76"",""10th_C76_S128"",""10th_C77_S120"",""10th_C78_S114"",""10th_C7_S60"",""10th_C80_S77"",""10th_C81_S70"",""10th_C84_S115"",""10th_C85_S86"",""10th_C86_S78"",""10th_C87_S71"",""10th_C88_S130"",""10th_C8_S52"",""10th_C90_S116"",""10th_C95_S123"",""10th_C96_S117"",""11th-C10_S63"",""11th-C12_S47"",""11th-C16_S64"",""11th-C21_S4"",""11th-C22_S65"",""11th-C24_S49"",""11th-C26_S12"",""11th-C27_S19"",""11th-C2_S8"",""11th-C30_S66"",""11th-C33_S20"",""11th-C36_S67"",""11th-C37_S6"",""11th-C39_S21"",""11th-C40_S52"",""11th-C41_S60"",""11th-C43_S7"",""11th-C44_S15"",""11th-C49_S23"",""11th-C4_S62"",""11th-C51_S39"",""11th-C53_S78"",""11th-C54_S86"",""11th-C55_S24"",""11th-C56_S32"",""11th-C57_S40"",""11th-C58_S71"",""11th-C5_S54"",""11th-C61_S25"",""11th-C62_S33"",""11th-C64_S72"",""11th-C66_S88"",""11th-C67_S26"",""11th-C68_S34"",""11th-C77_S82"",""11th-C7_S16"",""11th-C93_S30"",""12th-C10_S155"",""12th-C11_S147"",""12th-C12_S139"",""12th-C14_S190"",""12th-C15_S183"",""12th-C1_S195"",""12th-C20_S191"",""12th-C21_S184"",""12th-C23_S149"",""12th-C24_S141"",""12th-C25_S185"",""12th-C26_S192"",""12th-C28_S142"",""12th-C29_S150"",""12th-C30_S157"",""12th-C33_S200"",""12th-C35_S151"",""12th-C36_S158"",""12th-C37_S187"",""12th-C38_S194"",""12th-C39_S201"",""12th-C3_S181"",""12th-C40_S144"",""12th-C42_S159"",""12th-C46_S145"",""12th-C47_S153"",""12th-C48_S160"",""12th-C4_S154"",""12th-C50_S210"",""12th-C51_S217"",""12th-C53_S168"",""12th-C55_S203"",""12th-C56_S211"",""12th-C57_S218"",""12th-C58_S162"",""12th-C59_S169"",""12th-C60_S176"",""12th-C63_S219"",""12th-C64_S163"",""12th-C65_S170"",""12th-C67_S205"",""12th-C69_S135"",""12th-C71_S171"",""12th-C74_S214"",""12th-C75_S206"",""12th-C76_S177"",""12th-C7_S196"",""12th-C80_S215"",""12th-C81_S207"",""12th-C84_S165"",""12th-C85_S137"",""12th-C8_S189"",""12th-C90_S166"",""12th-C93_S209"",""12th-C94_S180"",""12th-C95_S174"",""12th-C96_S167"",""13th_C12_S178"",""13th_C13_S149"",""13th_C15_S134"",""13th_C16_S193"",""13th_C17_S186"",""13th_C18_S179"",""13th_C1_S147"",""13th_C20_S142"",""13th_C21_S135"",""13th_C23_S187"",""13th_C27_S151"",""13th_C29_S188"",""13th_C31_S137"",""13th_C34_S181"",""13th_C36_S196"",""13th_C38_S145"",""13th_C39_S153"",""13th_C3_S132"",""13th_C46_S183"",""13th_C49_S155"",""13th_C4_S191"",""13th_C51_S170"",""13th_C52_S198"",""13th_C53_S205"",""13th_C54_S212"",""13th_C55_S156"",""13th_C57_S171"",""13th_C5_S184"",""13th_C60_S213"",""13th_C63_S172"",""13th_C66_S214"",""13th_C68_S165"",""13th_C69_S173"",""13th_C6_S177"",""13th_C70_S201"",""13th_C71_S208"",""13th_C72_S215"",""13th_C75_S159"",""13th_C76_S216"",""13th_C77_S209"",""13th_C79_S174"",""13th_C80_S167"",""13th_C81_S160"",""13th_C86_S168"",""13th_C88_S218"",""13th_C89_S211"",""13th_C8_S140"",""13th_C92_S169"",""13th_C93_S162"",""13th_C94_S219"",""13th_C96_S204"",""1st-61_S27"",""1st-C11_S58"",""1st-C13_S19"",""1st-C15_S3"",""1st-C18_S51"",""1st-C19_S20"",""1st-C20_S12"",""1st-C21_S4"",""1st-C32_S14"",""1st-C34_S54"",""1st-C39_S23"",""1st-C42_S71"",""1st-C50_S33"",""1st-C51_S41"",""1st-C53_S74"",""1st-C56_S34"",""1st-C57_S42"",""1st-C58_S76"",""1st-C59_S77"",""1st-C62_S35"",""1st-C64_S79"",""1st-C68_S36"",""1st-C69_S44"",""1st-C71_S83"",""1st-C73_S45"",""1st-C74_S37"",""1st-C75_S29"",""1st-C77_S86"",""1st-C78_S85"",""1st-C79_S46"",""1st-C80_S38"",""1st-C8_S10"",""1st-C90_S91"",""1st-C96_S94"",""2nd-C11_S20"",""2nd-C15_S25"",""2nd-C16_S33"",""2nd-C17_S32"",""2nd-C19_S39"",""2nd-C1_S3"",""2nd-C21_S37"",""2nd-C25_S49"",""2nd-C26_S50"",""2nd-C27_S51"",""2nd-C28_S55"",""2nd-C29_S56"",""2nd-C2_S2"",""2nd-C31_S61"",""2nd-C32_S62"",""2nd-C36_S69"",""2nd-C37_S73"",""2nd-C38_S74"",""2nd-C39_S75"",""2nd-C3_S1"",""2nd-C47_S92"",""2nd-C49_S4"",""2nd-C4_S9"",""2nd-C51_S6"",""2nd-C52_S10"",""2nd-C54_S12"",""2nd-C57_S18"",""2nd-C58_S22"",""2nd-C5_S8"",""2nd-C62_S29"",""2nd-C67_S40"",""2nd-C69_S42"",""2nd-C6_S7"",""2nd-C72_S48"",""2nd-C74_S53"",""2nd-C75_S52"",""2nd-C76_S60"",""2nd-C77_S59"",""2nd-C7_S15"",""2nd-C80_S65"",""2nd-C81_S64"",""2nd-C83_S71"",""2nd-C86_S77"",""2nd-C87_S76"",""2nd-C88_S84"",""2nd-C8_S14"",""2nd-C93_S88"",""2nd-C94_S96"",""2nd-C95_S95"",""3rd-C11_S58"",""3rd-C12_S68"",""3rd-C18_S80"",""3rd-C1_S39"",""3rd-C20_S22"",""3rd-C21_S11"",""3rd-C23_S94"",""3rd-C25_S15"",""3rd-C26_S28"",""3rd-C27_S49"",""3rd-C32_S31"",""3rd-C33_S50"",""3rd-C35_S23"",""3rd-C36_S2"",""3rd-C3_S2"",""3rd-C40_S57"",""3rd-C42_S74"",""3rd-C44_S32"",""3rd-C46_S93"",""3rd-C47_S35"",""3rd-C49_S60"",""3rd-C4_S25"",""3rd-C51_S87"",""3rd-C52_S86"",""3rd-C53_S67"",""3rd-C56_S90"",""3rd-C57_S5"",""3rd-C58_S3"",""3rd-C59_S91"",""3rd-C60_S10"",""3rd-C61_S64"",""3rd-C63_S54"",""3rd-C65_S20"",""3rd-C67_S69"",""3rd-C68_S13"",""3rd-C69_S78"",""3rd-C6_S44"",""3rd-C71_S80"",""3rd-C76_S22"",""3rd-C79_S67"",""3rd-C7_S40"",""3rd-C82_S11"",""3rd-C84_S75"",""3rd-C85_S79"",""3rd-C85_S83"",""3rd-C86_S85"",""3rd-C87_S88"",""3rd-C88_S24"",""3rd-C91_S91"",""3rd-C92_S86"",""3rd-C95_S81"",""3rd-C96_S79"",""3rd-C9_S5"",""4th-61_S28"",""4th-C12_S19"",""4th-C14_S26"",""4th-C18_S31"",""4th-C22_S45"",""4th-C23_S44"",""4th-C24_S43"",""4th-C26_S50"",""4th-C27_S51"",""4th-C28_S55"",""4th-C29_S56"",""4th-C32_S62"",""4th-C33_S63"",""4th-C35_S68"",""4th-C36_S69"",""4th-C37_S73"",""4th-C3_S1"",""4th-C40_S79"",""4th-C43_S85"",""4th-C45_S87"",""4th-C49_S4"",""4th-C4_S9"",""4th-C50_S5"",""4th-C51_S6"",""4th-C56_S17"",""4th-C57_S18"",""4th-C59_S23"",""4th-C5_S8"",""4th-C62_S29"",""4th-C63_S30"",""4th-C64_S34"",""4th-C66_S36"",""4th-C67_S40"",""4th-C68_S41"",""4th-C69_S42"",""4th-C70_S46"",""4th-C73_S54"",""4th-C75_S52"",""4th-C78_S58"",""4th-C79_S66"",""4th-C7_S15"",""4th-C80_S65"",""4th-C81_S64"",""4th-C82_S72"",""4th-C83_S71"",""4th-C85_S78"",""4th-C86_S77"",""4th-C87_S76"",""4th-C88_S84"",""4th-C89_S83"",""4th-C8_S14"",""4th-C92_S89"",""4th-C93_S88"",""4th-C94_S96"",""4th-C95_S95"",""4th-C96_S94"",""4th-C9_S13"",""5th-C10_S92"",""5th-C11_S91"",""5th-C12_S90"",""5th-C14_S97"",""5th-C16_S98"",""5th-C18_S96"",""5th-C19_S104"",""5th-C21_S102"",""5th-C25_S108"",""5th-C30_S105"",""5th-C34_S108"",""5th-C35_S109"",""5th-C40_S112"",""5th-C41_S113"",""5th-C42_S114"",""5th-C43_S121"",""5th-C45_S123"",""5th-C47_S118"",""5th-C49_S89"",""5th-C50_S90"",""5th-C54_S89"",""5th-C56_S95"",""5th-C57_S96"",""5th-C58_S93"",""5th-C59_S94"",""5th-C5_S85"",""5th-C64_S99"",""5th-C66_S101"",""5th-C67_S105"",""5th-C68_S106"",""5th-C69_S107"",""5th-C6_S84"",""5th-C70_S103"",""5th-C73_S111"",""5th-C74_S110"",""5th-C77_S107"",""5th-C7_S93"",""5th-C80_S116"",""5th-C81_S115"",""5th-C83_S111"",""5th-C87_S119"",""5th-C88_S117"",""5th-C90_S115"",""5th-C92_S124"",""5th-C9_S92"",""6th-C12_S18"",""6th-C15_S22"",""6th-C16_S29"",""6th-C19_S35"",""6th-C1_S3"",""6th-C20_S34"",""6th-C21_S33"",""6th-C22_S39"",""6th-C23_S38"",""6th-C24_S37"",""6th-C25_S43"",""6th-C28_S49"",""6th-C29_S50"",""6th-C2_S2"",""6th-C30_S51"",""6th-C34_S61"",""6th-C40_S73"",""6th-C42_S75"",""6th-C45_S81"",""6th-C51_S6"",""6th-C53_S10"",""6th-C54_S11"",""6th-C57_S17"",""6th-C58_S20"",""6th-C59_S21"",""6th-C5_S8"",""6th-C62_S25"",""6th-C63_S26"",""6th-C64_S30"",""6th-C65_S31"",""6th-C66_S32"",""6th-C67_S36"",""6th-C71_S41"",""6th-C73_S48"",""6th-C75_S46"",""6th-C77_S53"",""6th-C79_S60"",""6th-C80_S59"",""6th-C82_S66"",""6th-C83_S65"",""6th-C84_S64"",""6th-C85_S72"",""6th-C87_S70"",""6th-C89_S77"",""6th-C90_S76"",""6th-C91_S84"",""6th-C93_S82"",""6th-C9_S12"",""7th-C10_S19"",""7th-C11_S18"",""7th-C12_S17"",""7th-C14_S24"",""7th-C15_S23"",""7th-C17_S29"",""7th-C19_S36"",""7th-C1_S3"",""7th-C20_S35"",""7th-C21_S34"",""7th-C24_S40"",""7th-C25_S43"",""7th-C28_S48"",""7th-C2_S2"",""7th-C31_S53"",""7th-C32_S54"",""7th-C34_S59"",""7th-C36_S61"",""7th-C3_S1"",""7th-C40_S70"",""7th-C44_S76"",""7th-C45_S77"",""7th-C47_S81"",""7th-C49_S4"",""7th-C4_S8"",""7th-C50_S5"",""7th-C52_S9"",""7th-C53_S10"",""7th-C56_S15"",""7th-C57_S16"",""7th-C58_S20"",""7th-C59_S21"",""7th-C60_S22"",""7th-C62_S27"",""7th-C64_S31"",""7th-C65_S32"",""7th-C66_S33"",""7th-C68_S38"",""7th-C69_S39"",""7th-C72_S42"",""7th-C73_S47"",""7th-C76_S52"",""7th-C77_S51"",""7th-C79_S58"",""7th-C80_S57"",""7th-C81_S56"",""7th-C82_S64"",""7th-C84_S62"",""7th-C85_S69"",""7th-C88_S75"",""7th-C89_S74"",""7th-C8_S13"",""7th-C90_S73"",""7th-C94_S83"",""7th-C96_S82"",""8th-C10_S100"",""8th-C11_S99"",""8th-C18_S110"",""8th-C20_S116"",""8th-C21_S115"",""8th-C23_S121"",""8th-C24_S120"",""8th-C26_S126"",""8th-C28_S131"",""8th-C2_S85"",""8th-C34_S141"",""8th-C37_S146"",""8th-C38_S147"",""8th-C44_S155"",""8th-C47_S160"",""8th-C55_S96"",""8th-C57_S98"",""8th-C58_S101"",""8th-C59_S102"",""8th-C61_S107"",""8th-C64_S112"",""8th-C65_S113"",""8th-C67_S117"",""8th-C69_S119"",""8th-C6_S88"",""8th-C70_S123"",""8th-C73_S130"",""8th-C74_S129"",""8th-C75_S128"",""8th-C79_S140"",""8th-C7_S95"",""8th-C84_S143"",""8th-C85_S150"",""8th-C87_S148"",""8th-C90_S152"",""8th-C91_S158"",""8th-C92_S157"",""8th-C94_S162"",""8th-C95_S161"",""9th-C10_S18"",""9th-C11_S17"",""9th-C13_S24"",""9th-C14_S23"",""9th-C15_S22"",""9th-C16_S30"",""9th-C17_S29"",""9th-C19_S36"",""9th-C1_S3"",""9th-C20_S35"",""9th-C22_S41"",""9th-C25_S45"",""9th-C28_S50"",""9th-C29_S51"",""9th-C2_S2"",""9th-C30_S52"",""9th-C36_S59"",""9th-C38_S64"",""9th-C3_S1"",""9th-C40_S69"",""9th-C41_S70"",""9th-C46_S79"",""9th-C49_S4"",""9th-C51_S6"",""9th-C52_S10"",""9th-C57_S16"",""9th-C59_S20"",""9th-C5_S8"",""9th-C61_S25"",""9th-C62_S26"",""9th-C64_S31"",""9th-C65_S32"",""9th-C66_S33"",""9th-C67_S37"",""9th-C69_S39"",""9th-C6_S7"",""9th-C70_S42"",""9th-C74_S48"",""9th-C75_S47"",""9th-C77_S53"",""9th-C79_S57"",""9th-C80_S56"",""9th-C81_S55"",""9th-C83_S61"",""9th-C84_S60"",""9th-C85_S68"",""9th-C86_S67"",""9th-C87_S66"",""9th-C88_S73"",""9th-C89_S72"",""9th-C8_S14"",""9th-C91_S78"",""9th-C92_S77"",""9th-C93_S76"",""9th-C94_S83"",""9th-C96_S81"",""9th-C9_S13"""
0,"ENSG00000229483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
1,"ENSG00000232849,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,"ENSG00000229558,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
3,"ENSG00000232977,0,0,1.64,1.64,0,0.88,0,1.7,2.4..."
4,"ENSG00000227893,22.02,2.04,6.92,5.64,4.47,0,21..."
...,...
26611,"ENSG00000232746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
26612,"ENSG00000150867,1,233,52,252,0,0,940,195,31,0,..."
26613,"ENSG00000255021,5.69,0,0,0,4.06,0,0,0,0,0,0,0,..."
26614,"ENSG00000251576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [9]:
blood_stem_cell = pd.read_table("GSE81682_HTSeq_counts.txt") 
blood_stem_cell

Unnamed: 0,ID,HSPC_007,HSPC_013,HSPC_019,HSPC_025,HSPC_031,HSPC_037,LT-HSC_001,HSPC_001,HSPC_008,...,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
0,ENSMUSG00000000001,0,7,1,185,2,2,136,232,354,...,43,1652,182,500,516,137,267,317,85,676
1,ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSMUSG00000000028,4,1,2,4,3,1,1,2,2,...,5,4,50,401,293,5,596,649,102,457
3,ENSMUSG00000000031,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSMUSG00000000037,0,0,0,0,0,0,0,0,1,...,3,1,0,0,0,0,0,0,1,134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46170,__no_feature,194829,110530,86825,212206,690411,242472,88984,766298,219500,...,390750,470982,260675,569426,580394,471273,421195,337564,200193,857257
46171,__ambiguous,5022,15271,2708,107278,227480,126874,38690,460445,97666,...,188416,422187,218086,401965,388629,296060,317394,253387,136478,396247
46172,__too_low_aQual,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46173,__not_aligned,5820455,1562724,1407254,1810368,6097116,2267894,1019658,5568688,2325621,...,3797989,4347485,2139170,4017762,3821324,4312094,3770447,2549458,1442745,4518455


In [10]:
blood_stem_cell_2 = pd.read_table("GSE72857_umitab.txt")
blood_stem_cell_2

Unnamed: 0,W29953,W29954,W29955,W29956,W29957,W29958,W29959,W29960,W29961,W29962,...,W76327,W76328,W76329,W76330,W76331,W76332,W76333,W76334,W76335,W76336
0610007C21Rik;Apr3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007L01Rik,0,2,1,1,2,0,0,0,1,1,...,0,0,0,0,0,1,1,0,0,0
0610007P08Rik;Rad26l,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,0,0,0,1,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
0610007P22Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rp9,0,0,0,1,0,0,0,2,0,0,...,0,0,1,0,1,4,3,0,0,0
scmh1;Scmh1,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
slc43a2;Slc43a2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tsec-1;Tex9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
