# 3.2 Human GEX Data

In [1]:
# `net` is the clustergrammer2 object used below to load, filter, normalize,
# export and display matrices.
from clustergrammer2 import net
# Registry of every DataFrame in this notebook, keyed by a short dataset name
# (for example 'gex-ini' or 'adt-ini').
df = {}

import numpy as np
import pandas as pd

clustergrammer2 backend version 0.2.7


In [23]:
import matplotlib.pyplot as plt

In [19]:
from copy import deepcopy
from scipy.spatial.distance import pdist
import itertools as it

In [3]:
# Read the gzipped RNA UMI table; the first CSV column becomes the row index.
filename = '../data/CITE-seq_data/GSE100866_CBMC_8K_13AB_10X-RNA_umi_HUMAN.csv.gz'
df['gex-ini'] = pd.read_csv(
    filename,
    index_col=0,
    compression='gzip',
)
df['gex-ini'].shape

(20400, 7339)

In [4]:
# Load the ADT matrix from a text file into clustergrammer2, then export it
# back out as a DataFrame and store it in the registry.
# NOTE(review): the file name ("ashz", "trim", "cats") suggests the values are
# already transformed, trimmed and carry category labels - confirm against the
# notebook that produced the file.
net.load_file('../data/CITE-seq_data/adt_ashz_trim_cats.txt')
df['adt-ini'] = net.export_df()
df['adt-ini'].shape

(10, 7265)

In [5]:
# Each ADT column label is indexed positionally; element 0 is used as the cell
# identifier (presumably the barcode - the labels look like (barcode, category)).
cols = df['adt-ini'].columns.tolist()
keep_cells = [label[0] for label in cols]
print(len(keep_cells))

7265


#### Remove HUMAN prefix from genes

In [6]:
# Strip the species tag only where it is a true prefix: str.replace would also
# delete a 'HUMAN_' occurring later inside a gene name.
species_prefix = 'HUMAN_'
rows = df['gex-ini'].index.tolist()
new_rows = [
    x[len(species_prefix):] if x.startswith(species_prefix) else x
    for x in rows
]
df['gex-ini'].index = new_rows

### Filter for trimmed cells only and arcsinh transform

In [7]:
# Restrict to the cells present in the ADT matrix, then compress the dynamic
# range with arcsinh(x / 5).
df['gex-trim'] = np.arcsinh(df['gex-ini'][keep_cells] / 5)

#### Drop ribosomal and mitochondrial genes, transfer cell-type categories, and normalize by UMI count

In [9]:
print(df['gex-trim'].shape)

# work on an independent copy so 'gex-trim' stays untouched
df['gex'] = df['gex-trim'].copy(deep=True)
all_genes = df['gex'].index.tolist()
print(len(all_genes))

# drop every gene whose name contains 'RPL' or 'RPS' anywhere in it
keep_genes = [
    gene for gene in all_genes
    if 'RPL' not in gene and 'RPS' not in gene
]
print(len(keep_genes))

df['gex'] = df['gex'].loc[keep_genes]
df['gex'].shape

# Removing Mitochondrial Genes
list_mito_genes = [
    'MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
    'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4',
]

all_genes = df['gex'].index.tolist()

# a gene is treated as mitochondrial when its name starts with 'MT-' or when the
# part before the first underscore is one of the names listed above
mito_genes = [
    gene for gene in all_genes
    if gene.startswith('MT-') or gene.split('_')[0] in list_mito_genes
]

print(mito_genes)

keep_genes = [gene for gene in all_genes if gene not in mito_genes]
df['gex'] = df['gex'].loc[keep_genes]
print(df['gex'].shape)

# transfer categories
# map element 0 of every ADT column label to element 1 of the same label
cols = df['adt-ini'].columns.tolist()
ct_dict = {adt_col[0]: adt_col[1] for adt_col in cols}

# relabel each GEX column as (original label, 'Cell Type: <category>')
cols = df['gex'].columns.tolist()
new_cols = [(barcode, 'Cell Type: ' + ct_dict[barcode]) for barcode in cols]
df['gex'].columns = new_cols

# normalize by UMI count: divide every column (cell) by that column's total
# NOTE(review): df['gex'] has already been arcsinh-transformed and gene-filtered,
# so these per-cell totals are sums of transformed values rather than raw UMI
# counts - confirm this is intended.
barcode_umi_sum = df['gex'].sum()
# DataFrame.div returns a new frame, so no extra copy is needed
df['gex-umi'] = df['gex'].div(barcode_umi_sum, axis='columns')

(20400, 7265)
20400
20223
['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6', 'MT-RNR1', 'MT-RNR2', 'MT-TD', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TL1', 'MT-TP', 'MT-TT', 'MT-TW', 'MTRF1', 'MTRF1L', 'MTRNR2L1', 'MTRNR2L10', 'MTRNR2L11', 'MTRNR2L12', 'MTRNR2L3', 'MTRNR2L4', 'MTRNR2L5', 'MTRNR2L6', 'MTRNR2L7', 'MTRNR2L8']
(20187, 7265)


#### Keep top 5K genes by sum

In [10]:
# rank genes by their total across all cells (largest first) and keep the
# first 5000
ser_sum = df['gex'].sum(axis=1).sort_values(ascending=False)
keep_genes = ser_sum.head(5000).index.tolist()
df['gex-filt'] = df['gex'].loc[keep_genes]
df['gex-filt'].shape

(5000, 7265)

In [14]:
# Same top-5000-by-row-sum selection, applied to the per-cell-normalized matrix.
ser_sum = df['gex-umi'].sum(axis=1).sort_values(ascending=False)
keep_genes = ser_sum.index.tolist()[:5000]
# Store under 'gex-filt-umi', the key every later cell reads. The previous key
# 'gex-umi-filt' is kept as an alias to the same frame for backward compatibility.
df['gex-filt-umi'] = df['gex-umi'].loc[keep_genes]
df['gex-umi-filt'] = df['gex-filt-umi']
df['gex-filt-umi'].shape

(5000, 7265)

In [18]:
# sanity check: print the shape of each matrix used in the comparisons below
for dataset_name in ('adt-ini', 'gex-filt', 'gex-filt-umi'):
    print(df[dataset_name].shape)

(10, 7265)
(5000, 7265)
(5000, 7265)


### Compare Sample-Sample Similarity Across Datasets

In [31]:
def corr_datasets(name_1, name_2, metric='cosine'):
    """Print how similar the sample-sample distance structure of two datasets is.

    For each dataset the pairwise distance between every pair of columns is
    computed with ``pdist`` (condensed form). The two condensed distance
    vectors are then compared with ``1 - correlation distance`` and the result
    is printed as ``<name_1> vs <name_2> <value>``.

    Parameters
    ----------
    name_1, name_2 : str
        Keys into the notebook-level ``df`` dictionary. Distances are matched
        by position, so both frames must list the same samples as columns, in
        the same order.
    metric : str, optional
        Metric passed to ``pdist`` for the column-column distances.
        Defaults to ``'cosine'``.

    Raises
    ------
    ValueError
        If the two datasets do not have the same number of columns. Without
        this check ``pd.concat`` would pad the shorter distance vector with
        NaN and the printed value would silently be ``nan``.
    """
    n_cols_1 = df[name_1].shape[1]
    n_cols_2 = df[name_2].shape[1]
    if n_cols_1 != n_cols_2:
        raise ValueError(
            'cannot compare {!r} ({} columns) with {!r} ({} columns): '
            'both datasets must have the same samples as columns'.format(
                name_1, n_cols_1, name_2, n_cols_2))

    # condensed column-column distances for each dataset
    dist_arr_1 = pdist(df[name_1].transpose(), metric=metric)
    ser_dist_1 = pd.Series(data=dist_arr_1, name=name_1)

    dist_arr_2 = pdist(df[name_2].transpose(), metric=metric)
    ser_dist_2 = pd.Series(data=dist_arr_2, name=name_2)
    df_dist = pd.concat([ser_dist_1, ser_dist_2], axis=1)

    # with two rows, pdist returns a single correlation distance; 1 - distance
    # turns it back into a correlation
    inst_corr = 1 - pdist(df_dist.transpose(), metric='correlation')
    print(name_1, 'vs', name_2, inst_corr[0])

#### ADT vs GEX

In [32]:
# Compare the sample-sample distance structure of the ADT matrix with that of
# the top-5K GEX matrix.
corr_datasets('adt-ini', 'gex-filt')

adt-ini vs gex-filt 0.670118808363


#### ADT vs GEX (UMI-normalized)

In [33]:
# Same comparison, against the per-cell-normalized top-5K GEX matrix.
corr_datasets('adt-ini', 'gex-filt-umi')

adt-ini vs gex-filt-umi 0.669929648801


In [30]:
# NOTE(review): this repeats the call made in the previous cell. Its execution
# count (30) precedes the cell that defines corr_datasets (31), so the saved
# output below appears to come from an earlier version of the function.
corr_datasets('adt-ini', 'gex-filt-umi')

0.66992964880080408

##### Make Z-scored versions of the data

In [None]:
def _zscore_rows(source_name, n_top=None):
    """Row-wise z-score ``df[source_name]`` through clustergrammer2.

    The frame is loaded into ``net``; when ``n_top`` is given, ``filter_N_top``
    keeps ``n_top`` rows ranked by variance (``rank_type='var'``) first; the
    rows are then z-scored and the result is exported as a DataFrame.
    Note that this replaces whatever matrix ``net`` was holding.
    """
    net.load_df(df[source_name])
    if n_top is not None:
        net.filter_N_top(inst_rc='row', N_top=n_top, rank_type='var')
    net.normalize(axis='row', norm_type='zscore')
    return net.export_df()


# ADT: the z-scoring step is intentionally disabled, so 'adt-z' is just another
# name for the 'adt-ini' frame (same object, not a copy).
# NOTE(review): presumably the ADT file is already z-scored - confirm.
# Previously:
# net.load_df(df['adt-ini'])
# net.normalize(axis='row', norm_type='zscore')
# df['adt-z'] = net.export_df()
df['adt-z'] = df['adt-ini']

# (output key, source key, number of top-variance rows to keep or None for all)
# Order matches the original cell so `net` ends up holding the same matrix.
zscore_specs = [
    ('gex-5K-z',        'gex-filt',     None),
    ('gex-5K-umi-z',    'gex-filt-umi', None),
    ('gex-5K-1K-z',     'gex-filt',     1000),
    ('gex-5K-1K-umi-z', 'gex-filt-umi', 1000),
    ('gex-5K-1H-z',     'gex-filt',     100),
    ('gex-5K-1H-umi-z', 'gex-filt-umi', 100),
    ('gex-5K-50-z',     'gex-filt',     50),
    ('gex-5K-50-umi-z', 'gex-filt-umi', 50),
]

for out_name, source_name, n_top in zscore_specs:
    df[out_name] = _zscore_rows(source_name, n_top)

In [None]:
# ADT vs z-scored GEX using all 5K genes (plain and per-cell-normalized).
corr_datasets('adt-z', 'gex-5K-z')
corr_datasets('adt-z', 'gex-5K-umi-z')

In [44]:
# ADT vs z-scored GEX restricted to 1000 genes (plain and per-cell-normalized).
corr_datasets('adt-z', 'gex-5K-1K-z')
corr_datasets('adt-z', 'gex-5K-1K-umi-z')

adt-z vs gex-5K-1K-z 0.630417530262
adt-z vs gex-5K-1K-umi-z 0.706515334859


In [45]:
# ADT vs z-scored GEX restricted to 100 genes (plain and per-cell-normalized).
corr_datasets('adt-z', 'gex-5K-1H-z')
corr_datasets('adt-z', 'gex-5K-1H-umi-z')

adt-z vs gex-5K-1H-z 0.685659326402
adt-z vs gex-5K-1H-umi-z 0.729042130795


In [None]:
# ADT vs z-scored GEX restricted to 50 genes (plain and per-cell-normalized).
corr_datasets('adt-z', 'gex-5K-50-z')
corr_datasets('adt-z', 'gex-5K-50-umi-z')

In [42]:
# Load the ADT matrix into clustergrammer2 and display it with net.widget().
# The row filter and z-score steps are left disabled, so the matrix is shown
# as loaded.
net.load_df(df['adt-ini'])
# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# net.normalize(axis='row', norm_type='zscore')
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "CD3", "ini": 10, "clust": 1, "rank": 4, "rankvar": 2, "group":…

In [12]:
# net.load_df(df['gex-cat-filt'])
# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# net.normalize(axis='row', norm_type='zscore')
# net.widget()