In [56]:
import cellex
import numpy as np # needed for formatting data for this tutorial
import pandas as pd # needed for formatting data for this tutorial
import requests

# Import and prepare data

#### Expression data

In [58]:
# Raw expression table; its unnamed first CSV column holds the gene symbols
# and every other column is one cell barcode.
expression_csv = '/home/cbmr/kzd307/gitte/hippocampus/data/Zhong2.csv'
data = pd.read_csv(expression_csv)

In [59]:
# Promote the CSV's unnamed first column (the gene symbols) to the row index.
# The guard keeps this cell idempotent: the previous rename-then-set_index
# version raised a KeyError when the cell was executed a second time, because
# the column had already been consumed by the first run.
if 'Unnamed: 0' in data.columns:
    data = data.set_index('Unnamed: 0')
elif isinstance(data.index, pd.RangeIndex):
    # Neither the source column nor a previously assigned index is present.
    raise KeyError("expected an 'Unnamed: 0' column holding the gene symbols")
data.index.name = None

In [60]:
# (rows, columns) of the expression table: genes x cell barcodes.
data.shape

(21344, 31951)

In [61]:
# Preview the first rows to confirm the gene symbols now form the index.
data.head()

Unnamed: 0,AAACCTGAGGAGTTGC-1,AAACCTGAGGGTCGAT-1,AAACCTGAGTATCGAA-1,AAACCTGAGTTAAGTG-1,AAACCTGCAATGCCAT-1,AAACCTGCACCCTATC-1,AAACCTGCAGTAAGCG-1,AAACCTGTCACAGGCC-1,AAACCTGTCCACTGGG-1,AAACCTGTCCATGCTC-1,...,TTTGTCAAGGGCACTA-8,TTTGTCAGTAAGGATT-8,TTTGTCAGTCTGCCAG-8,TTTGTCAGTGATGATA-8,TTTGTCATCAACGCTA-8,TTTGTCATCAATACCG-8,TTTGTCATCACCAGGC-8,TTTGTCATCAGAAATG-8,TTTGTCATCCTTAATC-8,TTTGTCATCGCACTCT-8
MIR1302-10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RP11-34P13.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AP006222.2,1,0,2,1,0,0,1,0,0,2,...,1,0,0,0,0,1,0,0,0,0


#### Metadata

In [62]:
# Per-cell cluster annotation that accompanies the expression table.
annotation_csv = '/home/cbmr/kzd307/gitte/hippocampus/data/Zhong2_cluster_annotation.csv'
metadata = pd.read_csv(annotation_csv)

In [63]:
# Index the annotation by the 'V1' column (cell barcodes), discard the CSV's
# unnamed first column and give the label column 'V2' a readable name.
# The guard keeps this cell idempotent: the previous in-place version raised a
# KeyError on 'V1' when the cell was executed a second time.
if 'V1' in metadata.columns:
    metadata = (
        metadata
        .set_index('V1')
        .drop(columns=['Unnamed: 0'])
        .rename(columns={'V2': 'cluster'})
    )
elif 'cluster' not in metadata.columns:
    # Neither the raw layout nor the already-prepared layout was found.
    raise KeyError("expected raw columns 'V1'/'V2' or a prepared 'cluster' column")
metadata.index.name = None

In [64]:
# (rows, columns) of the annotation: one row per cell barcode, one label column.
metadata.shape

(31951, 1)

In [65]:
# Preview the barcode -> cluster assignments.
metadata.head()

Unnamed: 0,cluster
AAACCTGAGGAGTTGC-1,2
AAACCTGAGGGTCGAT-1,1
AAACCTGAGTATCGAA-1,2
AAACCTGAGTTAAGTG-1,2
AAACCTGCAATGCCAT-1,8


### Convert gene IDs to ENSG

In [66]:
# Convert genes from mouse to human
# Sends every gene in the expression index to the g:Profiler orthology endpoint.
r = requests.post(
    url='https://biit.cs.ut.ee/gprofiler/api/orth/orth/',
    json={
        'organism':'mmusculus',
        'target':'hsapiens',
        'query':data.index.tolist(),
    },
    # requests waits indefinitely unless told otherwise; values are the
    # (connect, read) timeouts in seconds.
    timeout=(10, 600),
)
# Surface an HTTP error here instead of as a confusing JSON/KeyError failure
# when the response body is parsed in the next cell.
r.raise_for_status()

In [67]:
# One row per entry of the response's 'result' list.
orth_records = r.json()['result']
human_id = pd.DataFrame(orth_records)

In [68]:
# Keep only the rows whose 'n_result' field equals 1.
has_single_result = human_id["n_result"] == 1
filtered_ids = human_id[has_single_result]
filtered_id_index = filtered_ids.index.tolist()
filtered_ids.shape

(21895, 11)

In [69]:
# Second pass: of those rows, keep the ones whose 'n_converted' field equals 1.
has_single_conversion = filtered_ids["n_converted"] == 1
filtered_ids2 = filtered_ids[has_single_conversion]
filtered_id_index2 = filtered_ids2.index.tolist()
filtered_ids2.shape

(21344, 11)

____________________________________


#### Find missing gene

In [13]:
# Genes that occur in exactly one of the two collections: the expression
# index and the 'incoming' column of the filtered conversion table.
difference = set(data.index) ^ set(filtered_ids2["incoming"])
list_difference = list(difference)
list_difference

['GM1123']

In [15]:
# Is the gene one of the row labels of the expression table?
'GM1123' in data.index

True

In [16]:
# `in` applied directly to a pandas Series tests the index labels, not the
# stored values, so the previous check could never find a gene symbol.
# Compare against the column's values instead.
'GM1123' in filtered_ids2["incoming"].tolist()

False

___

#### Prepare four dataframes:
mouse_data (indexed by mouse gene IDs)
<br>
human_data (indexed by human gene IDs)
<br>
name_data (indexed by gene name)
<br>
original_data (indexed by the original input gene identifiers)

In [70]:
# Attach each gene's expression row to its ID-conversion record. The join key
# is the 'incoming' column on the left and the expression index on the right.
merged_left = filtered_ids2.merge(
    data,
    how='left',
    left_on='incoming',
    right_on=data.index,
)
merged_left.head()

Unnamed: 0,converted,description,incoming,n_converted,n_incoming,name,namespaces,disambiguate,ortholog_ensg,n_result,...,TTTGTCAAGGGCACTA-8,TTTGTCAGTAAGGATT-8,TTTGTCAGTCTGCCAG-8,TTTGTCAGTGATGATA-8,TTTGTCATCAACGCTA-8,TTTGTCATCAATACCG-8,TTTGTCATCACCAGGC-8,TTTGTCATCAGAAATG-8,TTTGTCATCCTTAATC-8,TTTGTCATCGCACTCT-8
0,,,MIR1302-10,1,1,,,False,,1,...,0,0,0,0,0,0,0,0,0,0
1,,,RP11-34P13.7,1,2,,,False,,1,...,0,0,0,0,0,0,0,0,0,0
2,,,RP11-34P13.8,1,3,,,False,,1,...,0,0,0,0,0,0,0,0,0,0
3,,,AL627309.1,1,4,,,False,,1,...,0,0,0,0,0,0,0,0,0,0
4,,,AP006222.2,1,5,,,False,,1,...,1,0,0,0,0,1,0,0,0,0


In [71]:
# Remove genes with no ID: rows whose 'ortholog_ensg' is the literal 'N/A'.
is_unmapped = merged_left['ortholog_ensg'].eq('N/A')
remove_NA = merged_left.index[is_unmapped]
data_full = merged_left.loc[~is_unmapped]

##### Mouse data

In [72]:
# Expression matrix indexed by the 'converted' ID from the conversion table.
# The cell-barcode columns are selected by name (they are exactly the columns
# of `data`) instead of dropping a hard-coded list of g:Profiler fields: that
# list raised a KeyError if a field was absent and let any new response field
# remain in the matrix.
mouse_data = data_full.set_index('converted')[data.columns]
mouse_data.index.name = None

mouse_data.head()

Unnamed: 0,AAACCTGAGGAGTTGC-1,AAACCTGAGGGTCGAT-1,AAACCTGAGTATCGAA-1,AAACCTGAGTTAAGTG-1,AAACCTGCAATGCCAT-1,AAACCTGCACCCTATC-1,AAACCTGCAGTAAGCG-1,AAACCTGTCACAGGCC-1,AAACCTGTCCACTGGG-1,AAACCTGTCCATGCTC-1,...,TTTGTCAAGGGCACTA-8,TTTGTCAGTAAGGATT-8,TTTGTCAGTCTGCCAG-8,TTTGTCAGTGATGATA-8,TTTGTCATCAACGCTA-8,TTTGTCATCAATACCG-8,TTTGTCATCACCAGGC-8,TTTGTCATCAGAAATG-8,TTTGTCATCCTTAATC-8,TTTGTCATCGCACTCT-8
ENSMUSG00000096351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000095567,1,0,0,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
ENSMUSG00000078485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000035692,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
ENSMUSG00000041936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


##### Human data

In [73]:
# Expression matrix indexed by the 'ortholog_ensg' ID from the conversion table.
# The cell-barcode columns are selected by name (they are exactly the columns
# of `data`) instead of dropping a hard-coded list of g:Profiler fields: that
# list raised a KeyError if a field was absent and let any new response field
# remain in the matrix.
# NOTE(review): index uniqueness is not checked here; if several input genes
# share one ortholog ID the index will contain duplicates — confirm.
human_data = data_full.set_index('ortholog_ensg')[data.columns]
human_data.index.name = None

human_data.head()

Unnamed: 0,AAACCTGAGGAGTTGC-1,AAACCTGAGGGTCGAT-1,AAACCTGAGTATCGAA-1,AAACCTGAGTTAAGTG-1,AAACCTGCAATGCCAT-1,AAACCTGCACCCTATC-1,AAACCTGCAGTAAGCG-1,AAACCTGTCACAGGCC-1,AAACCTGTCCACTGGG-1,AAACCTGTCCATGCTC-1,...,TTTGTCAAGGGCACTA-8,TTTGTCAGTAAGGATT-8,TTTGTCAGTCTGCCAG-8,TTTGTCAGTGATGATA-8,TTTGTCATCAACGCTA-8,TTTGTCATCAATACCG-8,TTTGTCATCACCAGGC-8,TTTGTCATCAGAAATG-8,TTTGTCATCCTTAATC-8,TTTGTCATCGCACTCT-8
ENSG00000187634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000188976,1,0,0,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
ENSG00000187961,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000187608,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
ENSG00000188157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


##### Name data

In [74]:
# Expression matrix indexed by the 'name' field from the conversion table.
# The cell-barcode columns are selected by name (they are exactly the columns
# of `data`) instead of dropping a hard-coded list of g:Profiler fields: that
# list raised a KeyError if a field was absent and let any new response field
# remain in the matrix.
name_data = data_full.set_index('name')[data.columns]
name_data.index.name = None

name_data.head()

Unnamed: 0,AAACCTGAGGAGTTGC-1,AAACCTGAGGGTCGAT-1,AAACCTGAGTATCGAA-1,AAACCTGAGTTAAGTG-1,AAACCTGCAATGCCAT-1,AAACCTGCACCCTATC-1,AAACCTGCAGTAAGCG-1,AAACCTGTCACAGGCC-1,AAACCTGTCCACTGGG-1,AAACCTGTCCATGCTC-1,...,TTTGTCAAGGGCACTA-8,TTTGTCAGTAAGGATT-8,TTTGTCAGTCTGCCAG-8,TTTGTCAGTGATGATA-8,TTTGTCATCAACGCTA-8,TTTGTCATCAATACCG-8,TTTGTCATCACCAGGC-8,TTTGTCATCAGAAATG-8,TTTGTCATCCTTAATC-8,TTTGTCATCGCACTCT-8
SAMD11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NOC2L,1,0,0,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
KLHL17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ISG15,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
AGRN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


##### Original data

In [75]:
# Expression matrix indexed by 'incoming', i.e. the identifiers that were sent
# to the conversion service.
# The cell-barcode columns are selected by name (they are exactly the columns
# of `data`) instead of dropping a hard-coded list of g:Profiler fields: that
# list raised a KeyError if a field was absent and let any new response field
# remain in the matrix.
original_data = data_full.set_index('incoming')[data.columns]
original_data.index.name = None

original_data.head()

Unnamed: 0,AAACCTGAGGAGTTGC-1,AAACCTGAGGGTCGAT-1,AAACCTGAGTATCGAA-1,AAACCTGAGTTAAGTG-1,AAACCTGCAATGCCAT-1,AAACCTGCACCCTATC-1,AAACCTGCAGTAAGCG-1,AAACCTGTCACAGGCC-1,AAACCTGTCCACTGGG-1,AAACCTGTCCATGCTC-1,...,TTTGTCAAGGGCACTA-8,TTTGTCAGTAAGGATT-8,TTTGTCAGTCTGCCAG-8,TTTGTCAGTGATGATA-8,TTTGTCATCAACGCTA-8,TTTGTCATCAATACCG-8,TTTGTCATCACCAGGC-8,TTTGTCATCAGAAATG-8,TTTGTCATCCTTAATC-8,TTTGTCATCGCACTCT-8
SAMD11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NOC2L,1,0,0,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
KLHL17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ISG15,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
AGRN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


___
## Create ESObject and compute expression specificity

In [76]:
# Build the CELLEX object from the human-ID expression matrix and the per-cell
# cluster labels. With verbose=True the constructor logs its preprocessing
# steps (non-expressed gene removal, normalization, ANOVA filter), shown below.
eso = cellex.ESObject(data=human_data, annotation=metadata, verbose=True)

Preprocessing - checking input ... input parsed in 0 min 0 sec
Preprocessing - running remove_non_expressed ... excluded 1 / 13914 genes in 0 min 11 sec
Preprocessing - normalizing data ... data normalized in 0 min 16 sec
Preprocessing - running ANOVA ... excluded 1006 / 13913 genes in 0 min 21 sec


In [77]:
# Run the expression-specificity computation. The log below lists the metrics
# computed (DET, EP, GES, NSI) followed by the ESmu and ESsd summaries.
eso.compute(verbose=True)

Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 23 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 5 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 1 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'nsi.esw_s', 'esmu', 'essd'].


In [78]:
# Write the results to disk. Per the log below, the esmu and essd tables are
# saved under `path` as gzipped CSV files whose names start with `file_prefix`.
eso.save_as_csv(path='/home/cbmr/kzd307/gitte/hippocampus/data/Zhong2_cellex_human', file_prefix='Zhong2_hippocampus_mouse_cells', verbose=True)


Saving results as csv to disk ...
  Saved: /home/cbmr/kzd307/gitte/hippocampus/data/Zhong2_cellex_human/Zhong2_hippocampus_mouse_cells.esmu.csv.gz
  Saved: /home/cbmr/kzd307/gitte/hippocampus/data/Zhong2_cellex_human/Zhong2_hippocampus_mouse_cells.essd.csv.gz
Finished saving results to /home/cbmr/kzd307/gitte/hippocampus/data/Zhong2_cellex_human


In [79]:
# Preview the ESmu table: rows are genes (ENSG IDs), columns are cluster labels.
eso.results["esmu"].head()

Unnamed: 0_level_0,1,10,11,12,13,14,15,16,17,18,...,20,21,22,3,4,5,6,7,8,9
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000188976,0.247604,0.048633,0.171437,0.0,0.0,0.0,0.0,0.0,0.0,0.04651,...,0.0,0.0,0.0,0.021631,0.186517,0.0,0.0,0.0,0.0,0.206777
ENSG00000187608,0.0,0.525092,0.0,0.243906,0.0,0.158081,0.147162,0.616548,0.539468,0.402612,...,0.454993,0.278376,0.567356,0.0,0.0,0.0,0.275353,0.0,0.0,0.0
ENSG00000188157,0.005036,0.185603,0.0,0.247268,0.0,0.0,0.0,0.799187,0.0,0.045429,...,0.0,0.379779,0.447114,0.0,0.033493,0.0,0.0,0.0,0.0,0.208066
ENSG00000162571,0.0,0.077331,0.0,0.0,0.0,0.0,0.0,0.0,0.227958,0.318758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000186891,0.241628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.936248,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
# Preview the ESsd table: rows are genes (ENSG IDs), columns are cluster labels.
eso.results["essd"].head()

Unnamed: 0_level_0,1,10,11,12,13,14,15,16,17,18,...,20,21,22,3,4,5,6,7,8,9
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000188976,0.259657,0.084235,0.296937,0.0,0.0,0.0,0.0,0.0,0.0,0.080558,...,0.0,0.0,0.0,0.034219,0.234517,0.0,0.0,0.0,0.0,0.293167
ENSG00000187608,0.0,0.414469,0.0,0.246314,0.0,0.162849,0.153748,0.372347,0.338706,0.403092,...,0.332855,0.278533,0.349679,0.0,0.0,0.0,0.276037,0.0,0.0,0.0
ENSG00000188157,0.008722,0.186624,0.0,0.249114,0.0,0.0,0.0,0.220968,0.0,0.047822,...,0.0,0.380304,0.295461,0.0,0.057269,0.0,0.0,0.0,0.0,0.222571
ENSG00000162571,0.0,0.13394,0.0,0.0,0.0,0.0,0.0,0.0,0.211299,0.294271,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000186891,0.418512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038602,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
