# CellGuide data pipeline prototype

This assumes a local snapshot has been downloaded to the same location as this notebook.

Here is a one-liner for downloading the latest prod snapshot:

```
AWS_PROFILE=single-cell-prod aws s3 sync s3://cellxgene-wmg-prod/$(AWS_PROFILE=single-cell-prod aws s3 cp s3://cellxgene-wmg-prod/latest_snapshot_identifier -)/ prod-snapshot/
```

**This file should be in the root folder of the `single-cell-data-portal` repo**.

In [1]:
# Change into the parent directory so the relative paths used below
# (prod-snapshot/..., frontend/..., build_graph_output/...) resolve from there.
cd ../

/home/ubuntu/single-cell-data-portal


In [2]:
from backend.wmg.api.v1 import build_filter_dims_values
from backend.wmg.data.ontology_labels import ontology_term_label, ontology_term_id_labels
from backend.wmg.data.snapshot import WmgSnapshot
from backend.wmg.data.query import WmgFiltersQueryCriteria
import tiledb
import pandas as pd
import json
import numpy as np
import os
import openai
from backend.wmg.data.utils import get_datasets_from_curation_api, get_collections_from_curation_api
from backend.wmg.data.rollup import (
    rollup_across_cell_type_descendants,
    rollup_across_cell_type_descendants_array,
    are_cell_types_colinear,
    _descendants,
)
import glob
import requests
import itertools
from scipy import stats
import owlready2
from pronto import Ontology



In [3]:
sn = "prod-snapshot"


def _read_snapshot_json(name):
    """Parse the JSON file ``name`` inside the snapshot folder, closing the file afterwards."""
    with open(f'{sn}/{name}', 'r') as fh:
        return json.load(fh)


# Assemble the snapshot object directly from the locally downloaded cubes and JSON files.
snapshot = WmgSnapshot(snapshot_identifier="",
    expression_summary_cube=tiledb.open(f'{sn}/expression_summary'),
    marker_genes_cube=tiledb.open(f'{sn}/marker_genes'),
    expression_summary_default_cube=tiledb.open(f'{sn}/expression_summary_default'),
    expression_summary_fmg_cube=tiledb.open(f'{sn}/expression_summary_fmg'),
    cell_counts_cube=tiledb.open(f'{sn}/cell_counts'),
    cell_type_orderings=pd.read_json(f'{sn}/cell_type_orderings.json'),
    primary_filter_dimensions=_read_snapshot_json('primary_filter_dimensions.json'),
    dataset_to_gene_ids=_read_snapshot_json('dataset_to_gene_ids.json'),
    filter_relationships=_read_snapshot_json('filter_relationships.json'))

## Generate all cell types

In [4]:
# Load the latest basic Cell Ontology release (fetched over the network by pronto).
ontology = Ontology("https://github.com/obophenotype/cell-ontology/releases/latest/download/cl-basic.obo")

# List of cell type records (each has at least 'id' and 'label', as used further down).
# The context manager closes the fixture file once it has been parsed.
with open('frontend/src/views/CellCards/common/fixtures/allCellTypes.json', 'r') as fh:
    all_cell_types = json.load(fh)

## Generate cell type descriptions using the OpenAI chat API (GPT-4)

System role:
> You are a knowledgeable cell biologist that has professional experience writing and curating accurate and informative descriptions of cell types.

User role:
> I am making a knowledge-base about cell types. Each cell type is a term from the Cell Ontology and will have its own page with a detailed description of that cell type and its function. Please write me a description for "{cell_type_name}". Please return only the description and no other dialogue. The description should include information about the cell type's function. The description should be at least three paragraphs long.

In [5]:
# Start without any cached descriptions, so every cell type is (re)generated below.
# To reuse a previous run instead, swap in the commented-out json.load of the fixture file.
current_descs = {}#json.load(open('frontend/src/views/CellCards/common/fixtures/allCellTypeDescriptions.json','r'))

In [6]:
import time

# Credentials are read from the environment so that no secret is stored in the notebook.
# NOTE(review): a key that was previously committed here should be treated as leaked and revoked.
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
openai.api_key = os.environ["OPENAI_API_KEY"]  # fail fast with KeyError if the key is not configured
# Cheap authenticated call: raises immediately if the credentials are rejected.
openai.Model.list()

def func(cname):
    """Ask the chat model for a multi-paragraph description of one cell type.

    Parameters:
        cname: label of the cell type, interpolated into the user prompt.

    Returns:
        The text of the first completion choice, or '' when anything goes
        wrong (the error is printed, not raised).
    """
    print(cname)
    # Brief pause before every request.
    time.sleep(0.1)

    try:
        system_turn = {"role": "system", "content": "You are a knowledgeable cell biologist that has professional experience writing and curating accurate and informative descriptions of cell types."}
        user_turn = {"role": "user", "content": f"I am making a knowledge-base about cell types. Each cell type is a term from the Cell Ontology and will have its own page with a detailed description of that cell type and its function. Please write me a description for \"{cname}\". Please return only the description and no other dialogue. The description should include information about the cell type's function. The description should be at least three paragraphs long."}
        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[system_turn, user_turn],
        )
        return completion['choices'][0]['message']['content']
    except Exception as err:
        print(err)
        return ''

In [15]:
# Reuse the results of a previous run of the generation cell when they exist.
# `all_cell_type_descriptions` is only created further down, so on a fresh kernel
# this falls back to the current cache instead of raising NameError.
current_descs = globals().get('all_cell_type_descriptions', current_descs)

In [16]:
# Keep only non-empty descriptions; ids whose description is '' will be requested again.
current_descs = {cell_id: text for cell_id, text in current_descs.items() if text != ''}

In [18]:
import requests
import concurrent.futures

all_cell_type_descriptions = {}
z=0
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit one request per cell type that has no cached description,
    # remembering which cell type id each future belongs to.
    futures = {
        executor.submit(func, cell_type['label']): cell_type['id']
        for cell_type in all_cell_types
        if cell_type['id'] not in current_descs
    }

    for future in concurrent.futures.as_completed(futures):
        cid = futures[future]

        # Coarse progress indicator: print every 10th completed request.
        if z%10==0:
            print(z)
        z+=1

        try:
            all_cell_type_descriptions[cid] = future.result()
        except Exception as e:
            # Report the failure instead of dropping it silently; the id is left
            # out of the result so it is retried on the next run.
            print(f"Failed to generate a description for {cid}: {e}")

# Cached descriptions take precedence over freshly generated ones for the same id.
all_cell_type_descriptions.update(current_descs)

professional antigen presenting cell
chromaffin cell
0


In [20]:
# Write through a context manager so the file is flushed and closed deterministically.
with open('build_graph_output/allCellTypeDescriptions.json', 'w') as fh:
    json.dump(all_cell_type_descriptions, fh)

## Generate source data per cell type

In [None]:
# Reference URL kept as a note (a bare URL is not valid Python and breaks "Run All"):
# http://purl.org/ccf/ccf.owl

In [20]:
# Set DEPLOYMENT_STAGE for this kernel before the backend helpers below are called.
# NOTE(review): presumably read by the curation API helpers - confirm.
%env DEPLOYMENT_STAGE=test

def get_title_from_doi(doi):
    """Look up the title of a publication on Crossref.

    Parameters:
        doi: DOI string appended to the Crossref ``works`` endpoint.

    Returns:
        The first title found in the response, or ``doi`` unchanged when the
        request does not return HTTP 200 or no title is present in either of
        the two response layouts that are tried.
    """
    url = f"https://api.crossref.org/works/{doi}"

    response = requests.get(url)
    if response.status_code != 200:
        return doi

    data = response.json()

    # Only missing or oddly shaped fields are treated as "no title"; anything else
    # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
    lookup_errors = (KeyError, IndexError, TypeError)
    try:
        return data['message']['title'][0]
    except lookup_errors:
        pass

    # Fallback layout: the message wraps a list of items.
    try:
        return data['message']['items'][0]['title'][0]
    except lookup_errors:
        return doi
    
def format_citation(metadata):
    """Build a short "<first author> et al. (<year>) <journal>" citation string.

    Parameters:
        metadata: mapping with a 'publisher_metadata' entry that provides
            'authors' (non-empty list), 'journal' and 'published_year'.

    Returns:
        The formatted citation. The first author is rendered as
        "family, given" when a 'family' key is present, otherwise by 'name'.
    """
    lead_author = metadata['publisher_metadata']['authors'][0]
    if "family" not in lead_author:
        byline = f"{lead_author['name']} et al."
    else:
        byline = f"{lead_author['family']}, {lead_author['given']} et al."

    journal = metadata['publisher_metadata']['journal']
    year = metadata['publisher_metadata']['published_year']

    return f"{byline} ({year}) {journal}"

snapshot.build_dataset_metadata_dict()

# Fetch dataset and collection records from the curation API.
datasets = get_datasets_from_curation_api()
collections = get_collections_from_curation_api()

# Index both record lists by their id for direct lookup later on.
collections_dict = dict((record['collection_id'], record) for record in collections)
datasets_dict = dict((record['dataset_id'], record) for record in datasets)

env: DEPLOYMENT_STAGE=test


In [29]:
cts = [i['id'] for i in all_cell_types]


def _append_unique_terms(existing, new_terms):
    """Append terms from ``new_terms`` to ``existing`` unless their 'ontology_term_id' is already present."""
    known_ids = {term['ontology_term_id'] for term in existing}
    for term in new_terms:
        if term['ontology_term_id'] not in known_ids:
            known_ids.add(term['ontology_term_id'])
            existing.append(term)
    return existing


# Maps each cell type id to the list of collections (with merged tissue / disease /
# organism terms) that contain the cell type or one of its descendants.
DATA = {}

for i in cts:
    lineage = _descendants(i)
    assert i in lineage

    # Collect every dataset reported for this lineage, once per dataset id.
    # A separate name is used so the `datasets` list fetched from the curation API is not overwritten.
    seen_datasets = set()
    cell_type_datasets = []
    # The keys of 'tissue_terms' are passed as the organism id of each query.
    for organism in snapshot.primary_filter_dimensions['tissue_terms']:
        criteria = WmgFiltersQueryCriteria(organism_ontology_term_id=organism,
                                           cell_type_ontology_term_ids=lineage)
        res = build_filter_dims_values(criteria, snapshot)
        for datum in res['datasets']:
            if datum['id'] not in seen_datasets and datum['collection_id'] != '':
                seen_datasets.add(datum['id'])
                cell_type_datasets.append(datum)

    # Group the datasets by collection, merging their metadata terms.
    collections_to_datasets = {}
    for datum in cell_type_datasets:
        dataset = datasets_dict[datum['id']]
        collection_id = dataset['collection_id']
        collection = collections_dict[collection_id]

        a = collections_to_datasets.setdefault(collection_id, {})
        a['collection_name'] = collection['name']
        a['collection_url'] = collection['collection_url']
        a['publication_url'] = collection['doi']
        if collection['publisher_metadata']:
            a['publication_title'] = format_citation(collection)
        else:
            a['publication_title'] = "Publication"

        a['tissue'] = _append_unique_terms(a.get("tissue", []), dataset['tissue'])
        a['disease'] = _append_unique_terms(a.get("disease", []), dataset['disease'])
        a['organism'] = _append_unique_terms(a.get("organism", []), dataset['organism'])

    DATA[i] = list(collections_to_datasets.values())

# Context manager guarantees the output file is flushed and closed.
with open('build_graph_output/allSourceData.json', 'w') as fh:
    json.dump(DATA, fh)

## Generate enriched genes and expression metrics per cell type

In [123]:
import time
from numba import njit, prange
import numpy as np

@njit(parallel=True)
def nanpercentile_2d(arr, percentile, axis):
    """
    Calculate the specified percentile of a 2D array along an axis, ignoring NaN values.

    Each column (axis == 0) or each row (any other axis value) is reduced on its own
    by ``nanpercentile``; the per-slice loop uses numba's ``prange``.

    Parameters:
        arr: 2D array to calculate percentile of
        percentile: percentile to calculate, as a number between 0 and 100
        axis: axis along which to calculate percentile (0 reduces over rows and
            yields one value per column; any other value yields one value per row)

    Returns:
        1D array with the specified percentile of each column/row. A slice that
        contains only NaN values yields NaN.
    """
    if axis == 0:
        # One result per column.
        result = np.empty(arr.shape[1])
        for i in prange(arr.shape[1]):
            arr_column = arr[:, i]
            result[i] = nanpercentile(arr_column, percentile)
        return result
    else:
        # One result per row.
        result = np.empty(arr.shape[0])
        for i in prange(arr.shape[0]):
            arr_row = arr[i, :]
            result[i] = nanpercentile(arr_row, percentile)
        return result

@njit
def nanpercentile(arr, percentile):
    """Return the given percentile (0-100) of a 1D array after dropping NaN entries.

    Returns NaN when no non-NaN entries remain.
    """
    finite_values = arr[~np.isnan(arr)]
    if finite_values.size == 0:
        return np.nan
    return np.percentile(finite_values, percentile)

def _run_ttest(sum1, sumsq1, n1, sum2, sumsq2, n2):
    with np.errstate(divide="ignore", invalid="ignore"):
        mean1 = sum1 / n1
        meansq1 = sumsq1 / n1

        mean2 = sum2 / n2
        meansq2 = sumsq2 / n2

        var1 = meansq1 - mean1**2
        var1[var1 < 0] = 0
        var2 = meansq2 - mean2**2
        var2[var2 < 0] = 0

        var1_n = var1 / n1
        var2_n = var2 / n2
        sum_var_n = var1_n + var2_n
        dof = sum_var_n**2 / (var1_n**2 / (n1 - 1) + var2_n**2 / (n2 - 1))
        tscores = (mean1 - mean2) / np.sqrt(sum_var_n)
        effects = (mean1 - mean2) / np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 1))

    pvals = stats.t.sf(tscores, dof)
    return pvals, effects

def _post_process_stats(
    cell_type_target,
    cell_types_context,
    genes,
    pvals,
    effects,
    percentile=0.05
):
    """Aggregate per-context-cell-type statistics into one entry per gene.

    Parameters:
        cell_type_target: cell type being compared against the context.
        cell_types_context: cell types corresponding to the rows of ``pvals``/``effects``.
        genes: gene identifiers corresponding to the columns of ``pvals``/``effects``.
        pvals: 2D array of p-values (rows: context cell types, columns: genes).
            Modified in place.
        effects: 2D array of effect sizes with the same layout. Modified in place.
        percentile: fraction (0-1); the effect size reported per gene is this
            percentile of its column, ignoring NaN.

    Returns:
        dict mapping gene -> {"p_value": ..., "effect_size": ...}, ordered by
        decreasing effect size. Genes whose aggregated effect size or p-value
        is NaN are omitted.
    """
    # Rows for context cell types that are colinear with the target are excluded.
    is_colinear = np.array(
        [are_cell_types_colinear(cell_type, cell_type_target) for cell_type in cell_types_context]
    )
    effects[is_colinear] = np.nan
    pvals[is_colinear] = np.nan

    # Columns with no usable comparison at all get neutral values (p=1, effect=0).
    pvals[:, np.all(np.isnan(pvals), axis=0)] = 1
    effects[:, np.all(np.isnan(effects), axis=0)] = 0

    # Aggregate effect sizes per gene; an aggregated effect of exactly 0 is marked as missing.
    effects = nanpercentile_2d(effects, percentile * 100, 0)
    effects[effects == 0] = np.nan

    # Per gene, take the p-value at the 5% position of the ascending sort over rows
    # (this fraction is fixed at 0.05 and does not follow `percentile`).
    pvals = np.sort(pvals, axis=0)[int(np.round(0.05 * pvals.shape[0]))]

    # Sort everything by decreasing effect size (NaN effects sort last).
    order = np.argsort(-effects)
    markers = np.array(genes)[order]
    p = pvals[order]
    effects = effects[order]

    results = {}
    for marker, pi, ei in zip(markers, p, effects):
        # `x is not np.nan` is an identity test and is always True for values read
        # out of a numpy array, so NaN must be detected with np.isnan.
        if not np.isnan(ei) and not np.isnan(pi):
            results[marker] = {"p_value": pi, "effect_size": ei}
    return results

In [18]:
# Flatten the list of single-entry {organism_id: name} dicts into one lookup.
organism_id_to_name = {}
for organism_term in snapshot.primary_filter_dimensions['organism_terms']:
    organism_id_to_name.update(organism_term)

# Must be defined before the filters below use it (it used to be assigned after
# its first use, which raises NameError on a fresh kernel).
all_cell_type_ids = [i['id'] for i in all_cell_types]

# Total cell counts per (organism, cell type).
cell_counts_df = snapshot.cell_counts_cube.df[:]
cell_counts_df = cell_counts_df.groupby(['organism_ontology_term_id','cell_type_ontology_term_id']).sum(numeric_only=True)

# Expression statistics summed per (organism, cell type, gene), restricted to known cell types.
expressions_df = snapshot.expression_summary_fmg_cube.df[:]
expressions_df = expressions_df.groupby(['organism_ontology_term_id','cell_type_ontology_term_id','gene_ontology_term_id']).sum(numeric_only=True)
expressions_df = expressions_df.reset_index()
expressions_df = expressions_df[expressions_df['cell_type_ontology_term_id'].isin(all_cell_type_ids)]

cell_counts_df = cell_counts_df[cell_counts_df.index.get_level_values('cell_type_ontology_term_id').isin(all_cell_type_ids)]

# Build the full organism x cell type grid, zero-filled, then copy in the observed counts.
index = pd.Index(list(itertools.product(cell_counts_df.index.get_level_values('organism_ontology_term_id').unique(),all_cell_type_ids)))
index = index.set_names(['organism_ontology_term_id','cell_type_ontology_term_id'])
universe_cell_counts_df = pd.DataFrame(index = index)
universe_cell_counts_df['n_cells']=0
# Single .loc assignment instead of chained indexing, which may write to a temporary copy.
universe_cell_counts_df.loc[cell_counts_df.index, 'n_cells'] = cell_counts_df['n_cells']

# Roll counts up the cell type hierarchy and keep only combinations that have cells.
universe_cell_counts_df = rollup_across_cell_type_descendants(
    universe_cell_counts_df.reset_index()
)
universe_cell_counts_df=universe_cell_counts_df[universe_cell_counts_df['n_cells']>0]
universe_cell_counts_df = universe_cell_counts_df.groupby(['organism_ontology_term_id','cell_type_ontology_term_id']).sum()

In [19]:
# Distinct cell type ids present in the index of universe_cell_counts_df.
a = list(set(universe_cell_counts_df.index.get_level_values('cell_type_ontology_term_id')))

In [20]:
# Keep only the cell type records whose id appears in `a`.
all_cell_types = list(filter(lambda record: record['id'] in a, all_cell_types))

In [21]:
# Write through a context manager so the file is flushed and closed deterministically.
with open('build_graph_output/allCellTypes.json', 'w') as fh:
    json.dump(all_cell_types, fh)

In [None]:
# Row key of every expression record: an (organism, cell type) tuple; column key: the gene id.
x = list(zip(*expressions_df[['organism_ontology_term_id','cell_type_ontology_term_id']].values.T))
y = list(expressions_df['gene_ontology_term_id'])

# Distinct row / column keys (set order is arbitrary but used consistently below).
xu = list(set(x))
yu = list(set(y))

# Lookups from key -> integer position in the dense matrices.
x_index = pd.Series(index=pd.Index(xu),data=np.arange(len(xu)))
y_index = pd.Series(index=pd.Index(yu),data=np.arange(len(yu)))

# Dense (row key x gene) matrices, zero where no record exists.
e_nnz = np.zeros((len(xu),len(yu)))
e_sum = np.zeros((len(xu),len(yu)))
e_sqsum = np.zeros((len(xu),len(yu)))

# Scatter the long-format columns into the matrices.
e_nnz[x_index[x].values, y_index[y].values] = expressions_df['nnz']
e_sum[x_index[x].values, y_index[y].values] = expressions_df['sum']
e_sqsum[x_index[x].values, y_index[y].values] = expressions_df['sqsum']

# (organism, cell type) pairs that have cell counts but no expression rows of their own.
available_combinations = set(list(universe_cell_counts_df.index.values))
missing_combinations = available_combinations.difference(xu)

# Append those pairs as extra all-zero rows and rebuild the row lookup to match.
xu = xu + list(missing_combinations)
x_index = pd.Series(index=pd.Index(xu),data=np.arange(len(xu)))

e_nnz = np.vstack((e_nnz,np.zeros((len(missing_combinations),e_nnz.shape[1]))))
e_sum = np.vstack((e_sum,np.zeros((len(missing_combinations),e_sum.shape[1]))))
e_sqsum = np.vstack((e_sqsum,np.zeros((len(missing_combinations),e_sqsum.shape[1]))))

# Per-row organism and cell type labels, aligned with the matrix rows.
organisms = x_index.index.get_level_values(0)
organisms_u = list(set(organisms))
cell_types = x_index.index.get_level_values(1)

# Output buffers filled per organism by the marker-gene cell below.
e_nnz_rollup = np.zeros_like(e_nnz)
e_sum_rollup = np.zeros_like(e_sum)
e_sqsum_rollup = np.zeros_like(e_sqsum)

# Cell count of each row, repeated across all gene columns so it has the matrices' shape.
n_cells = universe_cell_counts_df['n_cells'][x_index.index]
n_cells = np.tile(n_cells.values[:,None],(1,e_nnz_rollup.shape[1]))

In [124]:
# Compute marker-gene statistics for every (organism, cell type) row and, as a side
# product, the rolled-up expression matrices in long format.
all_results = []
for organism in organisms_u:
    # Restrict all matrices to the rows of this organism.
    cell_types_o = cell_types[organisms==organism]
    e_nnz_o = e_nnz[organisms==organism]
    e_sum_o = e_sum[organisms==organism]    
    e_sqsum_o = e_sqsum[organisms==organism]        
    n_cells_o = n_cells[organisms==organism]
    
    # Roll the statistics up across cell type descendants (helper imported from backend.wmg.data.rollup).
    e_nnz_o = rollup_across_cell_type_descendants_array(e_nnz_o,cell_types_o)
    e_sum_o = rollup_across_cell_type_descendants_array(e_sum_o,cell_types_o)  
    e_sqsum_o = rollup_across_cell_type_descendants_array(e_sqsum_o,cell_types_o)  
    
    # Store the rolled-up rows back into the global buffers.
    e_nnz_rollup[organisms==organism]=e_nnz_o
    e_sum_rollup[organisms==organism]=e_sum_o
    e_sqsum_rollup[organisms==organism]=e_sqsum_o 
    
    # NOTE(review): i_range is not used anywhere below.
    i_range = np.arange(e_sum_o.shape[0])
    for i in range(e_sum_o.shape[0]):
        # Progress output: one line per target cell type.
        print(i)
        # Target row as a (1, n_genes) array so it broadcasts against all context rows.
        sum1 = e_sum_o[i][None,:].copy()
        sumsq1 = e_sqsum_o[i][None,:].copy()
        n1 = n_cells_o[i][None,:].copy()


        # Test the target cell type against every cell type of this organism at once.
        pvals, effects = _run_ttest(sum1,sumsq1,n1,
                   e_sum_o, e_sqsum_o, n_cells_o)

        # The comparison of the target with itself carries no information.
        pvals[i] = np.nan
        effects[i] = np.nan
     
        # Aggregate over context cell types into one (p_value, effect_size) per gene.
        res = _post_process_stats(
            cell_types_o[i],
            cell_types_o,
            y_index.index.values,
            pvals,
            effects,
            percentile=0.05
        )

        # Convert the gene -> stats dict into a DataFrame indexed by gene.
        res2 = pd.DataFrame()
        res2.index = pd.Index([k for k in res])
        res2['p_value'] = [res[k]['p_value'] for k in res]
        res2['effect_size'] = [res[k]['effect_size'] for k in res]
        res = res2     

        res['cell_type_ontology_term_id'] = cell_types_o[i]
        res['organism_ontology_term_id'] = organism
        res['gene_ontology_term_id'] = res.index
        res = res.reset_index(drop=True)
        # Keep only genes with a defined, positive effect size.
        res = res[res['effect_size'].notnull()]
        res = res[res['effect_size']>0]        
        all_results.append(res)
        
# Positions where the rolled-up nnz + sum is non-zero.
x_new,y_new = (e_nnz_rollup+e_sum_rollup).nonzero()

# Reverse lookups: integer position -> row key / gene id.
r_x_index = pd.Series(index=x_index.values,data=x_index.index.values)
r_y_index = pd.Series(index=y_index.values,data=y_index.index.values)

y_r_new = r_y_index[y_new].values

x_r_new = r_x_index[x_new].values

# (organism, cell type) + (gene,) -> 3-tuple keys for the long-format frame.
new_index = pd.Index([i+(j,) for i,j in zip(x_r_new,y_r_new)])

nnz_flat = e_nnz_rollup[x_new,y_new]
sum_flat = e_sum_rollup[x_new,y_new]

new_index = new_index.set_names(['organism_ontology_term_id','cell_type_ontology_term_id','gene_ontology_term_id'])
new_expression_rollup = pd.DataFrame(index=new_index)

new_expression_rollup['nnz']=nnz_flat
new_expression_rollup['sum']=sum_flat

new_expression_rollup=new_expression_rollup.reset_index()

# Combine all per-cell-type results and keep only entries with p_value below 1e-5.
markers_df = pd.concat(all_results,axis=0)
markers_df=markers_df[markers_df['p_value']<1e-5]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
23

In [146]:
dims = ["organism_ontology_term_id","cell_type_ontology_term_id","gene_ontology_term_id"]
# Collapse both frames to one row per (organism, cell type, gene); `first` keeps the
# first non-null value of each column within a group.
markers_df, new_expression_rollup = (
    frame.groupby(dims).first() for frame in (markers_df, new_expression_rollup)
)

In [155]:
# Add the columns of markers_df to the rolled-up expression rows, matching on `dims`.
# DataFrame.join defaults to a left join, so rows without a marker entry get NaN.
new_expression_rollup = new_expression_rollup.join(markers_df,on=dims)

In [158]:
# Turn the `dims` index levels back into ordinary columns on both frames.
markers_df = markers_df.reset_index()
new_expression_rollup = new_expression_rollup.reset_index()

In [None]:
# For every (organism, cell type) keep the 100 marker rows with the largest effect size.
top_per_group = markers_df.groupby(['organism_ontology_term_id','cell_type_ontology_term_id']).apply(lambda x: x.nlargest(100, 'effect_size'))

# (organism, cell type, gene) tuples of the selected markers.
columns = ['organism_ontology_term_id','cell_type_ontology_term_id','gene_ontology_term_id']
o_ct_genes = list(zip(*top_per_group[columns].values.T))


# Pre-filter the expression rows to cell types and genes that occur among the selected markers.
filt1 = new_expression_rollup['cell_type_ontology_term_id'].isin(top_per_group['cell_type_ontology_term_id'].unique())

filt2 = new_expression_rollup['gene_ontology_term_id'].isin(top_per_group['gene_ontology_term_id'].unique())

filt = np.logical_and(filt1,filt2)

new_expression_rollup = new_expression_rollup[filt]

# Index by (organism, cell type, gene) tuples so rows can be looked up with the tuples in o_ct_genes.
new_expression_rollup.index = pd.Index(list(zip(*new_expression_rollup[['organism_ontology_term_id','cell_type_ontology_term_id','gene_ontology_term_id']].values.T)))

# Reduce to a Series of n_cells keyed by (organism, cell type).
# NOTE(review): this rebinds universe_cell_counts_df from a DataFrame to a Series, so
# re-running this cell without re-running the earlier ones will likely fail.
universe_cell_counts_df = universe_cell_counts_df.groupby(['organism_ontology_term_id','cell_type_ontology_term_id']).sum()['n_cells']

# Gene id -> symbol lookup collected over every key of 'gene_terms'.
gene_id_to_symbol={}
all_genes = []
for k in snapshot.primary_filter_dimensions['gene_terms']:
    for i in snapshot.primary_filter_dimensions['gene_terms'][k]:
        gene_id_to_symbol.update(i)
        all_genes.append(list(i.keys())[0])

# Gene id -> description lookup read from a local TSV file.
gene_id_to_name = pd.read_csv('ensembl_gene_ids_to_descriptions.tsv.gz',sep='\t')

gene_id_to_name = gene_id_to_name.set_index('Ensembl GeneIDs')['Description'].to_dict()

# Build, per cell type, the list of enriched gene records.
data={}
for i in o_ct_genes:
    o,ct,gene = i
    
    nnz = new_expression_rollup['nnz'][i]
    s = new_expression_rollup['sum'][i]
    es = new_expression_rollup['effect_size'][i]
    # NOTE(review): rebinds the name n_cells, which an earlier cell uses for a 2D array.
    n_cells = universe_cell_counts_df[(o,ct)]
    
    a = data.get(ct,[])
    a.append({
        # 'me': sum divided by nnz (0 when nnz is 0); 'pc': nnz divided by the cell count.
        'me': s/nnz if nnz > 0 else 0,
        'pc': nnz/n_cells,
        'marker_score': es,
        'symbol': gene_id_to_symbol[gene],
        # Fall back to the symbol when no description is available for the gene id.
        'name': gene_id_to_name.get(gene,gene_id_to_symbol[gene]),
        'organism': organism_id_to_name[o]
    })
    data[ct]=a


# Write through a context manager so the file is flushed and closed deterministically.
with open('build_graph_output/allEnrichedGenes.json', 'w') as fh:
    json.dump(data, fh)

## Generate canonical marker genes data

In [None]:
from io import StringIO
def get_asctb_file(file):
    """Download the CSV table referenced by a CCF release markdown page.

    Parameters:
        file: path of the markdown page relative to the release's ``markdown/`` folder.

    Returns:
        DataFrame parsed from the linked CSV (first 10 lines skipped), or None
        when the page is not found in any of the releases v1.4 down to v1.1.
    """
    # Try the newest release first and stop at the first one that has the page.
    for minor in (4, 3, 2, 1):
        page = requests.get(f"https://hubmapconsortium.github.io/ccf-releases/v1.{minor}/markdown/{file}")
        if page.status_code == 200:
            break
    else:
        return None

    # The CSV link follows the 'Data Table:** |[' marker on the page.
    after_marker = page.content.decode().split('Data Table:** |[')[-1]
    url_stem = after_marker.split('.csv')[0].split('https://')[-1]
    table = requests.get('https://' + url_stem + '.csv')
    return pd.read_csv(StringIO(table.content.decode()), skiprows=10)

def get_all_prefix_cols(prefix, cols):
    """Collect the consecutively numbered columns ``<prefix>1``, ``<prefix>2``, ...

    For each number the exact name is tried first, then its upper-case and
    lower-case forms; the spelling found in ``cols`` is the one returned.
    Collection stops at the first number with no match.
    """
    found = []
    number = 1
    while True:
        name = f"{prefix}{number}"
        match = next(
            (variant for variant in (name, name.upper(), name.lower()) if variant in cols),
            None,
        )
        if match is None:
            return found
        found.append(match)
        number += 1

def get_all_suffix_cols(prefix,suffix, cols):
    """Collect the consecutively numbered columns ``<prefix>1<suffix>``, ``<prefix>2<suffix>``, ...

    For each number the exact name is tried first, then its upper-case and
    lower-case forms; the spelling found in ``cols`` is the one returned.
    Collection stops at the first number with no match.
    """
    found = []
    number = 1
    while True:
        name = f"{prefix}{number}{suffix}"
        match = next(
            (variant for variant in (name, name.upper(), name.lower()) if variant in cols),
            None,
        )
        if match is None:
            return found
        found.append(match)
        number += 1

def get_gene_name(gene):
    """Return the 'name' reported by the gene_info service for ``gene``.

    Falls back to returning ``gene`` itself when the service does not answer
    with HTTP 200.
    """
    reply = requests.get(f"https://api.cellxgene.dev.single-cell.czi.technology/gene_info/v1/gene_info?gene={gene}")
    if reply.status_code != 200:
        return gene
    return reply.json()['name']
    
def try_delete(d, k):
    """Remove key ``k`` from dict ``d`` if present, tolerating a differently cased spelling.

    When ``k`` itself is absent, the variant that keeps the first character and
    lower-cases the rest (e.g. 'BGene/1' -> 'Bgene/1') is removed instead, if
    present. Missing keys are ignored; nothing is returned.
    """
    if k in d:
        del d[k]
        return
    if not k:
        # An empty key has no alternative spelling to try.
        return
    # Explicit membership handling replaces the former bare `except:` blocks, which
    # would also have hidden unrelated errors.
    d.pop(k[0] + k[1:].lower(), None)
    
def get_title_and_citation_from_doi(doi):
    """Look up title and a short citation for a DOI on Crossref.

    Parameters:
        doi: DOI string appended to the Crossref ``works`` endpoint.

    Returns:
        "<title>\\n\\n - <citation>" where the citation comes from
        ``format_citation_mg``; or ``doi`` unchanged when the request does not
        return HTTP 200 or the needed fields are missing in both response
        layouts that are tried.
    """
    url = f"https://api.crossref.org/works/{doi}"

    response = requests.get(url)
    if response.status_code != 200:
        return doi

    data = response.json()

    # Only missing or oddly shaped fields count as "not found"; other errors
    # (e.g. KeyboardInterrupt) are no longer swallowed by a bare except.
    lookup_errors = (KeyError, IndexError, TypeError)
    try:
        title = data['message']['title'][0]
        citation = format_citation_mg(data['message'])
    except lookup_errors:
        # Fallback layout: the message wraps a list of items.
        try:
            title = data['message']['items'][0]['title'][0]
            citation = format_citation_mg(data['message']['items'][0])
        except lookup_errors:
            return doi
    return f"{title}\n\n - {citation}"
    
def format_citation_mg(message):
    """Build a short "<first author> et al. (<year>) <journal>" citation string.

    Parameters:
        message: mapping providing 'author' (non-empty list), 'container-title'
            (non-empty list) and 'created' -> 'date-parts' (nested list whose
            first element starts with the year).

    Returns:
        The formatted citation. The first author is rendered as
        "family, given" when a 'family' key is present, otherwise by 'name'.
    """
    lead_author = message['author'][0]
    if "family" not in lead_author:
        byline = f"{lead_author['name']} et al."
    else:
        byline = f"{lead_author['family']}, {lead_author['given']} et al."

    journal = message['container-title'][0]
    year = message['created']['date-parts'][0][0]

    return f"{byline} ({year}) {journal}"

def get_tissue_name(t):
    """Resolve an ontology term id to its label via the EBI OLS4 API.

    The id's ':' is replaced by '_' and the term is looked up in several
    ontologies in turn; the 'label' of the first HTTP 200 answer is returned.
    When no lookup succeeds, the underscore form of the id is returned.
    """
    t = t.replace(':', '_')
    candidate_urls = (
        f"https://www.ebi.ac.uk/ols4/api/ontologies/clo/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F{t}",
        f"https://www.ebi.ac.uk/ols4/api/ontologies/envo/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F{t}",
        f"https://www.ebi.ac.uk/ols4/api/ontologies/flopo/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F{t}",
        f"https://www.ebi.ac.uk/ols4/api/ontologies/doid/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252F{t}",
    )
    for candidate in candidate_urls:
        reply = requests.get(candidate)
        if reply.status_code != 200:
            continue
        return reply.json()['label']
    return t

# Primary filter dimensions of the snapshot; the context manager closes the file after parsing.
with open('prod-snapshot/primary_filter_dimensions.json', 'r') as fh:
    pf = json.load(fh)
# Values of the single-entry dicts listed under the human taxon id.
all_human_genes = [list(i.values())[0] for i in pf['gene_terms']['NCBITaxon:9606']]

# Distinct tissue ids (original and general) present in the cell counts cube.
X = tiledb.open('prod-snapshot/cell_counts/')
cc = X.df[:]
tissue_original = list(set(cc['tissue_original_ontology_term_id']))
tissue = list(set(cc['tissue_ontology_term_id']))

# Tissue id -> label lookup; ids without a known label map to themselves.
m = {}
for tissue_term in pf['tissue_terms']['NCBITaxon:9606']:
    m.update(tissue_term)
for i in tissue:
    m.setdefault(i, i)

for i in tissue_original:
    m.setdefault(i, i)


# Scrape the release index page for links to ASCT+B table pages and derive the
# matching markdown file names.
a = requests.get('https://hubmapconsortium.github.io/ccf-releases/v1.4/docs/index.html')
content = a.content.decode()
start=1
files = []
while True:
    i = content.find('href="asct-b/',start)
    if i == -1:
        # No further link. Stopping here avoids appending a junk entry built from
        # the last character of the page, which the old loop did on its final pass.
        break
    # Text between 'href="' and '.html' is the page path; its markdown twin ends in '.md'.
    files.append(content[i:].split('.html')[0].split('href="')[-1]+".md")
    start=i+1
    


# Download every ASCT+B table; pages that could not be fetched yield None and are dropped.
dfs = [get_asctb_file(f) for f in files]
dfs = [i for i in dfs if i is not None]
# One dict per (cell type, gene, tissue, references) combination found in the tables.
parsed_table_entries = []

# Hashes of the entries already emitted, used to skip exact duplicates.
seen=[]
for df in dfs:
    print(df.iloc[0,0])
    
    # Every table is expected to start with the first anatomical-structure column.
    assert df.columns[0]=='AS/1'

    cols = list(df.columns)

    # Reference columns: "Ref/<n>" plus their "/DOI" and "/NOTES" companions.
    ref_prefix = "Ref/"
    ref_doi_suffix = "/DOI"

    ref_prefixes = get_all_prefix_cols(ref_prefix,cols)
    if len(ref_prefixes):
        # Reuse the spelling (case) of the first matched column for the companion lookups.
        prefix=ref_prefixes[0].split('/')[0]+"/"
        ref_suffixes=get_all_suffix_cols(prefix,ref_doi_suffix,cols)
        ref_suffixes_notes=get_all_suffix_cols(prefix,"/NOTES",cols)
    else:
        ref_suffixes=[]
        ref_suffixes_notes=[]


    # Gene columns: "BGene/<n>" plus their "/LABEL" companions.
    gene_prefix = "BGene/"
    gene_label_suffix = "/LABEL"

    gene_prefixes = get_all_prefix_cols(gene_prefix,cols)
    if len(gene_prefixes):
        prefix=gene_prefixes[0].split('/')[0]+"/"
        gene_suffixes=get_all_suffix_cols(prefix,gene_label_suffix,cols)
    else:
        gene_suffixes=[]

    # Anatomical-structure columns: "AS/<n>" plus their "/ID" companions
    # (the variable is named protein_label_suffix but holds the "/ID" suffix).
    tissue_prefix = "AS/"
    protein_label_suffix = "/ID"

    tissue_prefixes = get_all_prefix_cols(tissue_prefix,cols)
    if len(tissue_prefixes):
        prefix=tissue_prefixes[0].split('/')[0]+"/"
        tissue_suffixes=get_all_suffix_cols(prefix,protein_label_suffix,cols) 
    else:
        tissue_suffixes=[]

    ct = "CT/1"
    ctid = "CT/1/ID"
    # Skip tables that lack any of the column groups needed below.
    try:
        assert len(ref_prefixes) > 0
        assert len(ref_suffixes) > 0
        assert len(gene_prefixes) > 0 
        assert len(gene_suffixes) > 0 
        assert len(tissue_suffixes) > 0
    except:
        print("Skipping")
        continue
    
    
    for n in range(df.shape[0]):
        # Keep only the columns of interest for this row.
        res = df.iloc[n].to_dict()
        data_tmp = {i: res[i] for i in res if i in [ct, ctid] + ref_prefixes+ref_suffixes+gene_prefixes+gene_suffixes+tissue_suffixes+ref_suffixes_notes}
        data = data_tmp.copy()
        genes = []
        gene_to_key = {}
        for i in gene_prefixes:
            # Use the first whitespace-separated token of the cell as the gene symbol.
            data_tmp[i] = str(data_tmp[i]).split(' ')[0]
            # Drop genes that are empty ('nan') or not found in all_human_genes.
            if data_tmp[i].upper() not in all_human_genes or data_tmp[i]=='nan':
                try_delete(data,i)
                try_delete(data,i+'/LABEL')
            else:
                data[i] = data_tmp[i].upper()
                genes.append(data[i])
                gene_to_key[data[i]] = i

        valid_ref_accessors = []
        for i in ref_suffixes:
            # Column name without its last segment, e.g. "Ref/1/DOI" -> "Ref/1".
            i_prefix = '/'.join(i.split('/')[:-1])
            
            # NOTE(review): the key is rebuilt with an upper-case '/DOI'; if a table spells
            # this column in lower case the lookup would presumably raise KeyError - confirm.
            if str(data[i_prefix +'/DOI'])=='nan' or str(data[i_prefix +'/DOI'])=='No DOI':
                try_delete(data,i)
                try_delete(data,i_prefix)
                try_delete(data,i_prefix+'/NOTES')
            else:
                # Keep only the last whitespace-separated token of the DOI cell.
                data[i_prefix+'/DOI'] = data[i_prefix+'/DOI'].split(' ')[-1]
                try_delete(data,i_prefix)
                try_delete(data,i_prefix+'/NOTES')                
                valid_ref_accessors.append(i_prefix)

        refs = []
        titles = []
        for i in valid_ref_accessors:
            doi = data[i+'/DOI']
            # Strip a trailing period from the DOI.
            if doi != "":
                if doi[-1] == ".":
                    doi = doi[:-1]

            # One Crossref request per reference of every row.
            title = get_title_and_citation_from_doi(doi)
            refs.append(doi)
            titles.append(title)
            
        # References and their titles are stored as ';;'-separated strings.
        refs = ';;'.join(refs)
        titles = ';;'.join(titles)
            
        # Only rows whose cell type id is a Cell Ontology term are kept.
        # NOTE(review): this check runs after the Crossref lookups above, so skipped rows
        # still trigger network requests.
        if not str(data[ctid]).startswith('CL:'):         
            continue
        
        # General tissue: the last AS id (scanning from the highest-numbered column) found in `tissue`.
        tissue_general = None
        for i in tissue_suffixes[::-1]:
            if data[i] in tissue:
                tissue_general = data[i]
                break

        # Specific tissue: same scan against `tissue_original`.
        tissue_specific = None
        for i in tissue_suffixes[::-1]:
            if data[i] in tissue_original:
                tissue_specific = data[i]
                break

        # Fallback for both: the first AS id (lowest-numbered column) that is an UBERON term.
        # NOTE(review): assumes every AS id cell is a string; a missing (NaN) cell would
        # presumably fail on .startswith - confirm.
        if tissue_general is None:
            for i in tissue_suffixes:
                if data[i].startswith("UBERON:"):
                    tissue_general=data[i]
                    break
                    
        if tissue_specific is None:
            for i in tissue_suffixes:
                if data[i].startswith("UBERON:"):
                    tissue_specific=data[i]                    
                    break
                    
        assert tissue_general is not None
        assert tissue_specific is not None

        for gene in genes:
            label = str(data[gene_to_key[gene]+'/LABEL'])
            # When the label is missing or just repeats the symbol, ask the gene_info service.
            if gene == label.upper() or label == 'nan':
                label = get_gene_name(gene)


            gene_dict = {
                "tissue_general": tissue_general,
                "tissue_specific": tissue_specific,
                "symbol": gene,
                "name": label,
                "publication": refs,
                "publication_titles": titles,
                "cell_type_ontology_term_id": data[ctid]
            }
            # Skip entries identical to one that has already been recorded.
            hashed_dict = hash(json.dumps(gene_dict))
            if hashed_dict not in seen:
                parsed_table_entries.append(gene_dict)
                seen.append(hashed_dict)
                

# Every distinct tissue id used by the parsed entries (general and specific).
general_ids = [row['tissue_general'] for row in parsed_table_entries]
specific_ids = [row['tissue_specific'] for row in parsed_table_entries]
ts = list(set(general_ids + specific_ids))

# Resolve each tissue id to a display name (one lookup per distinct id).
tissues_by_id = {}
for t in ts:
    tissues_by_id[t] = get_tissue_name(t)

# Group the entries by cell type, replacing tissue ids with their names.
gene_infos = {}
for entry in parsed_table_entries:
    entry = entry.copy()
    ct = entry.pop('cell_type_ontology_term_id')
    for field in ('tissue_general', 'tissue_specific'):
        entry[field] = tissues_by_id.get(entry[field], entry[field])
    gene_infos.setdefault(ct, []).append(entry)

  return pd.read_csv(strio,skiprows=10)



.md
brain


In [7]:
def func1(x):
    """Join the distinct non-empty values of a Series with ';;', keeping first-seen order."""
    distinct = dict.fromkeys(value for value in x.values if value != '')
    return ';;'.join(distinct)

def func2(x):
    """Pick the first value of ``x`` that is not one of the gene symbols in the global ``gi2``.

    Falls back to the first value of ``x`` when every value is also a symbol.
    """
    candidates = x.values
    known_symbols = gi2['symbol'].values
    for candidate in candidates:
        if candidate not in known_symbols:
            return candidate
    return candidates[0]

# For every cell type, merge duplicate (tissue, gene) entries and add an
# "All Tissues" summary entry per gene.
for key in gene_infos:
    gi = pd.DataFrame(gene_infos[key])
    # func2 reads the global `gi2` while aggregating, so it must be set before the groupby below.
    gi2 = pd.DataFrame(gi)
    
    # One record per (tissue_general, symbol): name chosen by func2, references merged by func1.
    # Columns not listed in the agg (e.g. tissue_specific) are not carried over.
    gi = gi.groupby(["tissue_general","symbol"]).agg({'name': func2,'publication': func1, 'publication_titles': func1}).reset_index().to_dict(orient="records")
    
    # Count records per (symbol, tissue_general) and keep symbols with a positive count.
    # NOTE(review): after the groupby above each pair occurs once, so this filter
    # appears to keep every gene - confirm the intended threshold.
    gi2 = pd.DataFrame(gi)
    gi2['n']=1
    gi3 = gi2.groupby(['symbol','tissue_general']).sum(numeric_only=True)['n']
    valid_genes = list(set(gi3.index[gi3>0].get_level_values('symbol')))
    gi2 = pd.DataFrame(gi)

    # Per-gene summary across tissues, labelled 'All Tissues'.
    gi2 = gi2[gi2['symbol'].isin(valid_genes)]
    gi3 = gi2.groupby('symbol').agg({'name': func2,'publication': func1, 'publication_titles': func1})
    gi3 = gi3.reset_index()
    gi3['tissue_general']='All Tissues'
    gi3['tissue_specific']='All Tissues'
    # Align the summary's columns with the per-tissue records (columns absent from gi2 are dropped).
    gi3 = gi3[gi2.columns]
    gi.extend(gi3.to_dict(orient="records"))
    gene_infos[key]=gi

In [8]:
# Write through a context manager so the file is flushed and closed deterministically.
with open('frontend/src/views/CellCards/common/fixtures/allCellTypeMarkerGenes.json', 'w') as fh:
    json.dump(gene_infos, fh)