In [1]:
import requests
import pandas as pd
from Postgres_Controller import PostgresConnection

In [2]:
# Connection settings for the Fuseki SPARQL server used by this notebook.
server_name = 'http://77.83.99.86:3030'   # scheme + host + port of the server
service_name = 'ds'                       # name of the service queried on that server
# Full endpoint URL: <server_name>/<service_name>
fuseki_url = '/'.join([server_name, service_name])

In [19]:
# Namespace of the ontology; URIs containing it are shortened to their local name.
URI_prefix = 'http://www.semanticweb.org/alicia/ontologies/2020/8/singleCellRepositories#'

def fuseki_to_df(response):
    """Convert a SPARQL SELECT JSON response into a pandas DataFrame.

    Parameters
    ----------
    response : object exposing ``.json()``
        Response whose JSON body has the SPARQL results layout:
        ``{"head": {"vars": [...]}, "results": {"bindings": [...]}}``.

    Returns
    -------
    pandas.DataFrame
        One row per binding and one column per variable listed in
        ``head.vars`` (in that order). Variables that are unbound in a given
        binding are stored as ``None``. Values are kept as the strings found
        in the JSON; values containing ``URI_prefix`` are reduced to the part
        after the last ``#``. When there are no bindings an *empty* DataFrame
        with the expected columns is returned, so downstream column access
        and filtering keep working.
    """
    payload = response.json()  # parse the body once instead of once per field
    headers = payload["head"]["vars"]
    bindings = payload["results"]["bindings"]

    rows = []
    for binding in bindings:
        row = {}
        for header in headers:
            if header not in binding:
                # OPTIONAL / unbound variables are simply absent from the binding
                row[header] = None
                continue

            value = binding[header]['value']
            if URI_prefix in value:
                # keep only the local name of ontology URIs
                value = value.split('#')[-1]
            row[header] = value

        rows.append(row)

    # Passing columns explicitly keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=headers)

In [4]:
# SPARQL query: for every (projectId, projectTitle, specie, ageUnit) group,
# count the matching specimens and take the minimum of their min ages and the
# maximum of their max ages. Specimens are matched to projects through the
# shared ?projectTitle value (there is no direct project -> specimen link in
# the pattern), and the result is ordered by project title.
query = '''
PREFIX a: <http://www.semanticweb.org/alicia/ontologies/2020/8/singleCellRepositories#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT
    ?projectId
    ?projectTitle
    ?specie
    (COUNT(*) AS ?specimens)
    (MIN(?minAge) as ?min_age)
    (MAX(?maxAge) as ?max_age)
    ?ageUnit
WHERE {
    ?project rdf:type a:Project ;
             a:PR.hasProjectID ?projectId ;
              a:SPR.hasProjectTitle ?projectTitle .
    ?specimen rdf:type a:Specimen ;
              a:SPR.hasProjectTitle ?projectTitle ;
              a:SPR.hasMinAge ?minAge ;
              a:SPR.hasMaxAge ?maxAge ;
              a:SPR.hasSpecie ?specie ;
              a:SPR.hasAgeUnit ?ageUnit .
}
GROUP BY 
    ?projectId
    ?projectTitle
    ?specie
    ?ageUnit
ORDER BY ?projectTitle
'''

In [5]:
# Send the SPARQL query to the Fuseki endpoint as a form-encoded POST.
# The timeout stops the notebook from hanging forever on an unreachable server,
# and raise_for_status() surfaces HTTP errors (bad query, wrong service name)
# here instead of as a confusing JSON/KeyError failure in a later cell.
response = requests.post(fuseki_url, data={'query': query}, timeout=60)
response.raise_for_status()

In [6]:
# Inspect the raw parsed JSON body of the SPARQL response (head.vars + results.bindings).
response.json()

{'head': {'vars': ['projectId',
   'projectTitle',
   'specie',
   'specimens',
   'min_age',
   'max_age',
   'ageUnit']},
 'results': {'bindings': [{'projectId': {'type': 'literal',
     'value': '74b6d569-3b11-42ef-b6b1-a0454522b4a0'},
    'projectTitle': {'type': 'literal',
     'value': '1.3 Million Brain Cells from E18 Mice'},
    'specie': {'type': 'uri',
     'value': 'http://www.semanticweb.org/alicia/ontologies/2020/8/singleCellRepositories#MusMusculus'},
    'specimens': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '2'},
    'min_age': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '18'},
    'max_age': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '18'},
    'ageUnit': {'type': 'literal', 'value': 'day'}},
   {'projectId': {'type': 'literal',
     'value': 'f86f1ab4-1fbb-4510-ae35-3ffd752d4dfc'},
    'projectTitle': {'type': 'lit

In [21]:
# Flatten the SPARQL bindings into a table (one row per binding) and display it.
specimens = fuseki_to_df(response)
specimens

Unnamed: 0,projectId,projectTitle,specie,specimens,min_age,max_age,ageUnit
0,74b6d569-3b11-42ef-b6b1-a0454522b4a0,1.3 Million Brain Cells from E18 Mice,MusMusculus,2,18,18,day
1,f86f1ab4-1fbb-4510-ae35-3ffd752d4dfc,A Single-Cell Transcriptomic Map of the Human ...,HomoSapiens,4,17,59,year
2,E-GEOD-141273,A single cell transcriptome atlas of myeloid b...,DrosophilaMelanogaster,4,72,120,hour
3,E-GEOD-150728,A single-cell atlas of the peripheral immune r...,HomoSapiens,12,20,69,year
4,1defdada-a365-44ad-9b29-443b06bd11d6,A single-cell molecular map of mouse gastrulat...,MusMusculus,26,7,9,week
...,...,...,...,...,...,...,...
139,E-MTAB-7678,scRNA-seq analysis of lung CD64-expressing mon...,MusMusculus,1,10,29,week
140,E-CURD-57,scRNA-seq of Anopheles gambiae hemocytes after...,AnophelesGambiae,3,4,4,day
141,E-MTAB-6677,scRNA-seq of Mus musculus subcutaneous Lin- ce...,MusMusculus,1,8,8,week
142,E-MTAB-8221,scRNA-seq of human fetal lung primary tissues ...,HomoSapiens,9,-1,21,week


In [17]:
# SQL: list every distinct project_id present in the percentil_groups table.
# NOTE: this rebinds `query`, which previously held the SPARQL text.
query = '''
    SELECT DISTINCT project_id FROM percentil_groups;
'''

In [18]:
# Fetch the distinct project ids from PostgreSQL into a flat list of values.
with PostgresConnection() as conn:
    cur = conn.cursor()
    try:
        print("Querying project ids with percentile groups")
        cur.execute(query)
        # each fetched row is a 1-tuple; keep only its single column
        project_ids = [row[0] for row in cur.fetchall()]
    finally:
        # always release the cursor, even if the query fails
        cur.close()

    print(project_ids)

Querying number of percentiles
['E-MTAB-7703', 'E-MTAB-7381', 'E-GEOD-81383', 'E-GEOD-149689', 'E-MTAB-6818', 'E-GEOD-109159', 'E-MTAB-7901', 'E-GEOD-99795', 'E-ENAD-16', 'E-GEOD-36552', 'E-GEOD-100058', 'E-GEOD-139324', 'E-GEOD-100426', 'E-CURD-13', 'E-GEOD-124472', 'E-GEOD-111727', 'E-GEOD-100618', 'E-MTAB-7660', 'E-MTAB-6653', 'E-CURD-3', 'E-GEOD-90848', 'E-GEOD-75367', '116965f3-f094-4769-9d28-ae675c1b569c', 'E-MTAB-7142', 'E-GEOD-146122', 'E-MTAB-6505', 'E-GEOD-124263', 'E-ENAD-21', 'E-MTAB-7678', 'E-MTAB-8077', 'E-GEOD-86618', 'E-ENAD-19', 'E-MTAB-6946', 'E-GEOD-99058', 'E-MTAB-7324', 'E-GEOD-99235', 'E-GEOD-109979', 'E-MTAB-7094', 'E-MTAB-7311', 'E-GEOD-98556', 'E-GEOD-125970', 'E-CURD-21', 'E-MTAB-6362', 'E-GEOD-71585', 'E-MTAB-6819', 'E-GEOD-137537', 'E-MTAB-7407', 'E-MTAB-6677', 'E-GEOD-135922', 'E-MTAB-7325', 'E-ENAD-18', 'E-MTAB-6976', 'E-MTAB-5530', 'E-MTAB-6308', 'E-MTAB-7427', 'E-MTAB-6945', 'E-GEOD-81608', 'E-MTAB-7376', 'E-GEOD-130473', 'E-MTAB-5485', '4d6f6c96-2a83-43

In [25]:
# Keep only the specimen rows whose project id was returned by the
# percentil_groups query.
has_percentiles = specimens['projectId'].isin(project_ids)
specimens = specimens.loc[has_percentiles]
specimens

Unnamed: 0,projectId,projectTitle,specie,specimens,min_age,max_age,ageUnit
2,E-GEOD-141273,A single cell transcriptome atlas of myeloid b...,DrosophilaMelanogaster,4,72,120,hour
3,E-GEOD-150728,A single-cell atlas of the peripheral immune r...,HomoSapiens,12,20,69,year
5,4a95101c-9ffc-4f30-a809-f04518a23803,A single-cell reference map of transcriptional...,HomoSapiens,16,50,65,year
7,E-GEOD-98556,A versatile approach for assessing human stem ...,HomoSapiens,1,-1,-1,notApplicable
8,E-GEOD-98556,A versatile approach for assessing human stem ...,HomoSapiens,1,64,64,year
...,...,...,...,...,...,...,...
139,E-MTAB-7678,scRNA-seq analysis of lung CD64-expressing mon...,MusMusculus,1,10,29,week
140,E-CURD-57,scRNA-seq of Anopheles gambiae hemocytes after...,AnophelesGambiae,3,4,4,day
141,E-MTAB-6677,scRNA-seq of Mus musculus subcutaneous Lin- ce...,MusMusculus,1,8,8,week
142,E-MTAB-8221,scRNA-seq of human fetal lung primary tissues ...,HomoSapiens,9,-1,21,week


In [27]:
# List the distinct age units present in the remaining rows.
specimens['ageUnit'].unique()

array(['hour', 'year', 'notApplicable', 'week', 'day', 'gestationalWeek',
       'month', 'notAvailable', 'embryonicDay'], dtype=object)

In [29]:
# Discard rows whose age unit is one of the placeholder values
# ('notApplicable' / 'notAvailable').
placeholder_units = ['notApplicable', 'notAvailable']
is_placeholder = specimens['ageUnit'].isin(placeholder_units)
specimens = specimens[~is_placeholder]
specimens

Unnamed: 0,projectId,projectTitle,specie,specimens,min_age,max_age,ageUnit
2,E-GEOD-141273,A single cell transcriptome atlas of myeloid b...,DrosophilaMelanogaster,4,72,120,hour
3,E-GEOD-150728,A single-cell atlas of the peripheral immune r...,HomoSapiens,12,20,69,year
5,4a95101c-9ffc-4f30-a809-f04518a23803,A single-cell reference map of transcriptional...,HomoSapiens,16,50,65,year
8,E-GEOD-98556,A versatile approach for assessing human stem ...,HomoSapiens,1,64,64,year
9,E-MTAB-6308,An integrated approach to profile lung tumor e...,HomoSapiens,17,57,74,year
...,...,...,...,...,...,...,...
139,E-MTAB-7678,scRNA-seq analysis of lung CD64-expressing mon...,MusMusculus,1,10,29,week
140,E-CURD-57,scRNA-seq of Anopheles gambiae hemocytes after...,AnophelesGambiae,3,4,4,day
141,E-MTAB-6677,scRNA-seq of Mus musculus subcutaneous Lin- ce...,MusMusculus,1,8,8,week
142,E-MTAB-8221,scRNA-seq of human fetal lung primary tissues ...,HomoSapiens,9,-1,21,week


In [32]:
# Order the rows by project id and display the result.
specimens = specimens.sort_values(by='projectId')
specimens

Unnamed: 0,projectId,projectTitle,specie,specimens,min_age,max_age,ageUnit
37,091cf39b-01bc-42e5-9437-f419a66c8a45,Profiling of CD34+ cells from human bone marro...,HomoSapiens,3,19,35,year
5,4a95101c-9ffc-4f30-a809-f04518a23803,A single-cell reference map of transcriptional...,HomoSapiens,16,50,65,year
14,4d6f6c96-2a83-43d8-8fe1-0f53bffd4674,Dissecting the human liver cellular landscape ...,HomoSapiens,5,21,65,year
30,8c3c290d-dfff-4553-8868-54ce45f4ba7f,Melanoma infiltration of stromal and immune cells,MusMusculus,54,6,12,week
125,E-CURD-10,Single-cell transcriptome profiling for metast...,HomoSapiens,3,43,43,year
...,...,...,...,...,...,...,...
127,abe1a013-af7a-45ed-8c26-f3793c24a1f4,Spatio-temporal immune zonation of the human k...,HomoSapiens,6,7,16,week
100,ae71be1d-ddd8-4feb-9bed-24c3ddb6e1ad,Single-cell RNA-seq analysis of human pancreas...,HomoSapiens,10,22,57,year
12,cc95ff89-2e68-4a08-a234-480eca21ce79,Census of Immune Cells,HomoSapiens,127,0,52,year
130,f8aa201c-4ff1-45a4-890e-840d63459ca2,Structural Remodeling of the Human Colonic Mes...,MusMusculus,6,10,12,week


In [33]:
# Write the filtered, sorted table to a comma-separated file (relative path),
# leaving out the DataFrame index column.
specimens.to_csv('percentile_specimens.csv', sep=',', index=False)

In [36]:
# SQL: fetch the percentile rows (joined with their percentile group) for two
# gene ids, restricted to groups whose JSON metadata has 'cell type' in
# ('MemoryBcell', 'BloodCell') and 'organism' = 'HomoSapiens'.
# NOTE(review): 'ENSDARG00000034326' looks like a non-human Ensembl id, so it
# presumably never matches the 'HomoSapiens' organism filter — confirm.
# NOTE: this rebinds `query` again.
query = '''
SELECT 
    gene_name,
    percentile,
    project_id,
    metadata,
    number_genes,
    number_cells
FROM 
    percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
WHERE 
    gene_name IN ('ENSG00000287846', 'ENSDARG00000034326') AND 
    metadata->>'cell type' IN ('MemoryBcell', 'BloodCell') AND 
    metadata->>'organism' = 'HomoSapiens'
'''

In [38]:
# Run the percentile query against PostgreSQL and display the fetched rows.
with PostgresConnection() as conn:
    cur = conn.cursor()
    try:
        print("Querying percentiles for the selected genes")
        cur.execute(query)
        answer = cur.fetchall()
    finally:
        # always release the cursor, even if the query fails
        cur.close()

    display(answer)

Querying number of percentiles


[('ENSG00000287846',
  37.29756029749079,
  'E-MTAB-6386',
  {'organism': 'HomoSapiens',
   'cell type': 'MemoryBcell',
   'developmental stage': 'Adult',
   'disease': 'Control',
   'organism part': 'Blood'},
  14387,
  30),
 ('ENSG00000287846',
  10.537021969080554,
  'cc95ff89-2e68-4a08-a234-480eca21ce79',
  {'organism': 'HomoSapiens',
   'cell type': 'BloodCell',
   'developmental stage': 'Infant',
   'organism part': 'UmbilicalCordBlood'},
  24580,
  288305)]