In [1]:
import pandas as pd

from Postgres_Controller import PostgresConnection

# Queries to explore the database

## Counting percentiles and samplings

In [4]:
query_n_percentiles = "SELECT COUNT(*) FROM percentiles;"
query_n_samplings = "SELECT COUNT(*) FROM percentil_groups;"

with PostgresConnection() as conn:
    cur = conn.cursor()

    print("Querying number of percentiles")
    %time cur.execute(query_n_percentiles)
    n_percentiles = cur.fetchone()[0]
    
    print("Querying number of samplings")
    %time cur.execute(query_n_samplings)
    n_samplings = cur.fetchone()[0]
    
    # close communication with the PostgreSQL database server
    cur.close()

Querying number of percentiles
CPU times: user 1.38 ms, sys: 1.15 ms, total: 2.53 ms
Wall time: 13.3 s
Querying number of samplings
CPU times: user 814 µs, sys: 481 µs, total: 1.3 ms
Wall time: 27.8 ms


In [3]:
# Report the two row counts computed by the counting cell.
counts_message = f"We have {n_percentiles} percentiles and {n_samplings} samplings in the database."
print(counts_message)

We have 20368177 percentiles and 923 samplings in the database.


## Querying percentiles by gene_name

In [4]:
gene_name = "ENSG00000167468" # GPX4

query_percentile = f"""
    SELECT * 
    FROM percentiles
    WHERE
        gene_name = '{gene_name}';
"""

with PostgresConnection() as conn:
    cur = conn.cursor()

    print("Querying percentiles by gene name")
    %time cur.execute(query_percentile)
    percentiles = cur.fetchall()
    
    # close communication with the PostgreSQL database server
    cur.close()
    
percentiles[:5]

Querying percentiles by gene name
CPU times: user 1.79 ms, sys: 721 µs, total: 2.52 ms
Wall time: 52.6 ms


[(10846, 'ENSG00000167468', 98.54159621237741, 1),
 (34502, 'ENSG00000167468', 98.16482828380136, 2),
 (58158, 'ENSG00000167468', 96.70213329098513, 3),
 (81814, 'ENSG00000167468', 98.5434641432415, 4),
 (105470, 'ENSG00000167468', 98.97387216345719, 5)]

In [5]:
gene_name = "ENSG00000167468" # GPX4

query_percentile = f"""
    SELECT * 
    FROM percentiles
    WHERE
        gene_name = '{gene_name}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 127 ms, sys: 97.1 ms, total: 224 ms
Wall time: 297 ms


Unnamed: 0,id,gene_name,percentile,percentil_group
0,10846,ENSG00000167468,98.541596,1
1,34502,ENSG00000167468,98.164828,2
2,58158,ENSG00000167468,96.702133,3
3,81814,ENSG00000167468,98.543464,4
4,105470,ENSG00000167468,98.973872,5
...,...,...,...,...
538,20276006,ENSG00000167468,95.508828,919
539,20296644,ENSG00000167468,96.830944,920
540,20317282,ENSG00000167468,97.083483,921
541,20337920,ENSG00000167468,95.525727,922


## Querying samplings by metadata

In [6]:
specie = "HomoSapiens"

query_sampling = f"""
    SELECT * 
    FROM percentil_groups
    WHERE
        metadata->>'organism' = '{specie}';
"""

with PostgresConnection() as conn:
    cur = conn.cursor()

    print("Querying samplings by specie")
    %time cur.execute(query_sampling)
    samplings = cur.fetchall()
    
    # close communication with the PostgreSQL database server
    cur.close()

samplings[:5]

Querying samplings by specie
CPU times: user 484 µs, sys: 456 µs, total: 940 µs
Wall time: 35.7 ms


[(1,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'cell type': 'HematopoieticStemCell',
   'developmental stage': 'Adult',
   'disease': 'Control',
   'organism part': 'BoneMarrow'},
  23656,
  32445),
 (2,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'CommonDendriticProgenitor'},
  20761,
  2075),
 (3,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'CommonLymphoidProgenitor'},
  18891,
  3237),
 (4,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'CommonMyeloidProgenitor'},
  21558,
  2328),
 (5,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'Erythr

In [7]:
specie = "HomoSapiens"

query_sampling = f"""
    SELECT * 
    FROM percentil_groups
    WHERE
        metadata->>'organism' = '{specie}';
"""

with PostgresConnection() as conn:
     %time samplings = pd.read_sql_query(query_sampling, conn)
    
samplings

CPU times: user 6.59 ms, sys: 874 µs, total: 7.46 ms
Wall time: 40.8 ms


Unnamed: 0,id,project_id,metadata,number_genes,number_cells
0,1,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'cell type': 'Hema...",23656,32445
1,2,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",20761,2075
2,3,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",18891,3237
3,4,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",21558,2328
4,5,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",22122,3463
...,...,...,...,...,...
538,919,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",13649,63
539,920,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",16882,471
540,921,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",16698,539
541,922,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",12963,77


## Join between percentiles and samplings

In [8]:
gene_name = "ENSG00000167468" # GPX4

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        gene_name = '{gene_name}';
"""

with PostgresConnection() as conn:
    cur = conn.cursor()

    print("Querying percentiles by gene name")
    %time cur.execute(query_percentile)
    percentiles = cur.fetchall()
    
    # close communication with the PostgreSQL database server
    cur.close()
    
percentiles[:5]

Querying percentiles by gene name
CPU times: user 1.09 ms, sys: 685 µs, total: 1.78 ms
Wall time: 35.3 ms


[('ENSG00000167468',
  98.54159621237741,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'cell type': 'HematopoieticStemCell',
   'developmental stage': 'Adult',
   'disease': 'Control',
   'organism part': 'BoneMarrow'},
  23656,
  32445),
 ('ENSG00000167468',
  98.16482828380136,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'CommonDendriticProgenitor'},
  20761,
  2075),
 ('ENSG00000167468',
  96.70213329098513,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'CommonLymphoidProgenitor'},
  18891,
  3237),
 ('ENSG00000167468',
  98.5434641432415,
  '091cf39b-01bc-42e5-9437-f419a66c8a45',
  {'organism': 'HomoSapiens',
   'developmental stage': 'Adult',
   'inferred cell type - ontology labels': 'CommonMyeloidProgenitor'},
  21558,
  2328),
 ('ENSG00000167

In [9]:
gene_name = "ENSG00000167468" # GPX4

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        gene_name = '{gene_name}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 14.1 ms, sys: 1.5 ms, total: 15.6 ms
Wall time: 75.4 ms


Unnamed: 0,gene_name,percentile,project_id,metadata,number_genes,number_cells
0,ENSG00000167468,98.541596,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'cell type': 'Hema...",23656,32445
1,ENSG00000167468,98.164828,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",20761,2075
2,ENSG00000167468,96.702133,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",18891,3237
3,ENSG00000167468,98.543464,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",21558,2328
4,ENSG00000167468,98.973872,091cf39b-01bc-42e5-9437-f419a66c8a45,"{'organism': 'HomoSapiens', 'developmental sta...",22122,3463
...,...,...,...,...,...,...
538,ENSG00000167468,95.508828,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",13649,63
539,ENSG00000167468,96.830944,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",16882,471
540,ENSG00000167468,97.083483,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",16698,539
541,ENSG00000167468,95.525727,f8aa201c-4ff1-45a4-890e-840d63459ca2,"{'organism': 'HomoSapiens', 'developmental sta...",12963,77


## Counting genes

In [10]:
gene_name = "ENSG00000167468" # GPX4

query_percentile = f"""
    SELECT 
        gene_name,
        COUNT (*) AS num_percentiles
    FROM 
        percentiles
    GROUP BY
        gene_name;
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 186 ms, sys: 62 ms, total: 248 ms
Wall time: 5.24 s


Unnamed: 0,gene_name,num_percentiles
0,AGAP000002,3
1,AGAP000005,5
2,AGAP000007,5
3,AGAP000008,5
4,AGAP000009,5
...,...,...
226438,YPR201W,2
226439,YPR202W,2
226440,YPR203W,2
226441,YPR204C-A,2


In [11]:
# One row per gene in `percentiles`, so the row count is the number of genes.
n_genes = len(percentiles)
mean_per_gene = percentiles['num_percentiles'].mean()
print(f"In our database we have {n_genes} different genes, with a mean of {mean_per_gene:.2f} percentiles per gene.")

In our database we have 226443 different genes, with a mean of 89.95 percentiles per gene.


In [12]:
specie = "HomoSapiens"

query_percentile = f"""
    SELECT 
        gene_name,
        COUNT (*) AS num_percentiles
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        metadata->>'organism' = '{specie}'
    GROUP BY
        gene_name;
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 32.2 ms, sys: 5.05 ms, total: 37.3 ms
Wall time: 7.6 s


Unnamed: 0,gene_name,num_percentiles
0,ENSG00000079805,543
1,ENSG00000164611,543
2,ENSG00000219993,447
3,ENSG00000035115,543
4,ENSG00000215452,482
...,...,...
35003,ENSG00000230376,55
35004,ENSG00000261151,101
35005,ENSG00000250625,14
35006,ENSG00000204705,136


In [13]:
# One row per human gene in `percentiles`, so the row count is the number of genes.
n_human_genes = len(percentiles)
mean_per_human_gene = percentiles['num_percentiles'].mean()
print(f"In our database we have {n_human_genes} different human genes, with a mean of {mean_per_human_gene:.2f} percentiles per gene.")

In our database we have 35008 different human genes, with a mean of 362.90 percentiles per gene.


In [14]:
specie = "HomoSapiens"

query_percentile = f"""
    SELECT 
        project_id,
        COUNT (*) AS num_percentiles
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    GROUP BY
        project_id;
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 2.06 ms, sys: 238 µs, total: 2.3 ms
Wall time: 5.44 s


Unnamed: 0,project_id,num_percentiles
0,091cf39b-01bc-42e5-9437-f419a66c8a45,212904
1,116965f3-f094-4769-9d28-ae675c1b569c,131005
2,2043c65a-1cf8-4828-a656-9e247d4e64f1,217096
3,4a95101c-9ffc-4f30-a809-f04518a23803,182056
4,4d6f6c96-2a83-43d8-8fe1-0f53bffd4674,286155
...,...,...
164,E-MTAB-8559,43450
165,E-MTAB-8698,35944
166,E-MTAB-8810,382266
167,E-MTAB-9221,151664


In [15]:
# One row per project in `percentiles`, so the row count is the number of projects.
n_projects = len(percentiles)
mean_per_project = percentiles['num_percentiles'].mean()
print(f"In our database we have {n_projects} different projects, with a mean of {mean_per_project:.2f} percentiles per project.")

In our database we have 169 different projects, with a mean of 120521.76 percentiles per project.


# Application queries examples

We are going to design some queries and check the time they take.

First, we drop the indexes and time the queries without them. Then we create the indexes and time the same queries again to see how much each one improves.

In [2]:
command = '''
    DROP INDEX IF EXISTS index_percentiles_id;
    DROP INDEX IF EXISTS index_percentiles_gene_name;
    DROP INDEX IF EXISTS index_percentiles_percentil_group;
    
    DROP INDEX IF EXISTS index_percentil_groups_id;
    DROP INDEX IF EXISTS index_percentil_groups_project_id;
    DROP INDEX IF EXISTS index_percentil_groups_metadata_cell_type;   
    DROP INDEX IF EXISTS index_percentil_groups_metadata_organism;   
'''

with PostgresConnection() as conn:
        cur = conn.cursor()
        # execute command
        %time cur.execute(command)
        # close communication with the PostgreSQL database server
        cur.close()
        # commit the changes
        conn.commit()


CPU times: user 87 µs, sys: 161 µs, total: 248 µs
Wall time: 60.6 ms


## Application queries examples without index

In this first query, we look up a specific gene in human samplings annotated with Parkinson's disease.

In [17]:
gene_name = "ENSG00000167468" # GPX4
disease = 'ParkinsonsDisease'
specie = 'HomoSapiens'

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        metadata->'disease' AS disease,
        metadata->'cell type' AS cell_type,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        gene_name = '{gene_name}' AND
        metadata->>'organism' = '{specie}' AND
        metadata->>'disease' = '{disease}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 2.01 ms, sys: 122 µs, total: 2.13 ms
Wall time: 3.09 s


Unnamed: 0,gene_name,percentile,project_id,metadata,disease,cell_type,number_genes,number_cells
0,ENSG00000167468,98.659703,E-MTAB-7303,"{'organism': 'HomoSapiens', 'cell type': 'Dopa...",ParkinsonsDisease,DopaminergicNeuron,15892,37
1,ENSG00000167468,98.659703,E-MTAB-7303,"{'organism': 'HomoSapiens', 'developmental sta...",ParkinsonsDisease,,15892,37


Now we search for the same gene in human samplings, this time restricted to dopaminergic neurons.

In [18]:
gene_name = "ENSG00000167468" # GPX4
cell_type = 'DopaminergicNeuron'
specie = 'HomoSapiens'

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        metadata->'disease' AS disease,
        metadata->'cell type' AS cell_type,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        gene_name = '{gene_name}' AND
        metadata->>'organism' = '{specie}' AND
        metadata->>'cell type' = '{cell_type}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 2.32 ms, sys: 166 µs, total: 2.49 ms
Wall time: 3.15 s


Unnamed: 0,gene_name,percentile,project_id,metadata,disease,cell_type,number_genes,number_cells
0,ENSG00000167468,98.659703,E-MTAB-7303,"{'organism': 'HomoSapiens', 'cell type': 'Dopa...",ParkinsonsDisease,DopaminergicNeuron,15892,37
1,ENSG00000167468,99.178163,E-MTAB-7303,"{'organism': 'HomoSapiens', 'cell type': 'Dopa...",Control,DopaminergicNeuron,17035,86


Another type of query retrieves all the genes of a specific project.

In [19]:
project_id = "E-CURD-55"

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        project_id = '{project_id}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 312 ms, sys: 35.2 ms, total: 347 ms
Wall time: 3.14 s


Unnamed: 0,gene_name,percentile,project_id,metadata,number_genes,number_cells
0,ENSG00000065150,78.076942,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
1,ENSG00000000003,30.307095,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
2,ENSG00000000419,69.510851,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
3,ENSG00000000457,60.312365,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
4,ENSG00000000460,50.079049,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
...,...,...,...,...,...,...
41787,ENSG00000229644,38.748539,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
41788,ENSG00000133703,91.994104,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
41789,ENSG00000133704,70.467138,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
41790,ENSG00000133706,85.579220,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233


Or get a subgroup of percentiles for a project.

In [20]:
project_id = "E-CURD-55"

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        project_id = '{project_id}' AND
        percentile > 90;
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 24.3 ms, sys: 1.71 ms, total: 26 ms
Wall time: 1.74 s


Unnamed: 0,gene_name,percentile,project_id,metadata,number_genes,number_cells
0,ENSG00000000938,97.365017,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
1,ENSG00000006652,95.822354,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
2,ENSG00000007080,90.365544,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
3,ENSG00000007168,92.564557,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
4,ENSG00000007264,91.294974,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
...,...,...,...,...,...,...
4051,ENSG00000157191,90.087938,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
4052,ENSG00000176731,95.572612,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
4053,ENSG00000176978,96.594317,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
4054,ENSG00000177105,97.799014,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233


Or even get the number of groups (samplings) a project has.

In [21]:
project_id = "E-CURD-55"

query_percentile = f"""
    SELECT 
        project_id,
        COUNT(*)
    FROM 
        percentil_groups
    WHERE
        project_id = '{project_id}'
    GROUP BY
        project_id;

"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 2.99 ms, sys: 254 µs, total: 3.25 ms
Wall time: 49.3 ms


Unnamed: 0,project_id,count
0,E-CURD-55,2


## Creating index

We can now create indexes on the columns used in the queries.

In [3]:
command = '''
    CREATE INDEX IF NOT EXISTS index_percentiles_id ON percentiles(id);
    CREATE INDEX IF NOT EXISTS index_percentiles_gene_name ON percentiles(gene_name);
    CREATE INDEX IF NOT EXISTS index_percentiles_percentil_group ON percentiles(percentil_group);
    
    CREATE INDEX IF NOT EXISTS index_percentil_groups_id ON percentil_groups(id);
    CREATE INDEX IF NOT EXISTS index_percentil_groups_project_id ON percentil_groups(project_id);
    CREATE INDEX IF NOT EXISTS index_percentil_groups_metadata_cell_type ON percentil_groups((metadata ->> 'cell type'));    
    CREATE INDEX IF NOT EXISTS index_percentil_groups_metadata_organism ON percentil_groups((metadata ->> 'organism'));    
'''

with PostgresConnection() as conn:
        cur = conn.cursor()
        # execute command
        %time cur.execute(command)
        # close communication with the PostgreSQL database server
        cur.close()
        # commit the changes
        conn.commit()


CPU times: user 4.21 ms, sys: 5.97 ms, total: 10.2 ms
Wall time: 3min 11s


## Application queries examples with index

Finally, we repeat the previous queries; the wall times below show a clear improvement.

In [23]:
gene_name = "ENSG00000167468" # GPX4
disease = 'ParkinsonsDisease'
specie = 'HomoSapiens'

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        metadata->'disease' AS disease,
        metadata->'cell type' AS cell_type,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        gene_name = '{gene_name}' AND
        metadata->>'organism' = '{specie}' AND
        metadata->>'disease' = '{disease}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 3.17 ms, sys: 6.44 ms, total: 9.61 ms
Wall time: 31.2 ms


Unnamed: 0,gene_name,percentile,project_id,metadata,disease,cell_type,number_genes,number_cells
0,ENSG00000167468,98.659703,E-MTAB-7303,"{'organism': 'HomoSapiens', 'cell type': 'Dopa...",ParkinsonsDisease,DopaminergicNeuron,15892,37
1,ENSG00000167468,98.659703,E-MTAB-7303,"{'organism': 'HomoSapiens', 'developmental sta...",ParkinsonsDisease,,15892,37


In [24]:
gene_name = "ENSG00000167468" # GPX4
cell_type = 'DopaminergicNeuron'
specie = 'HomoSapiens'

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        metadata->'disease' AS disease,
        metadata->'cell type' AS cell_type,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        gene_name = '{gene_name}' AND
        metadata->>'organism' = '{specie}' AND
        metadata->>'cell type' = '{cell_type}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 2.95 ms, sys: 227 µs, total: 3.18 ms
Wall time: 12 ms


Unnamed: 0,gene_name,percentile,project_id,metadata,disease,cell_type,number_genes,number_cells
0,ENSG00000167468,98.659703,E-MTAB-7303,"{'organism': 'HomoSapiens', 'cell type': 'Dopa...",ParkinsonsDisease,DopaminergicNeuron,15892,37
1,ENSG00000167468,99.178163,E-MTAB-7303,"{'organism': 'HomoSapiens', 'cell type': 'Dopa...",Control,DopaminergicNeuron,17035,86


In [25]:
project_id = "E-CURD-55"

query_percentile = f"""
    SELECT 
        gene_name,
        percentile,
        project_id,
        metadata,
        number_genes,
        number_cells
    FROM 
        percentiles INNER JOIN percentil_groups ON percentiles.percentil_group = percentil_groups.id
    WHERE
        project_id = '{project_id}';
"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 442 ms, sys: 175 ms, total: 617 ms
Wall time: 1.23 s


Unnamed: 0,gene_name,percentile,project_id,metadata,number_genes,number_cells
0,ENSG00000000003,30.307095,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
1,ENSG00000000419,69.510851,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
2,ENSG00000000457,60.312365,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
3,ENSG00000000460,50.079049,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
4,ENSG00000000938,97.365017,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",20873,74355
...,...,...,...,...,...,...
41787,ENSG00000288520,45.188837,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
41788,ENSG00000288534,49.184161,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
41789,ENSG00000288550,47.623647,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233
41790,ENSG00000288558,66.121080,E-CURD-55,"{'organism': 'HomoSapiens', 'developmental sta...",19673,23233


In [26]:
project_id = "E-CURD-55"

query_percentile = f"""
    SELECT 
        project_id,
        COUNT(*)
    FROM 
        percentil_groups
    WHERE
        project_id = '{project_id}'
    GROUP BY
        project_id;

"""

with PostgresConnection() as conn:
    %time percentiles = pd.read_sql_query(query_percentile, conn)
    
percentiles

CPU times: user 1.32 ms, sys: 116 µs, total: 1.44 ms
Wall time: 7.99 ms


Unnamed: 0,project_id,count
0,E-CURD-55,2
