In [5]:
# Upgrade (or install) the third-party packages into this notebook's kernel environment.
# NOTE(review): no versions are pinned, so a later re-run may pull different releases;
# pin each package (pkg==x.y.z) once the working versions are known.
%pip install --upgrade pandas
%pip install --upgrade numpy
%pip install --upgrade psycopg2-binary
%pip install --upgrade owlready2

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting owlready2
  Downloading owlready2-0.45.tar.gz (27.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: owlready2
  Building wheel for owlready2 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for owlready2: filename=owlready2-0.45-cp39-cp39-linux_aarch64.whl size=24039811 sha256=8062c5081f3e0bae98e45d688d6b0858a88fdb960f9cfb468f3e70d2b7427eec
  Stored in directory: /home/ec2-user/.cache/pip/wheels/fd/3d/21/a3354f6877530e6efe0f645ca518f7fb07e9af5be7655b2355
Successfully bu

----
# Loading BTO

In [6]:
from owlready2 import *

# Load the BTO ontology ('../data/ontologies/' is appended to owlready2's onto_path first)
onto_path.append('../data/ontologies/')
onto = get_ontology('http://purl.obolibrary.org/obo/bto.owl').load()

# Lookup tables built from the ontology classes:
#   classes         : {class_name: class_label}  (class_label is None if no label is found)
#   bto_objects     : {class_name: class_object}
#   bto_objects_rev : {class_object: class_name}
classes = {c.name: c.label.first() for c in onto.classes()}
bto_objects = {c.name: c for c in onto.classes()}
bto_objects_rev = {c: c.name for c in onto.classes()}

# {class_name: set of immediate child names}; every class starts with no children
bto_immediate_children = {class_name: set() for class_name in bto_objects}

# Register each class under every parent listed in its is_a.
for bto_class in bto_objects_rev:
    for bto_parent_class in bto_class.is_a:
        # the root class is not recorded as a parent
        if bto_parent_class == owl.Thing:
            continue

        bto_class_type = type(bto_parent_class)
        if bto_class_type == ThingClass:
            # plain named superclass
            parent_name = bto_parent_class.name
        elif bto_class_type == Restriction:
            # Author's note: restriction 2202 defines a derives_from/develops_from
            # relationship. The restricted property is not inspected here: the
            # restriction's value is treated as a parent whatever the property is.
            parent_name = bto_parent_class.value.name
        else:
            # any other kind of is_a entry is ignored
            continue

        bto_immediate_children[parent_name].add(bto_class.name)

def get_all_children(bto_immediate_children, bto_class):
    '''
    Return every descendant (children, grandchildren, ...) of a given class.

    Parameters
    - bto_immediate_children: dict mapping a class name to the set of names of its
      immediate children; every name that appears as a child must also be a key
    - bto_class: name of the class whose descendants are wanted

    Returns
    - a new set holding the names of all classes reachable from bto_class
      (empty when the class has no children); the input mapping is not modified
    - bto_class itself is part of the result only when the graph contains a cycle
      that leads back to it

    Raises
    - KeyError if bto_class, or any class reached from it, is not a key of
      bto_immediate_children

    The walk is iterative and remembers visited names, so deep hierarchies do not
    hit Python's recursion limit, sub-trees shared by several parents are expanded
    only once, and cyclic relationships terminate instead of recursing forever.
    '''
    descendants = set()
    # names that have been discovered but whose own children are not yet expanded
    pending = list(bto_immediate_children[bto_class])
    while pending:
        current = pending.pop()
        if current in descendants:
            # already expanded via another parent (or via a cycle)
            continue
        descendants.add(current)
        pending.extend(bto_immediate_children[current])
    return descendants


# {class_name: set of all descendant names}, one entry per key of bto_immediate_children
bto_children = {}
for bto_class in bto_immediate_children:
    bto_children[bto_class] = get_all_children(bto_immediate_children, bto_class)







----
# Retrieve table from PostgreSQL 
- Want to store the `biosample_tissue` table in a pandas df
    - Could also retrieve data from `tissue_map_rerun.csv` but use `biosample_tissue` to be consistent

In [9]:
import psycopg2
import pandas as pd

# database connection details
# NOTE(review): credentials are hardcoded; the user name suggests a public read-only
# account, but confirm that before sharing, or read them from environment variables.
host = "serratus-aurora-20210406.cluster-ro-ccz9y6yshbls.us-east-1.rds.amazonaws.com"
database = "summary"
user = "public_reader"
password = "serratus"

# Connect first, in its own try block, so that "Unable to connect" is only reported
# for connection problems. The error is re-raised after printing: continuing would
# leave tissue_df undefined (or stale from an earlier run) for the cells below.
try:
    conn = psycopg2.connect(host=host, database=database, user=user, password=password)
except psycopg2.Error as e:
    print("Unable to connect to Serratus 🗻")
    print(e)
    raise

print("Successfully connected to Serratus 🗻")

# try/finally guarantees the connection is closed even when the query fails.
# (psycopg2's `with conn:` only ends the transaction; it does not close the connection.)
try:
    # select all records from the biosample_tissue table
    query = """
        SELECT * FROM biosample_tissue;
    """
    print("Running query...")
    # load the data into a Pandas DataFrame
    # NOTE(review): pandas emits a UserWarning for a raw psycopg2 connection here;
    # it recommends passing a SQLAlchemy connectable instead.
    tissue_df = pd.read_sql(query, conn)
finally:
    conn.close()

Successfully connected to Serratus 🗻
Running query...


  tissue_df = pd.read_sql(query, conn)


In [11]:
# display the loaded biosample_tissue table
tissue_df

Unnamed: 0,biosample_id,srs_id,source,text,tissue,bto_id
0,SAMEA104642966,ERS2244349,description,individual plants were grown in a greenhouse f...,plant,BTO_0001226
1,SAMEA104642967,ERS2244350,biological material preprocessing,ssd derived material from a single seed of a s...,cap,BTO_0000172
2,SAMEA104642967,ERS2244350,biological material preprocessing,ssd derived material from a single seed of a s...,seed,BTO_0001228
3,SAMEA104642967,ERS2244350,plant anatomical entity,leaf,seedling,BTO_0000713
4,SAMEA104642967,ERS2244350,plant structure development stage,seedling,leaf,BTO_0001481
...,...,...,...,...,...,...
21628226,SAMEA104642965,ERS2244348,biological material preprocessing,ssd derived material from a single seed of a s...,seed,BTO_0001228
21628227,SAMEA104642965,ERS2244348,plant anatomical entity,leaf,seedling,BTO_0000713
21628228,SAMEA104642965,ERS2244348,plant structure development stage,seedling,leaf,BTO_0001481
21628229,SAMEA104642965,ERS2244348,description,individual plants were grown in a greenhouse f...,plant,BTO_0001226


----
# Retrieve random samples from dataframe
- want 30000 random biosample ids from the `biosample_tissue` table
    - dataframe will have more than 30000 rows since biosamples can have multiple tissues
    - use `random.sample` from Python's standard library (with a fixed seed) to draw the ids, then filter the dataframe with `isin`
- random samples may have false positives...


In [14]:
# distinct biosample ids, in order of first appearance in tissue_df
biosample_ids = tissue_df['biosample_id'].unique().tolist()
n = len(biosample_ids)
print(f"Found {n} unique biosample ids")

Found 17675425 unique biosample ids


In [16]:
# draw 30000 distinct biosample ids; the fixed seed repeats the same draw
# whenever biosample_ids arrives in the same order
# NOTE(review): the upstream query has no ORDER BY, so row order (and therefore
# this sample) is not guaranteed to be stable across runs; consider sorting the ids.
import random
random.seed(42)
random_biosample_ids = random.sample(biosample_ids, k=30000)

In [18]:
# keep every row whose biosample_id was drawn (a biosample can contribute several rows)
tissue_df_subset = tissue_df.loc[tissue_df['biosample_id'].isin(random_biosample_ids)]
tissue_df_subset

Unnamed: 0,biosample_id,srs_id,source,text,tissue,bto_id
1513,SAMEA104643266,ERS2244649,biological material preprocessing,ssd derived material from a single seed of a s...,cap,BTO_0000172
1514,SAMEA104643266,ERS2244649,biological material preprocessing,ssd derived material from a single seed of a s...,seed,BTO_0001228
1515,SAMEA104643266,ERS2244649,plant anatomical entity,leaf,seedling,BTO_0000713
1516,SAMEA104643266,ERS2244649,plant structure development stage,seedling,leaf,BTO_0001481
1517,SAMEA104643266,ERS2244649,description,individual plants were grown in a greenhouse f...,plant,BTO_0001226
...,...,...,...,...,...,...
21625866,SAMEA104642498,ERS2243881,biological material preprocessing,ssd derived material from a single seed of a s...,cap,BTO_0000172
21625867,SAMEA104642498,ERS2243881,biological material preprocessing,ssd derived material from a single seed of a s...,seed,BTO_0001228
21625868,SAMEA104642498,ERS2243881,plant anatomical entity,leaf,seedling,BTO_0000713
21625869,SAMEA104642498,ERS2243881,plant structure development stage,seedling,leaf,BTO_0001481


In [19]:
# write the sampled rows to 'tissue_subset.csv', leaving out the dataframe's row index
tissue_df_subset.to_csv(path_or_buf='tissue_subset.csv', index=False)