In [1]:
%pip install psycopg2-binary
%pip install pandas
%pip install requests

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import xml.sax
import pickle
import pandas as pd

# Dictionary to store full mapping between BioProjectID and BioSampleID
bioproject_mapping_dict = {}

----
# Generating BioSample to BioProject mapping
- complete mapping from BioSampleID to BioProjectID does not exist in NCBI
- can be extracted from 3 separate locations:
    - BioProject XML file
    - BioSample XML file
    - SRA BigQuery metadata

In [3]:
# updated mapping folder here when re-running the notebook
mapping_folder = 'pickles/v1.1'

## BioProject XML
- Some BioProject XMLs store the BioSampleIDs of their associated BioSamples
    - ```<Package>``` tag contains a single BioProject
    - ```<ArchiveID>``` contains the BioProject ID
    - ```<LocusTagPrefix biosample_id=''>``` contains the BioSample IDs associated with the BioProject

In [4]:
bioproject_mapping_dict_1 = {}
bioproject_xml_path = "/home/ec2-user/workspace/data/bioproject.xml"

class BioProjectParser(xml.sax.ContentHandler):
    '''
    SAX parser to extract BioSample IDs from a BioProject XML file.
    '''

    def __init__(self, project_dict):
        self.biosample_ids = []
        self.bioproject_id= ""
        self.project_dict = project_dict

    def startElement(self, name, attrs):
        if name == "ArchiveID":
            self.bioproject_id = attrs["accession"]
        elif name == "LocusTagPrefix":
            if "biosample_id" in attrs:
                self.biosample_ids.append(attrs["biosample_id"])

    def endElement(self, name):
        if name == "Package":
            self.project_dict[self.bioproject_id] = self.biosample_ids
            self.bioproject_id = ""
            self.biosample_ids = []


# Try to read the pickled dictionary from disk
try:
    with open(f"{mapping_folder}/bioproject_mapping_dict_1.pickle", "rb") as f:
        bioproject_mapping_dict_1 = pickle.load(f)
    print("Loaded pickled dictionary from disk")
except:
    print("No pickled dictionary found, parsing XML")
    parser = xml.sax.make_parser()
    parser.setContentHandler(BioProjectParser(bioproject_mapping_dict_1))
    parser.parse(bioproject_xml_path)

    with open(f"{mapping_folder}/bioproject_mapping_dict_1.pickle", "wb") as f:
        pickle.dump(bioproject_mapping_dict_1, f)

    print("Pickled dictionary to disk")

Loaded pickled dictionary from disk


In [5]:
print('Stage 1: ')
print(f'Number of BioProjects: {len(bioproject_mapping_dict_1)}')
print(f'Number of mapped BioSamples: {sum([len(v) for v in bioproject_mapping_dict_1.values()])}')
print(f'Number of unique BioSamples: {len(set([item for sublist in bioproject_mapping_dict_1.values() for item in sublist]))}')

Stage 1: 
Number of BioProjects: 822292
Number of mapped BioSamples: 2717532
Number of unique BioSamples: 2601705


- SRA metadata only has 451347 unique BioProjects, BioProject XML should not have more than this
    - likely have BioProjects that have been removed

## BioSample XML
- some BioSample records contain entrez links to the corresponding BioProject (17,700,598 / 37,158,431)
    - small number contain just the uid of the BioProject (17,541 / 17,700,598)
        - cannot be used reliably since complete ID can be PRJ(NA/EB/DB)
    - majority of containing records have the full BioProjectID (17,683,057 / 17,700,598)
        - stored under the `label` attribute of the `Link` tag

In [6]:
bioproject_mapping_dict_2 = {}
biosample_xml_path = "/home/ec2-user/workspace/data/biosample_set.xml"

class BioSampleParser(xml.sax.ContentHandler):
    '''
    SAX parser to extract BioProject IDs from a BioSample XML file.
    '''

    def __init__(self, project_dict):
        self.bioproject_ids = []
        self.biosample_id= ""
        self.project_dict = project_dict

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Link':
            # check if the link is to a BioProject and extract the ID if possible
            isBioproject = 'target' in attrs and attrs['target'] == 'bioproject'
            hasLabel = 'label' in attrs
            if isBioproject and hasLabel:
                self.bioproject_ids.append(attrs['label'])

    def endElement(self, name):
        if name == "BioSample":
            for bioproject_id in self.bioproject_ids:
                # append or add to the dictionary
                if bioproject_id in self.project_dict:
                    self.project_dict[bioproject_id].append(self.biosample_id)
                else:
                    self.project_dict[bioproject_id] = [self.biosample_id]
            self.bioproject_ids = []
            self.biosample_id = ""

# Try to read the pickled dictionary from disk
try:
    with open(f"{mapping_folder}/bioproject_mapping_dict_2.pickle", "rb") as f:
        bioproject_mapping_dict_2 = pickle.load(f)
    print("Loaded pickled dictionary from disk")
except:
    print("No pickled dictionary found, parsing XML")
    parser = xml.sax.make_parser()
    parser.setContentHandler(BioSampleParser(bioproject_mapping_dict_2))
    parser.parse(biosample_xml_path)

    with open(f'{mapping_folder}/bioproject_mapping_dict_2.pickle', 'wb') as f:
        pickle.dump(bioproject_mapping_dict_2, f)

    print("Pickled dictionary to disk")

Loaded pickled dictionary from disk


In [7]:
print('Stage 2: ')
print(f'Number of BioProjects: {len(bioproject_mapping_dict_2)}')
print(f'Number of mapped BioSamples: {sum([len(v) for v in bioproject_mapping_dict_2.values()])}')
print(f'Number of unique BioSamples: {len(set([item for sublist in bioproject_mapping_dict_2.values() for item in sublist]))}')

Stage 2: 
Number of BioProjects: 468882
Number of mapped BioSamples: 18873209
Number of unique BioSamples: 18758458


## SRA BigQuery metadata
- BigQuery table contains metadata for all runs in the SRA
    - table downloaded on 2024-01-23
- database queried for SRA accession, BioSampleID, and BioProjectID:
    - `` SELECT acc, biosample, bioproject FROM `nih-sra-datastore.sra.metadata` ``
    - downloaded as two separate files

In [None]:
cp data/sra_metadata_bp_map_1.csv data/sra_metadata_bp_map.csv
tail -n +2 data/sra_metadata_bp_map_2.csv >> data/sra_metadata_bp_map.csv

In [8]:
# iterate through the file and add all mappings to a dictionary
bioproject_mapping_dict_3 = {}
sra_metadata_bp_map_path = "/home/ec2-user/workspace/mwas/data/sra_metadata_bp_map.csv"

try:
    with open(f"{mapping_folder}/bioproject_mapping_dict_3.pickle", "rb") as f:
        bioproject_mapping_dict_3 = pickle.load(f)
    print("Loaded pickled dictionary from disk")
except:
    df = pd.read_csv(sra_metadata_bp_map_path)
    # remove any null values as we cannot any mapping from them
    df = df.dropna()

    def get_mapping(row):
        # helper function to extract the biosample to bioproject mapping
        bioproject_id = row['bioproject']
        biosample_id = row['biosample']

        if bioproject_id in bioproject_mapping_dict_3:
            bioproject_mapping_dict_3[bioproject_id].append(biosample_id)
        else:
            bioproject_mapping_dict_3[bioproject_id] = [biosample_id]

    df.apply(get_mapping, axis=1)

    # pickle the output
    with open(f'{mapping_folder}/bioproject_mapping_dict_3.pickle', 'wb') as f:
        pickle.dump(bioproject_mapping_dict_3, f)


Loaded pickled dictionary from disk


In [9]:
print('Stage 3: ')
print(f'Number of BioProjects: {len(bioproject_mapping_dict_3)}')
print(f'Number of mapped BioSamples: {sum([len(v) for v in bioproject_mapping_dict_3.values()])}')
print(f'Number of unique BioSamples: {len(set([item for sublist in bioproject_mapping_dict_3.values() for item in sublist]))}')

Stage 3: 
Number of BioProjects: 450725
Number of mapped BioSamples: 31130322
Number of unique BioSamples: 21031084


In [10]:
# check for bioprojects that are in bioproject xml but not the SRA metadata
bp_xml_ids = set(bioproject_mapping_dict_1)
print(f'Unique BioProject Ids from bioproject.xml: {len(bp_xml_ids)}')  

sra_metadata_ids = set(bioproject_mapping_dict_3)
print(f'Unique BioProject Ids from sra metadata: {len(sra_metadata_ids)}\n') 

print(f'Number of Ids in the xml but not the metadata: {len(bp_xml_ids - sra_metadata_ids)}')
print(f'Number of Ids in the metadata but not the xml: {len(sra_metadata_ids - bp_xml_ids)}')



Unique BioProject Ids from bioproject.xml: 822292
Unique BioProject Ids from sra metadata: 450725

Number of Ids in the xml but not the metadata: 372204
Number of Ids in the metadata but not the xml: 637


- want to merge all three sources to get as complete a mapping as possible
- merged dictionary should have same BioProject to BioSample structure
    - can be unwrapped and written to csv for parsing later

In [12]:
try:
    with open(f"{mapping_folder}/bioproject_mapping_dict.pickle", "rb") as f:
        bioproject_mapping_dict = pickle.load(f)
    print("Loaded pickled dictionary from disk")
except:
    # add everything to bioproject_mapping_dict 
    for bioproject_id, biosample_list in bioproject_mapping_dict_1.items():
        if bioproject_id in bioproject_mapping_dict:
            bioproject_mapping_dict[bioproject_id].update(set(biosample_list))
        else:
            bioproject_mapping_dict[bioproject_id] = set(biosample_list)

    for bioproject_id, biosample_list in bioproject_mapping_dict_2.items():
        if bioproject_id in bioproject_mapping_dict:
            bioproject_mapping_dict[bioproject_id].update(set(biosample_list))
        else:
            bioproject_mapping_dict[bioproject_id] = set(biosample_list)

    for bioproject_id, biosample_list in bioproject_mapping_dict_3.items():
        if bioproject_id in bioproject_mapping_dict:
            bioproject_mapping_dict[bioproject_id].update(set(biosample_list))
        else:
            bioproject_mapping_dict[bioproject_id] = set(biosample_list)

    with open(f'{mapping_folder}/bioproject_mapping_dict.pickle', 'wb') as f:
        pickle.dump(bioproject_mapping_dict, f)

In [13]:
print('Total BioProject mapping stats: ')
print(f'Number of BioProjects: {len(bioproject_mapping_dict)}')
print(f'Number of mapped BioSamples: {sum([len(v) for v in bioproject_mapping_dict.values()])}')
print(f'Number of unique BioSamples: {len(set([item for sublist in bioproject_mapping_dict_3.values() for item in sublist]))}')


Total BioProject mapping stats: 
Number of BioProjects: 825023
Number of mapped BioSamples: 27251791
Number of unique BioSamples: 21031084


----
# Merging BioSample csv files


In [34]:
import requests
import boto3
import re
bioproject_ids = ['PRJNA338276']
bioproject_mapping_dict['PRJNA338276']

# connect to s3
s3 = boto3.resource("s3")
bucket = s3.Bucket('serratus-biosamples')

In [49]:
def apply_quote_preprocess(data):
    # helper function to preprocess quotation marks for a csv string
    # fist check if the data is not null
    try:
        # escape all quotes
        return re.sub(r'["]', '""', data)
    except:
        return data


bioproject_data = []
col_set = set()

for biosample_id in bioproject_mapping_dict[bioproject_ids[0]]:
    key = f"biosamples_csv/{biosample_id}.csv"
    try:
        obj = bucket.Object(key).get()["Body"]#.read().decode("utf-8")#.split("\n")
    except Exception as e:
        print(e)
        continue

    # convert to a dataframe
    df = pd.read_csv(obj)
    df = df.map(apply_quote_preprocess) # add proper escaping around all quotes
    # convert to a dictionary
    bioproject_dict = df.to_dict(orient='list') # can orient as list since the csvs are each 1 row

    # add the columns to the set
    col_set.update(bioproject_dict.keys())
    # add the dictionary to the list
    bioproject_data.append(bioproject_dict)

# generate a csv string from the data
csv_str = ','.join(col_set) + '\n'
for row in bioproject_data:
    csv_str += ','.join([f'"{row[col][0]}"' if col in row else '' for col in col_set]) + '\n'

# write the csv string to a file
with open('bioproject_data.csv', 'w') as f:
    f.write(csv_str)

    
