In [21]:
%pip install psycopg2-binary
%pip install pandas
%pip install requests

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


----
# Generating BioProject csv metadata files
- want to generate a csv file containing metadata for each BioProject
- BioSample csv files of metadata located on s3
    - Some BioSamples are missing from the individual_parser.py output due to parsing errors (TODO: find the parsing errors)
- loop through all parsed BioSample csvs and create a dataframe of BioSample and BioProject
    - resulting dataframe should be a superset of serratus BioProject and BioSamples
    - additional BioSamples with data not in serratus 

*Update 25/12/23:*
- fetch the mapping from the bioproject xml file
    - ```<Package>``` tag contains a single BioProject
    - ```<ArchiveID>``` contains the BioProject ID
    - ```<LocusTagPrefix biosample_id=''>``` contains the BioSample IDs associated with the BioProject



## BioProject to BioSample mapping
- use the BioProject xml file to find all BioSamples with an associated BioProject
- mapping extracted to dictionary and stored as csv file


In [22]:
import xml.sax

class BioProjectParser(xml.sax.ContentHandler):
    '''
    SAX handler that builds a BioProject -> BioSample mapping from a BioProject XML file.

    Every ``<Package>`` element is treated as one BioProject:
      - the ``accession`` attribute of ``<ArchiveID>`` supplies the BioProject ID
      - the ``biosample_id`` attribute of each ``<LocusTagPrefix>`` supplies a BioSample ID
    When ``</Package>`` is reached, the collected BioSample IDs are added to
    ``project_dict[bioproject_id]`` and the per-package state is reset.

    Parameters
    ----------
    project_dict : dict
        Caller-owned dictionary that is filled in place; maps
        BioProject ID (str) -> list of BioSample IDs (list of str).
        A BioProject without any ``biosample_id`` gets an empty list.
    '''

    def __init__(self, project_dict):
        # initialise the base handler state (e.g. the document locator)
        super().__init__()
        self.biosample_ids = []
        self.bioproject_id = ""
        self.project_dict = project_dict

    def startElement(self, name, attrs):
        '''Record the BioProject accession / BioSample ids found on opening tags.'''
        if name == "ArchiveID":
            # an <ArchiveID> without an accession must not abort the whole parse,
            # nor wipe an accession already seen for this package
            accession = attrs.get("accession")
            if accession:
                self.bioproject_id = accession
        elif name == "LocusTagPrefix":
            if "biosample_id" in attrs:
                self.biosample_ids.append(attrs["biosample_id"])

    def endElement(self, name):
        '''Flush the collected ids into ``project_dict`` when a package closes.'''
        if name == "Package":
            # packages without an accession are skipped: they would otherwise be
            # stored under the key "" and end up as rows with no BioProject id
            if self.bioproject_id:
                # extend instead of assign so a BioProject that appears in more
                # than one package keeps the samples from every occurrence
                self.project_dict.setdefault(self.bioproject_id, []).extend(self.biosample_ids)
            self.bioproject_id = ""
            self.biosample_ids = []
        

In [23]:
# Stream the BioProject XML through the SAX handler; the handler fills
# project_dict in place (BioProject id -> list of BioSample ids).
project_dict = {}
bioproject_handler = BioProjectParser(project_dict)
parser = xml.sax.make_parser()
parser.setContentHandler(bioproject_handler)
parser.parse("/home/ec2-user/workspace/data/bioproject.xml")


In [24]:
# unpack the dictionary into one (bioproject_id, biosample_id) row per pair and write to csv
import csv

# newline="" lets the csv module control line endings itself (as the csv docs
# require); the explicit encoding keeps the file independent of the locale
with open("/home/ec2-user/workspace/data/bioproject.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["bioproject_id", "biosample_id"])
    # bioprojects with an empty biosample list contribute no rows
    writer.writerows(
        (bioproject_id, biosample_id)
        for bioproject_id, biosample_ids in project_dict.items()
        for biosample_id in biosample_ids
    )

In [7]:
# load the BioProject -> BioSample mapping csv into a dataframe and display it
import pandas as pd

mapping_csv_path = "/home/ec2-user/workspace/data/bioproject.csv"
df = pd.read_csv(mapping_csv_path)
df

Unnamed: 0,bioproject_id,biosample_id
0,PRJNA3,SAMN02603966
1,PRJNA3,SAMN11044051
2,PRJNA3,SAMN11044052
3,PRJNA3,SAMN11044053
4,PRJNA3,SAMN11044054
...,...,...
2406643,PRJNA1056452,SAMN39098763
2406644,PRJNA1056462,SAMN39099393
2406645,PRJNA1056479,SAMN39100772
2406646,PRJNA1056518,SAMN39101789


In [2]:
# report how many distinct biosamples and bioprojects the mapping contains
for label, column in (("biosamples", "biosample_id"), ("bioprojects", "bioproject_id")):
    distinct_ids = df[column].unique()
    print(f"Number of {label}: {len(distinct_ids)}")

Number of biosamples: 2303205
Number of bioprojects: 204697


In [21]:
# inspect a single mapping row by position; the list index keeps the result a DataFrame
row_position = 612578
df.iloc[[row_position]]


Unnamed: 0,bioproject_id,biosample_id
612578,PRJNA248792,SAMN04574080


In [11]:
# show every mapping row that belongs to one BioProject
is_target_project = df["bioproject_id"] == 'PRJNA248335'
df.loc[is_target_project]


Unnamed: 0,bioproject_id,biosample_id
605821,PRJNA248335,SAMN02797828
605822,PRJNA248335,SAMN02797826


## Merging metadata csv files from the mapping
- for any BioProject that has all BioSample metadata csvs generated, merge into a single csv file and store on s3
- if any BioSample metadata csvs are missing, log an error and store the ids in a separate csv file


In [14]:
import pandas as pd 
import boto3
import logging

# set up logging to a file bioproject.log
logger = logging.getLogger()
# detach AND close handlers left over from a previous run of this cell so the
# old log file handle is released instead of leaking
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
fh = logging.FileHandler("/home/ec2-user/workspace/data/bioproject.log")
formatter = logging.Formatter("%(levelname)s - %(asctime)s - %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)
logger.setLevel(logging.INFO)

# read the BioProject -> BioSample mapping csv into a dataframe
df = pd.read_csv("/home/ec2-user/workspace/data/bioproject.csv")
bioproject_ids = df["bioproject_id"].unique()

# connect to s3
s3 = boto3.resource("s3")
bucket = s3.Bucket('serratus-biosamples')
logger.info("Connected to s3")

# track the number of bioprojects that are missing at least one biosample csv file
err_count = 0 

logger.info("Iterating through bioprojects\n")
# iterate through the bioprojects.
# groupby scans the mapping once instead of re-filtering the whole dataframe for
# every bioproject; sort=False keeps the order of first appearance, and rows
# without a bioproject id (NaN) are left out.
for bioproject_id, group in df.groupby("bioproject_id", sort=False)["biosample_id"]:
    # all the biosample ids for this bioproject
    biosample_ids = group.tolist()

    # dictionary to store data for each biosample in the bioproject
    # (biosample id -> {column name: value})
    bioproject_dict = {}

    # set when a biosample csv is missing/unusable; the bioproject is then skipped
    err = False

    # the merged file is only written when every biosample csv could be read
    for biosample_id in biosample_ids:
        # get the biosample csv file from s3
        key = f"biosamples_csv/{biosample_id}.csv"
        try:
            body = bucket.Object(key).get()["Body"].read().decode("utf-8")
        except s3.meta.client.exceptions.NoSuchKey:
            logger.error(f"{biosample_id}.csv does not exist")
            err = True
            break
        except Exception as e:
            # network/permission/decoding problems are not "missing" files;
            # report what actually happened but keep the long run alive
            logger.error(f"{biosample_id}.csv could not be read: {e}")
            err = True
            break

        # a biosample csv is expected to hold a header line followed by one data line
        lines = body.split("\n")
        if len(lines) < 2 or not lines[1]:
            logger.error(f"{biosample_id}.csv has no data row")
            err = True
            break

        # get the column names and the values
        col_list = lines[0].split(",")
        data_list = lines[1].split(",")
        if len(col_list) != len(data_list):
            # zip() below silently truncates to the shorter list
            logger.warning(
                f"{biosample_id}.csv has {len(col_list)} columns but {len(data_list)} values"
            )
        # convert the lists to a dictionary and store it for this biosample
        bioproject_dict[biosample_id] = dict(zip(col_list, data_list))

    # only the first failing biosample of a bioproject is logged; skip the bioproject
    if err:
        err_count += 1
        continue

    # bioproject_dict now contains the data for all biosamples in the bioproject.
    # union of all column names in order of first appearance
    headers = list(dict.fromkeys(
        header for data_dict in bioproject_dict.values() for header in data_dict
    ))

    # log if the headers are not the same for all biosamples: every biosample's
    # columns are a subset of `headers`, so a different count means a different set
    if any(len(data_dict) != len(headers) for data_dict in bioproject_dict.values()):
        logger.warning(f"Columns are not all the same for {bioproject_id}")

    # create a csv string for the bioproject: header line, then one line per
    # biosample with "" for the columns that biosample does not have
    csv_lines = [",".join(headers)]
    csv_lines.extend(
        ",".join(data_dict.get(header, "") for header in headers)
        for data_dict in bioproject_dict.values()
    )
    csv_str = "\n".join(csv_lines) + "\n"

    # write the csv string to s3
    try:
        key = f"bioprojects_csv/{bioproject_id}.csv"
        s3.Object("serratus-biosamples", key).put(Body=csv_str)
        logger.info(f"Successfully wrote {key}")
    except Exception as e:
        logger.error(f"Error writing {bioproject_id}.csv: {str(e)}")


logger.info(f"Number of bioprojects missing >= 1 biosample csv file: {err_count}")
logger.info("Done")

# started running at 11:02 AM on 31 Dec 2023 GMT +1


----
## Parsing additional BioSample metadata
- biosample csv files that don't exist were highlighted in the log file
    - ONLY THE FIRST MISSING BIOSAMPLE PER BIOPROJECT IS LOGGED -> still will have missing biosamples
- missing biosamples can be extracted from the log file and parsed again using code from individual_parser.py
    - missing biosamples logged with ERROR - timestamp - biosample_id.csv does not exist
    

- Find missing biosamples from log file and add to missing_biosamples.txt

```grep "^ERROR" bioproject.log | sed 's/^ERROR.* \([^ ]*\).csv.*/\1/' > missing_biosamples.txt```

In [None]:
import boto3
import logging
import bs2csv
import xml.sax
import xmltodict
import re

# set up logging to a file missing_biosamples.log
logger = logging.getLogger()
# detach AND close handlers left over from earlier cells so their log files are released
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
fh = logging.FileHandler("/home/ec2-user/workspace/mwas/missing_biosamples.log")
formatter = logging.Formatter("%(levelname)s - %(asctime)s - %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)
logger.setLevel(logging.INFO)

# accession attribute of the opening <BioSample ...> tag that follows the xml
# declaration; [^>]* keeps the match inside that tag and [^"]+ stops the capture
# at the closing quote of the accession value
accession_pattern = re.compile(
    r'^<\?xml version="1.0" encoding="UTF-8"\?>\n<BioSample[^>]*\baccession="([^"]+)"'
)

s3 = boto3.resource('s3')
bucket  = s3.Bucket('serratus-biosamples')

# missing_biosamples.txt holds one biosample id per line (extracted from bioproject.log)
with open("/home/ec2-user/workspace/data/missing_biosamples.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # a blank line would otherwise request 'biosamples_split/.xml'
            continue
        key = f'biosamples_split/{line}.xml'

        # fetch the split biosample xml from s3
        try:
            obj = bucket.Object(key).get()['Body'].read().decode('utf-8')
        except s3.meta.client.exceptions.NoSuchKey:
            logger.warning(f'{line}.xml does not exist')
            continue
        except Exception as e:
            # anything else (network, permissions, decoding) is not a missing file
            logger.error(f'Error reading {line}.xml: {e}')
            continue

        # get the accession from the xml string
        match = accession_pattern.search(obj)
        if match is None:
            logger.warning(f'{line}.xml has no BioSample accession')
            continue
        accession = match.group(1)

        # parse the xml; the handler fills content_dict in place
        content_dict = {}
        handler = bs2csv.BioSamplesParser(content_dict)
        try:
            xml.sax.parseString(obj, handler)
        except Exception as e:
            logger.error(f'Error parsing {line}.xml: {e}')
            continue

        # build a two-line csv: header line, then the values for this biosample
        try:
            csv_str = 'biosample_id,' + ','.join(content_dict.keys()) + '\n'
            csv_str += accession + ',' + ','.join(content_dict.values()) + '\n'
        except Exception as e:
            # e.g. a non-string value in content_dict makes str.join raise TypeError
            logger.error(f'Error creating csv string for {accession}: {e}')
            continue

        # store the csv next to the other biosample csv files
        try:
            bucket.put_object(Key='biosamples_csv/' + accession + '.csv', Body=csv_str)
            logger.info(f'Successfully wrote {accession}.csv')
        except Exception as e:
            logger.error(f'Error writing {accession}.csv: {e}')
            continue

# started running at 12:20 PM on 1 Jan 2024 GMT +1

- new xml files are not included in the folder on s3 and are missing 
- errors (22) come from issues in creating the csv string to write to file
    - extract error ids and write to s3 with updated csv string
    

----
# Generating missing BioProject csv metadata files
- bioproject xml does not contain biosample information for many bioprojects
- use the generated biosample csv files to find the mapping between bioproject and biosample

In [None]:
%%bash
# Shell commands need the %%bash cell magic to run from a Python kernel.
set -euo pipefail
# List the split BioSample XML objects on s3 and keep only the object names:
# the last whitespace-separated field of each listing line, restricted to *.xml
# so the prefix placeholder / sub-prefix lines are dropped without blindly
# deleting the first line.
aws s3 ls serratus-biosamples/biosamples_split/ \
    | awk '$NF ~ /\.xml$/ {print $NF}' > biosample_s3_xml