In [157]:
import re
import xml.etree.ElementTree as ET

import pandas as pd
import numpy as np

In [158]:
# Lookup from a treatment code to a readable treatment label.
# The code is the part of a sample id that follows the patient id
# (three capital letters + three digits), with hyphens removed; codes that are
# not listed here are reported as '.' when the metadata records are built.
# obtained from https://www.ebi.ac.uk/ena/data/view/SRS2412440-SRS2412443
TREATMENT_DICT = {
    'OP': 'operative',
    '2': '2cycleschemo',
    '0': 'pre',
    'Blood': 'blood'
}

In [159]:
# Input file locations for this notebook.
# NOTE(review): both are absolute paths on one user's machine; point them at
# local copies of the files before running.
# XML metadata file.
xml_metadata_fp = '/Users/erikstorrs/Downloads/ena.xml'
# Tab-separated file table (read with pandas further down).
file_table_fp = '/Users/erikstorrs/Downloads/PRJNA396019.txt'

#### Read in XML metadata

In [160]:
# Parse the XML metadata file. Use the configured `xml_metadata_fp` instead of
# repeating the path literal, so editing the config cell really changes the
# file that is read.
tree = ET.parse(xml_metadata_fp)
# Root element of the parsed document; its direct children are iterated later.
root = tree.getroot()

In [161]:
# Build a lookup from experiment accession number to a flat metadata record,
# one record per direct child element of the XML root.
accession_number_to_metadata = {}

for experiment in root:
    accession_number = experiment.attrib['accession']

    # Library strategy and library name live under DESIGN/LIBRARY_DESCRIPTOR.
    library = experiment.find('DESIGN/LIBRARY_DESCRIPTOR')
    strategy = library.find('LIBRARY_STRATEGY').text
    name = library.find('LIBRARY_NAME').text

    if 'cell' in name:
        # Name contains "cell"/"cells": the text before the last occurrence is
        # the sample id and the text after it is parsed as an integer cell id.
        sample_id = re.sub(r'^(.*)cells?.*$', r'\1', name)
        cell_id = int(re.sub(r'^.*cells?(.*)$', r'\1', name))
    else:
        # No cell suffix: the whole name is the sample id and there is no cell id.
        sample_id = name
        # Use np.nan; the capitalised alias np.NaN was removed in NumPy 2.0.
        cell_id = np.nan

    # Sample ids start with a patient id (three capital letters + three digits);
    # whatever follows, minus hyphens, is the treatment code.
    treatment_id = re.sub(r'^[A-Z]{3}[0-9]{3}(.*)$', r'\1', sample_id).replace('-', '')
    patient_id = re.sub(r'^([A-Z]{3}[0-9]{3}).*$', r'\1', sample_id)
    accession_number_to_metadata[accession_number] = {
        'strategy': strategy,
        'name': name,
        'cell_id': cell_id,
        'sample_id': sample_id,
        'patient_id': patient_id,
        # Treatment codes without an entry in TREATMENT_DICT are recorded as '.'.
        'treatment': TREATMENT_DICT.get(treatment_id, '.'),
        'experiment_accession_number': accession_number
    }


#### Link XML metadata with FTP file paths from the tabular file

In [162]:
# Load the tab-delimited file table into a DataFrame.
df = pd.read_csv(file_table_fp, delimiter='\t')
# Keep the preview as the cell's last expression so the notebook renders it.
df.head(5)

Unnamed: 0,study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,tax_id,scientific_name,instrument_model,library_layout,fastq_ftp,fastq_galaxy,submitted_ftp,submitted_galaxy,sra_ftp,sra_galaxy,cram_index_ftp,cram_index_galaxy
0,PRJNA396019,SAMN07457099,SRS2412441,SRX3067795,SRR5906250,9606,Homo sapiens,Illumina HiSeq 4000,PAIRED,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/000/SRR590...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/000/SRR590...,,,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/000/SRR5906250,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/000/SRR5906250,,
1,PRJNA396019,SAMN07457098,SRS2412440,SRX3067794,SRR5906251,9606,Homo sapiens,Illumina HiSeq 4000,PAIRED,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/001/SRR590...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/001/SRR590...,,,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/001/SRR5906251,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/001/SRR5906251,,
2,PRJNA396019,SAMN07457097,SRS2412442,SRX3067793,SRR5906252,9606,Homo sapiens,Illumina HiSeq 4000,PAIRED,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/002/SRR590...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/002/SRR590...,,,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/002/SRR5906252,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/002/SRR5906252,,
3,PRJNA396019,SAMN07457096,SRS2412443,SRX3067792,SRR5906253,9606,Homo sapiens,Illumina HiSeq 4000,PAIRED,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/003/SRR590...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/003/SRR590...,,,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/003/SRR5906253,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/003/SRR5906253,,
4,PRJNA396019,SAMN07459290,SRS2414194,SRX3069884,SRR5908305,9606,Homo sapiens,Illumina HiSeq 4000,PAIRED,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/005/SRR590...,ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/005/SRR590...,,,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/005/SRR5908305,ftp.sra.ebi.ac.uk/vol1/srr/SRR590/005/SRR5908305,,


In [163]:
# Copy each table row's FASTQ FTP location onto the metadata record that has
# the same experiment accession number (a KeyError means the table lists an
# accession that was not present in the XML metadata).
for accession_number, fastq_ftp in zip(df['experiment_accession'], df['fastq_ftp']):
    accession_number_to_metadata[accession_number]['fastq_ftp_path'] = fastq_ftp

#### write output

In [164]:
# Write a tab-separated table with a header row and one row per RNA-Seq
# record, ordered by sample id, then strategy, then cell id.
columns = ['patient_id', 'sample_id', 'experiment_accession_number', 'cell_id',
           'strategy', 'treatment', 'fastq_ftp_path']
sorted_metadatas = sorted(accession_number_to_metadata.values(),
        key=lambda x: (x['sample_id'], x['strategy'], x['cell_id']))

# Open the file only after sorting succeeded, and use a context manager so the
# handle is closed even if a record lacks one of the columns (for example, no
# 'fastq_ftp_path' was attached to it) and the write loop raises.
with open('cell_fastq_locations.tsv', 'w') as f:
    f.write('\t'.join(columns) + '\n')
    for metadata in sorted_metadatas:
        # Only RNA-Seq records are exported; other strategies are skipped.
        if metadata['strategy'] == 'RNA-Seq':
            f.write('\t'.join([str(metadata[c]) for c in columns]) + '\n')