In [1]:
import pandas as pd
import requests
import numpy as np
import bs4
import re
from pathlib import Path
from collections import Counter

# Gouliouris et al. 2018 UK Enterococcus Paper Metadata

## Add sample and project accessions to Gouliouris 2018 metadata

Starting from table S2 in https://mbio.asm.org/content/9/6/e01780-18 which contains `run_accessions` for deposited raw sequencing data in ENA, use the ENA API to automatically add `study_accession` and `sample_accession`

In [None]:
# table S2 from https://mbio.asm.org/content/9/6/e01780-18
gouliouris_2018_uk_metadata = pd.read_csv('datasets/inline-supplementary-material-3.csv', sep='\t', skiprows=1)
gouliouris_2018_uk_metadata = gouliouris_2018_uk_metadata.rename(columns={'Accession number': 'Run_accession',
                                                                          'Isolate ID': 'Isolate_name',
                                                                          'ST': 'Sequence_Type'})

# only accessions starting with 'ERR' are looked up; na=False treats a missing
# accession as "not an ERR accession" instead of producing an unusable NaN mask
is_err_accession = gouliouris_2018_uk_metadata['Run_accession'].str.startswith('ERR', na=False)
run_accessions = gouliouris_2018_uk_metadata.loc[is_err_accession, 'Run_accession'].values

# break into 500 accession chunks to more efficiently query the API
api_responses = ""
for chunk_start in range(0, len(run_accessions), 500):
    run_accession_chunk = ",".join(run_accessions[chunk_start:chunk_start + 500])
    api_response = requests.get(f"https://www.ebi.ac.uk/ena/browser/api/xml/{run_accession_chunk}")
    # fail loudly: an error page would otherwise be parsed as "no records" and
    # every accession in the chunk would silently end up as NaN
    api_response.raise_for_status()
    api_responses += api_response.text

# parse combined xml records into a soup for easier traversal
api_response_soup = bs4.BeautifulSoup(api_responses, 'lxml')


def add_study_and_sample_metadata(row, api_response_soup):
    """
    Add the ENA study and sample accessions for one metadata row.

    The record for the row's run is looked up in the parsed ENA XML by its
    ``accession`` attribute, and the accessions are read from the record's
    ``xref_link`` cross-references (``ENA-STUDY`` and ``ENA-SAMPLE``).

    Parameters
    ----------
    row : mapping (e.g. one row of the metadata DataFrame)
        Must provide ``row['Run_accession']``. It is updated in place.
    api_response_soup : parsed ENA API response
        Object supporting ``find(accession=...)`` whose result supports
        ``find_all('xref_link')``.

    Returns
    -------
    The same ``row`` with ``Study_accession`` and ``Sample_accession`` set.
    Either is NaN when the run has no record or the record lacks that
    cross-reference.
    """
    # default both fields so every returned row carries the same keys, even
    # when a record is missing or only has one of the two cross-references
    row['Study_accession'] = np.nan
    row['Sample_accession'] = np.nan

    run_soup = api_response_soup.find(accession=row['Run_accession'])
    if run_soup:
        for xref in run_soup.find_all('xref_link'):
            database = xref.db.text
            if database == 'ENA-STUDY':
                row['Study_accession'] = xref.id.text.strip()
            elif database == 'ENA-SAMPLE':
                row['Sample_accession'] = xref.id.text.strip()
    return row

# annotate every row with the study and sample accession of its run
gouliouris_2018_uk_metadata = gouliouris_2018_uk_metadata.apply(
    add_study_and_sample_metadata,
    axis=1,
    args=(api_response_soup,),
)

Let's grab the collection date to be consistent with Raven 2016

In [None]:
# every row that was assigned an ENA sample accession
sample_accessions = gouliouris_2018_uk_metadata['Sample_accession'].dropna().values

# start from an empty buffer: appending to the XML collected by an earlier cell
# would make this cell depend on hidden state and grow on every re-run
api_responses = ""
for chunk_start in range(0, len(sample_accessions), 500):
    sample_accession_chunk = sample_accessions[chunk_start:chunk_start + 500]
    sample_accession_chunk_text = ",".join(sample_accession_chunk)
    api_response = requests.get(f"https://www.ebi.ac.uk/ena/browser/api/xml/{sample_accession_chunk_text}")
    # fail loudly rather than parsing an error page as "no collection dates"
    api_response.raise_for_status()
    api_responses += api_response.text

# parse combined xml records into a soup for easier traversal
api_response_soup = bs4.BeautifulSoup(api_responses, 'lxml')

# sample accession -> text of the value element following its 'collection_date' tag
collection_date_dict = {}
for sample in api_response_soup.find_all('sample'):
    sample_acc = sample['accession']
    for tag in sample.find_all('tag'):
        if tag.text == 'collection_date':
            collection_date_dict[sample_acc] = tag.find_next('value').text

# samples without a recorded collection date (or without an accession) become NaN
gouliouris_2018_uk_metadata['Isolation_date'] = gouliouris_2018_uk_metadata['Sample_accession'].map(collection_date_dict)

Then let's tidy up the UK metadata sheet by renaming some fields and adding some extra metadata to make our life easier merging in with the alberta data later. 

In [None]:
# harmonise the UK column names with the other datasets
# ('Origin' and 'Location' already carry their final names)
gouliouris_2018_uk_metadata = gouliouris_2018_uk_metadata.rename(columns={
    'BAPS group': 'BAPS_group',
    'Ampicillin resistance': 'Ampicillin',
    'Vancomycin resistance': 'Vancomycin',
})

# turn the deduplication flag into a status message, then drop the flag column
removed_in_deduplication = gouliouris_2018_uk_metadata['Removed in deduplication'] == 'Removed'
gouliouris_2018_uk_metadata.loc[removed_in_deduplication, 'Metadata_status'] = 'Removed for deduplication in original paper (10.1128/mBio.01780-18)'
gouliouris_2018_uk_metadata = gouliouris_2018_uk_metadata.drop(columns='Removed in deduplication')

# constant columns: every isolate in this paper is E. faecium from the UK
gouliouris_2018_uk_metadata['Species'] = 'Enterococcus faecium'
gouliouris_2018_uk_metadata['Country/Province'] = 'United Kingdom'

# reference strains are not part of the study isolates
is_reference_strain = gouliouris_2018_uk_metadata['Origin'] == 'Reference strain'
gouliouris_2018_uk_metadata = gouliouris_2018_uk_metadata.loc[~is_reference_strain]

## Add read data for Gouliouris 2018 

Then parse the read data for all the reads re-downloaded by FM (instead of those from HS archived on `deivos.research.cs.dal.ca`).

In [None]:
def get_reads(read_dir):
    """
    Collect the paired-end read files stored under ``read_dir``.

    ``read_dir`` is expected to contain one sub-directory per run, named after
    the run accession and holding ``<accession>_1.fastq.gz`` and
    ``<accession>_2.fastq.gz``. Entries that are not directories are ignored.

    Parameters
    ----------
    read_dir : str or Path
        Directory containing the per-run sub-directories.

    Returns
    -------
    pandas.DataFrame
        Columns ``Run_accession``, ``R1`` and ``R2`` (file paths as strings),
        one row per run directory, ordered by directory path.

    Raises
    ------
    ValueError
        If a run directory lacks either of its two read files.
    """
    reads_data = {'Run_accession': [], 'R1': [], 'R2': []}

    # the explicit is_dir() filter skips stray files on every Python version
    # (a trailing "/" in a glob pattern did not always restrict matches to
    # directories); sorting makes the row order reproducible
    run_dirs = sorted(entry for entry in Path(read_dir).glob("*") if entry.is_dir())

    for run_dir in run_dirs:
        run_accession = run_dir.name
        r1 = run_dir / f"{run_accession}_1.fastq.gz"
        r2 = run_dir / f"{run_accession}_2.fastq.gz"
        for read_file in (r1, r2):
            if not read_file.exists():
                raise ValueError(f"{read_file} is missing")

        reads_data['Run_accession'].append(run_accession)
        reads_data['R1'].append(str(r1))
        reads_data['R2'].append(str(r2))
    return pd.DataFrame(reads_data)

# locate the downloaded read files and attach them to the metadata by run accession;
# the outer join keeps rows that exist on only one side
gouliouris_2018_uk_reads = get_reads('genomes/gouliouris_2018_uk')

gouliouris_2018_uk_metadata_and_reads = gouliouris_2018_uk_metadata.merge(
    gouliouris_2018_uk_reads,
    how='outer',
    on='Run_accession',
    validate='one_to_one',
)

# Raven et al. 2016 UK Hospital Enterococcus Paper Metadata

## Add sample and project accessions to Raven 2016 metadata

Starting from table S1 in https://www.nature.com/articles/nmicrobiol201533#Sec11, which contains `run_accessions` for the raw sequencing data deposited in ENA, use the ENA API to automatically add `study_accession` and `sample_accession` (the same approach as used above for table S2 of https://mbio.asm.org/content/9/6/e01780-18).


In [None]:
raven_2016_uk_hospital_metadata = pd.read_excel('datasets/41564_2016_BFnmicrobiol201533_MOESM202_ESM.xlsx',
                                                skiprows=1,
                                                skipfooter=4)

# we are redoing genomics so just want phenotypic vanco resistance
# NOTE(review): 'VSE**' is recoded as resistant here, unlike plain 'VSE' —
# presumably this follows a footnote of the source table; confirm
vancomycin_phenotype_codes = {'VanA': 'R', 'VSE**': 'R', 'VanA***': 'R', 'VanB': 'R',
                              'VSE': 'S', 'Unknown': np.nan}
raven_2016_uk_hospital_metadata['Vancomycin'] = raven_2016_uk_hospital_metadata['Phenotypic vancomycin resistance'].replace(vancomycin_phenotype_codes)

location_codes = {'HA': 'Hospital', 'CA': 'Community', 'HCA': 'Health-Care Associated', 'Unknown': np.nan}
raven_2016_uk_hospital_metadata['Location'] = raven_2016_uk_hospital_metadata['Hospital (HA), community (CA) or health-care associated (HCA)'].replace(location_codes)

# keep only the part before the first '-' of the year field; .where() restores
# missing values, which astype(str) would otherwise turn into the string 'nan'
year_of_isolation = raven_2016_uk_hospital_metadata['Year of isolation']
raven_2016_uk_hospital_metadata['Isolation_date'] = (
    year_of_isolation.astype(str).str.split('-').str.get(0).where(year_of_isolation.notna())
)

# drop unneeded columns (especially as we are doing our own genomics)
raven_2016_uk_hospital_metadata = raven_2016_uk_hospital_metadata.drop(['Phenotypic vancomycin resistance',
                                                                        'Hospital (HA), community (CA) or health-care associated (HCA)',
                                                                        'Van transposon', 'Collection', 'Year of isolation'], axis=1)

# rename columns to align with the Gouliouris 2018 metadata
raven_2016_uk_hospital_metadata = raven_2016_uk_hospital_metadata.rename(columns={'Sequence ID': 'Run_accession',
                                                                                  'Source': 'Origin',
                                                                                  'ID in collection': 'Isolate_name',
                                                                                  'MLST type': 'Sequence_Type'})

raven_2016_uk_hospital_metadata['Country/Province'] = "United Kingdom"
raven_2016_uk_hospital_metadata['Species'] = "Enterococcus faecalis"

In [None]:
# break into 500 accession chunks to more efficiently query the API
api_responses = ""

run_accessions = raven_2016_uk_hospital_metadata.loc[:, 'Run_accession'].values
for chunk_start in range(0, len(run_accessions), 500):
    run_accession_chunk = ",".join(run_accessions[chunk_start:chunk_start + 500])
    api_response = requests.get(f"https://www.ebi.ac.uk/ena/browser/api/xml/{run_accession_chunk}")
    # fail loudly: an error page would otherwise be parsed as "no records" and
    # every accession in the chunk would silently end up as NaN
    api_response.raise_for_status()
    api_responses += api_response.text

# parse combined xml records into a soup for easier traversal
api_response_soup = bs4.BeautifulSoup(api_responses, 'lxml')


def add_study_and_sample_metadata(row, api_response_soup):
    """
    Add the ENA study and sample accessions for one metadata row.

    The record for the row's run is looked up in the parsed ENA XML by its
    ``accession`` attribute, and the accessions are read from the record's
    ``xref_link`` cross-references (``ENA-STUDY`` and ``ENA-SAMPLE``).

    Parameters
    ----------
    row : mapping (e.g. one row of the metadata DataFrame)
        Must provide ``row['Run_accession']``. It is updated in place.
    api_response_soup : parsed ENA API response
        Object supporting ``find(accession=...)`` whose result supports
        ``find_all('xref_link')``.

    Returns
    -------
    The same ``row`` with ``Study_accession`` and ``Sample_accession`` set.
    Either is NaN when the run has no record or the record lacks that
    cross-reference.
    """
    # default both fields so every returned row carries the same keys, even
    # when a record is missing or only has one of the two cross-references
    row['Study_accession'] = np.nan
    row['Sample_accession'] = np.nan

    run_soup = api_response_soup.find(accession=row['Run_accession'])
    if run_soup:
        for xref in run_soup.find_all('xref_link'):
            database = xref.db.text
            if database == 'ENA-STUDY':
                row['Study_accession'] = xref.id.text.strip()
            elif database == 'ENA-SAMPLE':
                row['Sample_accession'] = xref.id.text.strip()
    return row

# annotate every row with the study and sample accession of its run
raven_2016_uk_hospital_metadata = raven_2016_uk_hospital_metadata.apply(
    add_study_and_sample_metadata,
    axis=1,
    args=(api_response_soup,),
)

## Add read data for Raven 2016

Data downloaded from ENA by FM, checksums verified (see `genomes/`)

In [None]:
# locate the downloaded read files and attach them to the metadata by run accession;
# the outer join keeps rows that exist on only one side
raven_2016_uk_hospital_reads = get_reads('genomes/raven_2016_uk_hospital')

raven_2016_uk_hospital_metadata_and_reads = raven_2016_uk_hospital_metadata.merge(
    raven_2016_uk_hospital_reads,
    how='outer',
    on='Run_accession',
    validate='one_to_one',
)

## Collate All UK Data

All UK data can be collated into one table now that download is reliable instead of original protocol used by HS:
1. get all accessions
2. try to download
3. keep the subset that successfully downloaded as a "random sample with similar habitat distribution as all Alberta E. faecium metadata"

In [None]:
# stack the two UK studies into one table
uk_metadata_and_reads = pd.concat([gouliouris_2018_uk_metadata_and_reads,
                                   raven_2016_uk_hospital_metadata_and_reads])

# a row either has a read file path in R1 or it does not
uk_metadata_and_reads['Read_status'] = np.where(
    uk_metadata_and_reads['R1'].isna(),
    "Removed for deduplication in original paper",
    "Validated download",
)

# keep the shared columns, ordered into natural groupings:
# identifiers, status, taxonomy/typing, provenance, phenotypes, reads
uk_metadata_and_reads = uk_metadata_and_reads.loc[:, [
    'Isolate_name', 'Study_accession', 'Sample_accession', 'Run_accession',
    'Metadata_status',
    'Species', 'BAPS_group', 'Sequence_Type',
    'Country/Province', 'Origin', 'Location',
    'Ampicillin', 'Vancomycin',
    'Read_status', 'R1', "R2",
]]

# AAFC Enterococcus Paper Metadata

## Collect metadata from NCBI

https://www.nature.com/articles/s41598-020-61002-5

Raw sequencing data is now deposited for this paper under `BioProject` accession (`PRJNA604849`) i.e., `study_accession`.

So first let's get all the metadata from NCBI and link the reads before trying to merge in the metadata from the paper with variable names.

In [None]:
with open('datasets/PRJNA604849_full_biosample_list.xml') as fh:
    PRJNA604849_xml = bs4.BeautifulSoup(fh, 'lxml')

# one record per biosample, keyed by the biosample accession
parsed_xml_data = {
    biosample['accession']: {
        'Sample_name': biosample.find(db_label='Sample name').text,
        'Species': biosample.find('organismname').text,
        'Strain_name': biosample.find(attribute_name="strain").text,
        'Study_accession': biosample.find(target="bioproject")['label'],
    }
    for biosample in PRJNA604849_xml.find_all('biosample')
}

# accessions become rows; Sample_name is dropped because strain_name was a
# closer match to the isolate name in the paper metadata and is used for matching
alberta_ncbi = (
    pd.DataFrame(parsed_xml_data)
    .T
    .rename_axis('Sample_accession')
    .reset_index()
    .drop(columns='Sample_name')
)

## Add read data to Alberta metadata from NCBI

Despite reconciling this at various stages, the constant changes to the metadata have meant this pairing has broken again, so I'm applying the same approach as used for merging the paper metadata with NCBI again.

Having persuaded AAFC to actually upload the reads I can link biosamples based on SRA accessions.

In [None]:
# SRA run table: which run belongs to which biosample
alberta_sra_data = (
    pd.read_csv('datasets/PRJNA604849_sra_run_info.csv')
    .loc[:, ['Run', 'BioSample']]
    .rename(columns={'Run': 'Run_accession', 'BioSample': 'Sample_accession'})
)

# attach the downloaded read files to their runs (exactly one file pair per run)
alberta_read_data = alberta_sra_data.merge(
    get_reads('genomes/zaheer_2020_alberta'),
    how='outer',
    on='Run_accession',
    validate='one_to_one',
)

# attach the runs to the biosample metadata; one sample may have several runs
alberta_ncbi_metadata_and_reads = alberta_ncbi.merge(
    alberta_read_data,
    how='outer',
    on='Sample_accession',
    validate='one_to_many',
    suffixes=('', '_sra'),
)

There are duplicates (111 of them) where the same sample has multiple sets of reads, but with how AAFC uploaded the raw data there is no way to tell which one is the resequencing. There are also some samples with no reads associated.

In [None]:
# duplicated() with its default settings marks every occurrence of a sample
# accession after the first one
repeated_sample = alberta_ncbi_metadata_and_reads['Sample_accession'].duplicated()
alberta_ncbi_metadata_and_reads.loc[repeated_sample, 'Metadata_status'] = 'Two sets of reads for this sample without time metadata to reconcile'

# a row either has a read file path in R1 or it does not
alberta_ncbi_metadata_and_reads['Read_status'] = np.where(
    alberta_ncbi_metadata_and_reads['R1'].isna(),
    'No reads associated with sample',
    'Validated download',
)

## Merge in metadata from paper reconciling name problems

Now we need to parse and tidy up the metadata from https://www.nature.com/articles/s41598-020-61002-5 (`41598_2020_61002_MOESM2_ESM.csv`). This seems to have some disconnections with the metadata from NCBI (e.g., species assignments), so we will use the NCBI data when in doubt. Given that NCBI confirms species assignments, this seems a prudent choice.

In [None]:
# load the paper's supplementary table and drop its 2 fully duplicated rows
alberta_metadata = pd.read_csv('datasets/41598_2020_61002_MOESM2_ESM.csv', sep='\t').drop_duplicates()

# remove two redundant rows:
#   'SWEntR-0393' - identical to another row apart from _ vs -
#   'SWEntR 0262' - identical to another row apart from an incorrect species
#                   (when compared to the NCBI assembly)
alberta_metadata = alberta_metadata[~alberta_metadata['ISOLATE'].isin(['SWEntR-0393', 'SWEntR 0262'])]

# tidy column names to help with merging later, then get rid of useless columns
alberta_metadata = (
    alberta_metadata
    .rename(columns={
        'ISOLATION SOURCE': 'Origin',
        'SPECIFIC LOCATION': 'Location',
        'VNCO': 'Vancomycin',
        'AMPI': 'Ampicillin',
        'TEIC': 'Teicoplanin',
        'DOXY': 'Doxycycline',
        'ERTH': 'Erythromycin',
        'GENT': 'Gentamicin',
        'LNZD': 'Linezolid',
        'LVFL': 'Levofloxacin',
        'QUIN': 'Quinolone',
        'STEP': 'Streptomycin',
        'NTRO': 'Nitrofurantoin',
        'TGC': 'Tigecycline',
        'SPECIATION': 'Species',
        'SOURCE CODE': 'Source_code',
        'ISOLATE': 'Isolate_name_paper',
    })
    .drop(columns=['Unnamed: 18', "Resistance count", 'LOCATION'])
)

# extra column to help with merging UK and AB metadata
alberta_metadata['Country/Province'] = 'Canada/Alberta'

# remove surrounding whitespace that was left in the paper metadata
alberta_metadata['Species'] = alberta_metadata['Species'].str.strip()

# To ensure we merge the correct identifiers we are going to use the Source_code
# information AS WELL as the isolate_name, therefore let's create a new identifier
# out of the source code and isolate_name (and then remove spaces/underscores/hyphens etc)
def combine_source_and_isolate_name(row):
    """
    Build a separator-free identifier from the source code and isolate name.

    Underscores, hyphens and spaces are stripped from ``row['Isolate_name_paper']``;
    ``row['Source_code']`` is then prepended unless the stripped name already
    starts with it.
    """
    source_code = row['Source_code']
    # separators are a major source of mismatches between naming schemes
    compact_name = row['Isolate_name_paper'].translate(str.maketrans('', '', '_- '))
    return compact_name if compact_name.startswith(source_code) else source_code + compact_name


# one combined identifier per row of the paper metadata
source_prefixed_isolate_names = alberta_metadata.apply(combine_source_and_isolate_name, axis=1)
alberta_metadata['Source_code_and_isolate_name'] = source_prefixed_isolate_names

Now we want to merge in the accession data from the bioproject.

The NCBI biosamples have both a distinct `Strain_name` and a distinct `Sample_name` whereas the paper metadata just has an `Isolate_name_paper` (original `ISOLATE`). 

`Isolate_name_paper` seems to correspond to either one of these NCBI identifiers with no apparent pattern. However, `Strain_name` does match the paper metadata `Isolate_name_paper` more often and seems to be generally closer.

Therefore, we are going to try and identify mappings between the NCBI `Strain_name` and the paper metadata `Isolate_name_paper`.  Then we are going to treat the NCBI `Strain_name` as the true `Isolate_name`.

Fortunately, it is usually fairly obvious what the correct mapping is as they usually only differ in hyphens/dashes/spacing/number of leading 0's. 
The `Strain_name` in NCBI often contain part of what is the `Source_code` in the paper metadata (i.e., source), therefore I combined `Source_code` with `Isolate_name_paper` as `Source_code_and_isolate_name` and then searched for mappings using that.   

This means I can match `Strain_name` to `Source_code_and_isolate_name` by finding the `Source_code_and_isolate_name` that shares the longest suffix with each `Strain_name` in NCBI.

For extra security to prevent mis-assignments, I also added the following filter conditions: 

1. All the numerical portions of `Strain_name` and `Source_code_and_isolate_name` had to match to be valid
2. The relationship between the two sets of names had to be one-to-one i.e., each `Strain_name` was assigned to one and only one `Source_code_and_isolate_name`.

Any remaining `Strain_name` in the NCBI data that wasn't assigned a `Source_code_and_isolate_name` can then be manually reviewed.

In [None]:
def _reversed_clean_name(name):
    """Drop hyphens/underscores, lower-case and reverse, so a shared suffix becomes a shared prefix."""
    return name.replace('-', '').replace('_', '').lower()[::-1]


def _shared_prefix_length(first, second):
    """Number of leading characters that two strings have in common."""
    length = 0
    for first_char, second_char in zip(first, second):
        if first_char != second_char:
            break
        length += 1
    return length


def _numerical_part(name):
    """Concatenate all digits in a name (only the last '-' separated field for 'ES-' names)."""
    # long 'ES-' names embed a date whose digits must not take part in the comparison
    if name.startswith('ES-'):
        name = name.split('-')[-1]
    return "".join(re.findall(r'\d+', name))


def attempt_to_reconcile_name_sets(set1, set2):
    """
    Try to find, for each name in set1, the matching name in set2.

    For every set1 name the set2 name sharing the longest suffix (ignoring
    hyphens, underscores and case) is taken as the candidate; on ties the
    first such set2 name wins. The candidate is accepted only if the digits
    of both names agree once leading zeros are ignored. Finally, every set2
    name claimed by more than one set1 name is discarded so that the result
    is one-to-one; those cases need manual resolution.

    Parameters
    ----------
    set1, set2 : iterable of str
        Names to reconcile; set1 may contain repeated names.

    Returns
    -------
    (dict, set)
        Mapping from set1 names to their set2 match, and the set of set1
        names left without a match.
    """
    # normalise set2 once instead of once per set1 name
    set2_candidates = [(set2_name, _reversed_clean_name(set2_name)) for set2_name in set2]

    validated_matches = {}
    # dict.fromkeys: each distinct set1 name is processed once, in first-seen order
    for set1_name in dict.fromkeys(set1):
        set1_name_clean = _reversed_clean_name(set1_name)

        best_set2_name, best_suffix_length = None, 0
        for set2_name, set2_name_clean in set2_candidates:
            suffix_length = _shared_prefix_length(set1_name_clean, set2_name_clean)
            # strict comparison keeps the first of equally good candidates
            if suffix_length > best_suffix_length:
                best_set2_name, best_suffix_length = set2_name, suffix_length

        # nothing in set2 shares even one trailing character: leave unmatched
        # rather than pairing the name with an empty placeholder
        if best_set2_name is None:
            continue

        # safety check: the numerical components must agree; differing numbers
        # of leading 0s are a common, harmless discrepancy
        if _numerical_part(set1_name).lstrip('0') == _numerical_part(best_set2_name).lstrip('0'):
            validated_matches[set1_name] = best_set2_name

    # drop every set2 name that several set1 names map to (not one-to-one)
    set2_name_counter = Counter(validated_matches.values())
    one_to_one_matches = {set1_name: set2_name
                          for set1_name, set2_name in validated_matches.items()
                          if set2_name_counter[set2_name] == 1}

    set1_names_without_matches = set(set1) - set(one_to_one_matches)
    return one_to_one_matches, set1_names_without_matches

# map NCBI strain names (set1) onto the combined paper identifiers (set2)
alberta_ncbi_to_metadata_match, alberta_ncbi_without_metadata_match = attempt_to_reconcile_name_sets(
    set1=alberta_ncbi_metadata_and_reads['Strain_name'].values,
    set2=alberta_metadata['Source_code_and_isolate_name'].values,
)

Which NCBI strain_names didn't get a metadata match:

In [None]:
# display the NCBI strain names that were not automatically matched, for manual review
alberta_ncbi_without_metadata_match

Now we can try and manually fix these and identify cases of missing accessions or unresolvably ambiguous assignments (i.e., >1 perfectly valid appearing mapping from `Strain_name` to `Source_code_and_isolate_name` was possible).

In [None]:
# strain names from the manual review that could not be assigned
# (kept as a record of the review)
missing = {
    'HC_NS0026',   # ambiguous
    'HC_NS0078',   # ambiguous
    'HC_NS0854',   # missing
    'HC_NS1042',   # missing
    'HC_NS1090',   # missing
    'HC_NS1104',   # missing
    'HC_VRE0078',  # missing
    'HC_SS0026',   # missing
    'WW_0060M',    # missing
    'WW_0089I',    # missing
}

# manually resolved mappings: NCBI Strain_name -> Source_code_and_isolate_name
manual_fixes = {
    'CB_0150': 'CBEntR0150',
    'CB_0182': 'CBEntR0182',
    'CB_0383': 'CBSWEntR0383',
    'ES-C-ST002-07DEC15-0142B': 'WW0142B',
    'FC_0142B': 'FC0142B',
    'HC_NS0150': 'NSSNS0150',
    'HC_NS0238': 'NSSNS0238',
    'HC_NS0383': 'NSSNS0383',
    'HC_NS210': 'NSSNS0210',
    'HC_SS0002': 'SS0002',
    'HC_SS0025': 'SS0025',
    'SW_0002': 'NWSEnt0002',
    'SW_0025': 'NWSEnt0025',
    'SW_0182': 'NWSEnt0182',
    'SW_0238': 'NWSEnt0238',
}

# manual entries are added to (and take precedence over) the automatic mapping
alberta_ncbi_to_metadata_match.update(manual_fixes)

Unfortunately, this still leaves us with 10 isolates that are in the deposited NCBI data but don't seem to have any supplied metadata

In [None]:
# rows whose strain name has a paper identifier in the (automatic + manual) mapping
has_metadata_match = alberta_ncbi_metadata_and_reads['Strain_name'].isin(alberta_ncbi_to_metadata_match.keys())

# sorted strain names of all rows without a mapping
unresolved_ncbi = alberta_ncbi_metadata_and_reads.loc[~has_metadata_match, 'Strain_name'].sort_values().values

# add a status to the NCBI data indicating missing
alberta_ncbi_metadata_and_reads.loc[~has_metadata_match, 'Metadata_status'] = 'No linkable metadata in original paper (10.1038/s41598-020-61002-5)'

Using our mapping from `Strain_name` to `Source_code_and_isolate_name` let's add a `Source_code_and_isolate_name` to the NCBI metadata sheet and merge using that (ensuring valid one-to-one merging).  

We are doing a `left` merge because we only want to keep the metadata from the paper for isolates that correspond to the NCBI bioproject.

In [None]:
# look up the paper identifier for every NCBI strain name; strains without a
# mapping get an explicit "UNMATCHED: <strain>" placeholder key
alberta_ncbi_metadata_and_reads['Source_code_and_isolate_name'] = [
    alberta_ncbi_to_metadata_match.get(strain_name, f"UNMATCHED: {strain_name}")
    for strain_name in alberta_ncbi_metadata_and_reads['Strain_name']
]

# add the unresolved identifiers to the paper metadata before merging
# (these rows carry the bare strain name, without the "UNMATCHED: " prefix used
# above, so they are not expected to pair with any NCBI row in the merge)
alberta_metadata = pd.concat([alberta_metadata, pd.DataFrame({'Source_code_and_isolate_name': unresolved_ncbi})])

ncbi_row_count = len(alberta_ncbi_metadata_and_reads)
alberta_merged = pd.merge(alberta_ncbi_metadata_and_reads, alberta_metadata,
                          how='left',
                          on='Source_code_and_isolate_name', suffixes=['_ncbi', '_paper'])

# a left merge only gains rows when one NCBI row pairs with several paper rows;
# stop instead of silently writing duplicated isolates into the final table
if len(alberta_merged) != ncbi_row_count:
    raise ValueError(
        f"merging the paper metadata changed the number of NCBI rows from {ncbi_row_count} "
        f"to {len(alberta_merged)}: some 'Source_code_and_isolate_name' values match "
        "more than one row of the paper metadata"
    )

Finally, we want to tidy up the alberta metadata to make our ultimate merging with the UK data easier

In [None]:
# - the NCBI species becomes the official `Species`
# - the NCBI `Strain_name` becomes `Isolate_name` to match the other datasets
#   (what NCBI calls things should remain the master)
# - the superfluous and likely erroneous species from the paper metadata sheet is
#   dropped, together with the helper key used for merging
alberta_merged = (
    alberta_merged
    .rename(columns={'Species_ncbi': 'Species', 'Strain_name': 'Isolate_name'})
    .drop(columns=['Species_paper', 'Source_code_and_isolate_name'])
)


Final tidy up before merging of UK and AAFC data

In [None]:
# keep only the columns below, ordered into logical groupings: accessions, names,
# status, taxonomy, provenance, resistance phenotypes, reads
# (this has to be done again after the big final merge)
alberta_merged = alberta_merged.loc[:, [
    'Study_accession', 'Sample_accession',
    'Isolate_name', 'Isolate_name_paper',
    'Metadata_status', 'Species', 'Country/Province',
    'Origin', 'Location', 'Source_code',
    'Ampicillin', 'Vancomycin', 'Teicoplanin', 'Doxycycline', 'Erythromycin',
    'Nitrofurantoin', 'Gentamicin', 'Linezolid', 'Levofloxacin', 'Quinolone',
    'Streptomycin', 'Tigecycline',
    'Read_status', 'R1', 'R2',
]]

# Merging the two datasets

Combine the two datasets and tidy the names

In [None]:
# stack the Alberta and UK isolates as rows (axis=0). Joining along axis=1 would
# place the two tables side by side and, with ignore_index=True, replace every
# column name with an integer. join='outer' keeps the union of the columns;
# columns that exist in only one dataset are NaN for the rows of the other.
# ignore_index=True gives the combined table a fresh 0..n-1 row index.
all_data = pd.concat([alberta_merged, uk_metadata_and_reads],
                     join='outer', axis=0, ignore_index=True)

all_data.to_csv('all_combined_enterococcus_metadata.tsv', index=False, sep='\t')