The EN-TEX biosamples need special handling because we need to track which upstream biosample each one was split from, along with its donor information.

I originally started registering them in encode-y4q1-submission, but then we had more to submit, so I felt it would be best to split the validation I was doing out into a new notebook.

In [1]:
import pandas
import gcat
import numpy

In [2]:
from curation_common import *

In [3]:
def strip_donor(cell):
    """Drop the literal text 'Donor' from a cell and trim surrounding whitespace.

    Cells that pandas considers null are handed back unchanged.
    """
    if not pandas.isnull(cell):
        without_label = cell.replace('Donor', '')
        return without_label.strip()
    return cell
    
def strip_comment_from_id(cell):
    """Keep only the leading alphanumeric run of a cell.

    Everything from the first non-alphanumeric character onward is cut
    off. Cells that pandas considers null are returned unchanged, as are
    cells that are alphanumeric all the way through.
    """
    if pandas.isnull(cell):
        return cell

    # Position of the first character that ends the id, if there is one.
    cut = next(
        (pos for pos, char in enumerate(cell) if not char.isalnum()),
        None,
    )
    return cell if cut is None else cell[:cut]

def suppress_updated(cell):
    """Turn the footer marker 'Updated on:' into NaN so it can be dropped.

    Any other cell value is passed through untouched.
    """
    return numpy.nan if cell == 'Updated on:' else cell

In [4]:
# xlsx export URL of the EN-TEX production spreadsheet. The literal is split
# for readability only; the pieces concatenate to the same single URL, in
# which the document id appears both in the path and in the id parameter.
entex_production_url = (
    "https://docs.google.com/spreadsheets/d/"
    "1nceX7J2rtyPEmqeIW7ypVSDDLQ6ipqP-l6k-LlYr6rM"
    "/export?format=xlsx"
    "&id=1nceX7J2rtyPEmqeIW7ypVSDDLQ6ipqP-l6k-LlYr6rM"
)

In [5]:
# Load the "EN-TEX Scale-Up (Male)" sheet of the production workbook.
# header=2 takes the column names from the third row of the sheet, and the
# converters clean individual cells of the named columns as they are read.
# NOTE(review): pandas documents the row-skipping keyword as `skiprows`;
# `skip_rows` is not a documented read_excel parameter, so it presumably has
# no effect here (and may be rejected by newer pandas) -- confirm.
# NOTE(review): `sheetname` and `parse_cols` were renamed `sheet_name` and
# `usecols` in later pandas releases -- confirm against the installed version.
entex_scale_up_male = pandas.read_excel(
    entex_production_url, 
    sheetname="EN-TEX Scale-Up (Male)", 
    skip_rows=2, header=2, parse_cols=[0,1,2,3,4,5],
    converters={
        'Tissue': suppress_updated,
        'GTEX Donor': strip_donor,
        'ENCODE Parent BioSample Accession #': strip_comment_from_id,
        'CSHL RNA BioSample Accession # (RIN#)': strip_comment_from_id,
    }
)
#entex_scale_up_male

In [6]:
# Load the "EN-TEX Scale-Up (Female)" sheet of the production workbook.
# header=2 takes the column names from the third row of the sheet, and the
# converters clean individual cells of the named columns as they are read.
# NOTE(review): the male sheet is read with an extra 'Tissue' converter
# (suppress_updated) that is not applied here -- presumably this sheet has no
# "Updated on:" footer; confirm.
# NOTE(review): pandas documents the row-skipping keyword as `skiprows`;
# `skip_rows` is not a documented read_excel parameter, so it presumably has
# no effect here (and may be rejected by newer pandas) -- confirm.
# NOTE(review): `sheetname` and `parse_cols` were renamed `sheet_name` and
# `usecols` in later pandas releases -- confirm against the installed version.
entex_scale_up_female = pandas.read_excel(
    entex_production_url, 
    sheetname="EN-TEX Scale-Up (Female)", 
    skip_rows=2, header=2, parse_cols=[0,1,2,3,4,5],
    converters={
        'GTEX Donor': strip_donor,
        'ENCODE Parent BioSample Accession #': strip_comment_from_id,
        'CSHL RNA BioSample Accession # (RIN#)': strip_comment_from_id,
    }
)
#entex_scale_up_female

Make sure the column names match.

In [7]:
# Compare the column names in both directions. A plain difference() only
# reports columns the male sheet has and the female sheet lacks, so a column
# present only on the female sheet would go unnoticed. An empty set means the
# two sheets have exactly the same column names.
set(entex_scale_up_male.columns).symmetric_difference(entex_scale_up_female.columns)

set()

In [8]:
# Stack the male and female scale-up tables into a single frame.
entex_scale_up = pandas.concat(
    objs=[entex_scale_up_male, entex_scale_up_female],
)
#entex_scale_up

In [9]:
# Spreadsheet, and the worksheet within it, that the biosample table is
# read from.
spreadsheet_name, sheet_name = 'ENCODE Submission for Y4Q1', 'Biosamples'

In [10]:
# Maps a plate id to an ENCODE biosample accession (ENCBS...). lookup_broad
# keys into this table with the last '-'-separated piece of a Broad biosample
# id, and compute_part_of uses the result as the row's 'part_of' value.
# Entries are grouped by target accession; the commented-out entries with
# blank keys are placeholders for plate ids that have not been filled in.
plate_id_to_part_of = {
    '9IT1C': 'ENCBS890WYO',
    '9JLPD': 'ENCBS890WYO',

    '9JLPJ': 'ENCBS562VSE',
    '9JLPP': 'ENCBS562VSE',
    
    #'     ': 'ENCBS027MVW',
    #'     ': 'ENCBS027MVW',
    
    #'    ': 'ENCBS380CLT',
    #'     ': 'ENCBS380CLT',

    '9IT1F': 'ENCBS335JSV',
    '9JLPG': 'ENCBS335JSV',
    
    '9JLPK': 'ENCBS692CGI',
    '9JLPQ': 'ENCBS692CGI',    
    
    '9IT1E': 'ENCBS564MPZ',
    '9JLPF': 'ENCBS564MPZ',
    
    '9JLPO': 'ENCBS494DUH',
    '9JLPU': 'ENCBS494DUH',
    
    # NOTE(review): '91TIG' does not follow the '9IT1?' shape of the other
    # ids in this position ('9IT1C', '9IT1E', '9IT1F'); possibly a
    # transposition of '9IT1G' -- verify against the source sheet.
    '91TIG': 'ENCBS380GWR',
    '9JLPH': 'ENCBS380GWR',
    
    '9JLPN': 'ENCBS005MNT',
    '9JLPT': 'ENCBS005MNT',
    
    '9JLPI': 'ENCBS644JRA',
    #'     ': 'ENCBS644JRA',
    
    '9JLPM': 'ENCBS855YAD',
    '9JLPS': 'ENCBS855YAD',
}

In [11]:
# Last '-'-separated piece of a GTEX donor id -> ENCODE donor accession.
gtex_donor_to_encode = dict([
    ('1JKYN', 'ENCDO845WKR'),
    ('1K2DA', 'ENCDO451RUA'),
])

In [12]:
def lookup_broad(broad_id):
    """Map a Broad biosample id to its entry in plate_id_to_part_of.

    Only the text after the last '-' of broad_id is used as the lookup
    key. Returns None for a null id or for a key that is not in the table.
    """
    if not pandas.isnull(broad_id):
        plate_id = broad_id.rpartition('-')[2]
        return plate_id_to_part_of.get(plate_id)
    return None

def lookup_gtex_donor_id(donor_id):
    """Map a GTEX donor id to its entry in gtex_donor_to_encode.

    Only the text after the last '-' of donor_id is used as the lookup
    key. Returns None for a null id; a key that is not in the table
    raises KeyError.
    """
    if not pandas.isnull(donor_id):
        donor_key = donor_id.rpartition('-')[2]
        return gtex_donor_to_encode[donor_key]
    return None

class LookupBiosample:
    """Look up fields of biosample objects fetched from the ENCODE server.

    Each biosample is requested from the server at most once per
    instance; later lookups of the same id are answered from an
    in-memory cache.
    """

    def __init__(self):
        # biosample id -> JSON object returned by the server
        self._cache = {}
        self._server = ENCODED('www.encodeproject.org')
        # presumably reads credentials for the server from ~/.netrc -- confirm
        self._server.load_netrc()

    def _get_biosample(self, biosample_id):
        """Return the JSON object for biosample_id, fetching it only once.

        dict.setdefault(key, fetch()) is deliberately avoided here: its
        second argument is evaluated before the call, so it would hit the
        server on every lookup even when the id is already cached.
        """
        if biosample_id not in self._cache:
            self._cache[biosample_id] = self._server.get_json(biosample_id)
        return self._cache[biosample_id]

    def biosample_term_id(self, biosample_id):
        """Return the 'biosample_term_id' of a biosample, or None for a null id."""
        if pandas.isnull(biosample_id):
            return None

        return self._get_biosample(biosample_id)['biosample_term_id']

    def biosample_term_name(self, biosample_id):
        """Return the 'biosample_term_name' of a biosample, or None for a null id."""
        if pandas.isnull(biosample_id):
            return None

        return self._get_biosample(biosample_id)['biosample_term_name']

def compute_part_of(row):
    """Return the row's 'part_of' value, filling it in when it is null.

    A null 'part_of' is replaced by whatever lookup_broad returns for the
    row's 'broad_biosample:skip' value.
    """
    existing = row['part_of']
    if pandas.notnull(existing):
        return existing
    return lookup_broad(row['broad_biosample:skip'])

def compute_donor(row):
    """Return the row's 'donor' value, filling it in when it is null.

    A null 'donor' is replaced by whatever lookup_gtex_donor_id returns
    for the row's 'gtex_donor:skip' value.
    """
    existing = row['donor']
    if pandas.notnull(existing):
        return existing
    return lookup_gtex_donor_id(row['gtex_donor:skip'])
    
    
def validate_from_production(table):
    """Build a per-row validator that checks submission rows against table.

    table is the production metadata frame; it is filtered on its
    'ENCODE Parent BioSample Accession #' column and read through the
    'Tissue', 'GTEX Donor', 'ENCODE Donor Accession' and 'Age Sex' columns.

    The returned callable takes one submission row and returns True when
    the row is skipped or consistent with the production table, and False
    (after printing the reason) on the first mismatch found. Only the
    first production row matching the submission row's 'part_of' is
    consulted.
    """
    def validate_callable(row):
        # Skip rows we've already done.
        if pandas.notnull(row['biosample_accession']):
            return True

        # We're keying off of part_of, so if that's not present we can't
        # do anything.
        part_of = row['part_of']
        if pandas.isnull(part_of):
            return True

        # Filter the metadata table down to the rows for this parent.
        upstream_rows = table[table['ENCODE Parent BioSample Accession #'] == part_of]
        if upstream_rows.empty:
            # No matches
            print("No metadata for: {}".format(part_of))
            return False

        def first(column):
            # Value of the given column in the first matching upstream row.
            return upstream_rows[column].iloc[0]

        # Make sure tissue and description match.
        description = first('Tissue')
        if description != row['description']:
            print('{} description {} != {}'.format(part_of, description, row['description']))
            return False

        # Make sure GTEX donors match. The submission sheet spells the
        # prefix 'PT-' where the production sheet has 'GTEX-'. A missing
        # upstream donor is left as-is so it is reported as a mismatch
        # below instead of raising on .replace().
        gtex = first('GTEX Donor')
        if pandas.notnull(gtex):
            gtex = gtex.replace('GTEX-', 'PT-')
        if gtex != row['gtex_donor:skip']:
            print('{} gtex {} != {}'.format(part_of, gtex, row['gtex_donor:skip']))
            return False

        # Make sure ENCODE donor matches.
        donor = first('ENCODE Donor Accession')
        if donor != row['donor']:
            print('{} donor {} != {}'.format(part_of, donor, row['donor']))
            return False

        # Make sure age matches. 'Age Sex' is the age followed by one
        # trailing character, which is not checked here. A missing value
        # becomes NaN, which compares unequal to everything and is therefore
        # reported as a mismatch rather than raising on the slice.
        age_sex = first('Age Sex')
        age = float(age_sex[:-1]) if pandas.notnull(age_sex) else float('nan')
        if age != row['model_organism_age']:
            print('{} age {} != {}'.format(part_of, age, row['model_organism_age']))
            return False

        return True
    return validate_callable

In [13]:
# Fetch the submission spreadsheet through gcat, then parse the biosample
# worksheet with column names taken from its first row (header=0).
# NOTE(review): fmt='pandas_excel' presumably yields a pandas ExcelFile-like
# object, since .parse() is called on it -- confirm against gcat.
y4q1 = gcat.get_file(spreadsheet_name, fmt='pandas_excel')
biosample = y4q1.parse(sheet_name, header=0)

In [14]:
# Lookup helper used below for the term id / term name columns. Constructing
# it creates the ENCODED handle for www.encodeproject.org and calls
# load_netrc() on it (see LookupBiosample.__init__).
lookup_biosample = LookupBiosample()

In [15]:
# these were needed for the pilot 
#biosample['part_of'] = biosample.apply(compute_part_of, axis=1)
#biosample['donor'] = biosample.apply(compute_donor, axis=1)

In [16]:
# Run the production-sheet validator over every submission row; the cell
# evaluates to True only when every row validated.
row_validator = validate_from_production(entex_scale_up)
numpy.all(biosample.apply(row_validator, axis=1))

True

In [17]:
# Fill in the term id / term name columns by looking up the biosample named
# in each row's 'part_of' value.
parent_accessions = biosample['part_of']
biosample['biosample_term_id'] = parent_accessions.map(lookup_biosample.biosample_term_id)
biosample['biosample_term_name'] = parent_accessions.map(lookup_biosample.biosample_term_name)

In [18]:
# Write the annotated biosample table to an xlsx file, leaving out the
# DataFrame index (index=False).
biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)