# Introduction

I feel like I should report to Brian what's actually posted.

In [1]:
import pandas
import collections
import sys
import os
import rdflib

In [2]:
HTSW=os.path.expanduser('~diane/proj/htsworkflow')
if HTSW not in sys.path:
    sys.path.append(HTSW)
from htsworkflow.submission import encoded

In [3]:
class JumpgateInfo:
    """Callable, caching lookup of library metadata from jumpgate.caltech.edu.

    Calling an instance with a library id returns the result of
    ``get_library_info`` for that id. Each id is looked up at most once per
    instance; later calls are answered from ``self.cache``.
    """

    def __init__(self):
        # Maps library_id -> the value returned by get_library_info.
        self.cache = {}

    def __call__(self, library_id):
        """Return the info for ``library_id``, fetching it only on first use."""
        # An explicit membership test is used instead of
        # dict.setdefault(key, self.get_library_info(key)): setdefault
        # evaluates its default argument before the lookup, which would hit
        # the remote server on every call, cached or not.
        if library_id not in self.cache:
            self.cache[library_id] = self.get_library_info(library_id)
        return self.cache[library_id]

    def get_library_info(self, library_id):
        """Fetch the library's RDF description and extract its name.

        Args:
            library_id: Identifier substituted into the
                ``http://jumpgate.caltech.edu/library/{library_id}/`` URL,
                or None.

        Returns:
            None when ``library_id`` is None, otherwise a dict with a single
            ``'name'`` key holding the library's ``libns:name`` value.

        Raises:
            AssertionError: if the query does not yield exactly one row.
        """
        if library_id is None:
            return None

        library = rdflib.term.URIRef(f'http://jumpgate.caltech.edu/library/{library_id}/')
        g = rdflib.Graph()
        # Load the triples served at the library URL into the graph.
        g.parse(library)
        query = """
        prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>

        select ?name
        where {
            ?library libns:name ?name .
        }
        """

        # ?library is pinned to this library's URI through initBindings.
        rows = list(g.query(query, initBindings={'library': library}))
        # Raised explicitly (rather than via an assert statement) so the check
        # is not stripped when Python runs with -O.
        if len(rows) != 1:
            raise AssertionError(f'missing {library_id}')
        fields = rows[0]
        return {
            'name': fields[0].toPython(),
        }

# Module-level callable instance: later cells call get_library_info(library_id)
# like a plain function, and the results are stored in its .cache dict.
get_library_info = JumpgateInfo()

In [4]:
get_library_info('17288')

{'name': 'Illumina index__708_517_Paired_ends_e10.5_mouse_forebrain_rep1'}

In [5]:
# Talk to www.encodeproject.org; the commented-out line below points at the
# alternative host test.encodedcc.org instead.
server = encoded.ENCODED('www.encodeproject.org')
#server = encoded.ENCODED('test.encodedcc.org')
# presumably loads credentials for this host from the user's netrc file —
# confirm against htsworkflow.submission.encoded
server.load_netrc()
# NOTE(review): validator is not referenced by any of the cells shown here.
validator = encoded.DCCValidator(server)

In [6]:
publication_sets = ['ENCSR574CRQ', 'ENCSR226XLF']

In [7]:
publication_set = server.get_json('ENCSR574CRQ')

In [8]:
def get_publication_set_details(server, publication_accession):
    """Tabulate the files attached to a publication set.

    Fetches ``publication_accession`` from ``server``, then fetches every
    entry of its ``related_files`` list and records the file's accession,
    status and submitted file name together with its library's accession
    and status. When the library carries exactly one alias, the alias
    (minus the ``barbara-wold:`` text and anything from the first
    underscore onward) is treated as the jumpgate library id and its name
    is looked up through ``get_library_info``.

    Returns a pandas.DataFrame with one row per related file and the
    columns file, file_status, library, library_status, jumpgate, name,
    filename.
    """
    column_order = [
        'file', 'file_status', 'library', 'library_status',
        'jumpgate', 'name', 'filename',
    ]
    rows = []
    set_json = server.get_json(publication_accession)
    for accession in set_json['related_files']:
        file_json = server.get_json(accession)
        library_json = file_json.get('library', {})
        alias_list = library_json.get('aliases', [])

        # Defaults apply when the library has zero or several aliases.
        jumpgate_id = None
        jumpgate_name = None
        if len(alias_list) == 1:
            stripped = alias_list[0].replace('barbara-wold:', '')
            # split('_')[0] yields the whole string when there is no
            # underscore, so no separate membership check is needed.
            jumpgate_id = stripped.split('_')[0]
            info = get_library_info(jumpgate_id)
            if info is not None:
                jumpgate_name = info.get('name')

        row = {
            'filename': file_json.get('submitted_file_name'),
            'file': file_json['accession'],
            'file_status': file_json.get('status'),
            'library': library_json.get('accession'),
            'library_status': library_json.get('status'),
            'jumpgate': jumpgate_id,
            'name': jumpgate_name,
        }
        rows.append(row)

    return pandas.DataFrame(rows, columns=column_order)

In [9]:
ENCSR574CRQ = get_publication_set_details(server, 'ENCSR574CRQ')

In [10]:
# sheet_name is passed by keyword: pandas deprecates positional arguments
# after the output path for DataFrame.to_excel.
ENCSR574CRQ.to_excel('ENCSR574CRQ.xlsx', sheet_name='ENCSR574CRQ')

In [11]:
ENCSR226XLF = get_publication_set_details(server, 'ENCSR226XLF')
ENCSR226XLF.shape

(7360, 7)

In [12]:
# sheet_name is passed by keyword: pandas deprecates positional arguments
# after the output path for DataFrame.to_excel.
ENCSR226XLF.to_excel('ENCSR226XLF.xlsx', sheet_name='ENCSR226XLF')

In [13]:
ENCSR574CRQ_jumpgate = ENCSR574CRQ[['jumpgate', 'name']].dropna().drop_duplicates()

In [14]:
ENCSR574CRQ_jumpgate.to_csv('ENCSR574CRQ_jumpgate.csv')

In [15]:
collections.Counter(ENCSR574CRQ['file_status'])

Counter({'released': 1199, 'revoked': 96})

In [16]:
collections.Counter(ENCSR226XLF['file_status'])

Counter({'released': 7360})

In [17]:
#ENCSR226XLF[ENCSR226XLF['file_status'] == 'released']['filename']