# Get PLAZA information for Galaxy

Get all genomes and annotations available in PLAZA v4, formatted to be used in Galaxy.
The genome annotations were curated so that they always match the genome info.

In [21]:
import json
import urllib
import urllib.request
from urllib.error import HTTPError, URLError

from jinja2 import Environment

In [36]:
# Testing: only one genome, some debug info
# While `debug` is True the raw API responses are dumped and every genome
# except `genome_for_testing` is skipped.
debug = True # change to False to get the full list
genome_for_testing = "Micromonas commoda"

## Template for genomes file

In [37]:
# Jinja2 template for the `genomes:` YAML list: every genome contributes one
# entry carrying its `url`, `name` and `id`. The rendered text deliberately
# has no trailing newline.
genome_j2 = "\n".join([
    "genomes:{% for genome in genomes %}",
    "    - url: {{ genome.url }}",
    "      name: {{ genome.name }}",
    "      id: {{ genome.id }}{% endfor %}",
])

## Getting genome information available for Galaxy through PLAZA API

In [38]:
# PLAZA v4 instances to query: short label -> URL of that instance's
# `get_galaxy_information` API endpoint. Insertion order is the query order.
plaza_api_calls = dict([
    ('monocots_v4',
     'https://bioinformatics.psb.ugent.be/plaza/versions/plaza_v4_monocots/api/get_galaxy_information'),
    ('dicots_v4',
     'https://bioinformatics.psb.ugent.be/plaza/versions/plaza_v4_dicots/api/get_galaxy_information'),
])

In [39]:
# Download the Galaxy listing of every configured PLAZA instance.
# Each successful response is decoded as UTF-8 JSON and appended to
# `call_results`. An instance that cannot be fetched is reported and skipped,
# so a failed request can neither crash the cell (undefined `html`) nor
# re-append the payload left over from the previous iteration.
call_results = []

for key, api_call in plaza_api_calls.items():
    print("Getting info for {}".format(key))
    try:
        with urllib.request.urlopen(api_call) as response:
            html = response.read()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
        continue
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
        continue

    plaza_list = json.loads(html.decode('utf-8'))
    if debug:
        # Debug mode dumps the full parsed response for inspection.
        print(json.dumps(plaza_list, sort_keys=True, indent=4))
    call_results.append(plaza_list)

Getting info for monocots_v4
[
    {
        "URL": "https://phytozome.jgi.doe.gov/pz/portal.html#!info?alias=Org_Atrichopoda",
        "common_name": "Amborella trichopoda",
        "eco_type": null,
        "fasta": {
            "cds": {
                "filename": "cds.atr.fasta.gz",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_04//Fasta/cds.atr.fasta.gz"
            },
            "genome": {
                "filename": "atr.con.gz",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_04//Genomes/atr.con.gz"
            },
            "proteome": {
                "filename": "proteome.atr.fasta.gz",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_04//Fasta/proteome.atr.fasta.gz"
            }
        },
        "gff": [
            {
                "filename": "Amborella_trichopoda.JGI_v1.0.all_transcripts",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaz

[
    {
        "URL": "ftp://bioinfo.bti.cornell.edu/pub/kiwifruit",
        "common_name": "Actinidia chinensis",
        "eco_type": null,
        "fasta": {
            "cds": {
                "filename": "cds.ach.fasta.gz",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_04//Fasta/cds.ach.fasta.gz"
            },
            "genome": {
                "filename": "ach.con.gz",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_04//Genomes/ach.con.gz"
            },
            "proteome": {
                "filename": "proteome.ach.fasta.gz",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_04//Fasta/proteome.ach.fasta.gz"
            }
        },
        "gff": [
            {
                "filename": "Actinidia_chinensis.FEI_Lab_v1.0.all_transcripts",
                "location": "ftp://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_04//GFF/ach/Actinidia_chinensis.FEI_Lab_v1.0.a

In [40]:
# Build `genomes`: common name -> {'url', 'id', 'name'} for every entry
# returned by the PLAZA API calls.
#   name: "<common_name> [<eco_type>] <version>" (eco_type omitted when null)
#   id:   the same text with spaces replaced by underscores
#   url:  location of the genome FASTA file
# A common name that was already captured from an earlier listing is skipped.
genomes = {}

for plaza_list in call_results:

    for item in plaza_list:
        print(item['common_name'])

        # In testing mode only the configured test genome is processed.
        if debug and item['common_name'] != genome_for_testing:
            print("\tSkipping in testing mode")
            continue

        if item['common_name'] in genomes:
            print("\tGenome for {} already captured".format(item['common_name']))
            continue

        try:
            if item['eco_type'] is None:
                name = "{common_name} {version}".format(**item)
            else:
                name = "{common_name} {eco_type} {version}".format(**item)
            gid = name.replace(' ', '_')
            url = item['fasta']['genome']['location']

            genomes[item['common_name']] = {'url': url, 'id': gid, 'name': name}
        except (KeyError, TypeError):
            # KeyError: a required key is absent; TypeError: a nested section
            # (e.g. 'fasta') is null. Both mean the record is incomplete.
            print("\n!!! Not all necessary fields are provided !!!")
            print(json.dumps(item, sort_keys=True, indent=4))
            print("\n")

        except Exception:
            # Still best-effort for unexpected failures, but no longer
            # swallows KeyboardInterrupt / SystemExit like a bare `except:`.
            print("Error")
            print(json.dumps(item, sort_keys=True, indent=4))

Amborella trichopoda
	Skipping in testing mode
Ananas comosus
	Skipping in testing mode
Arabidopsis thaliana
	Skipping in testing mode
Brachypodium distachyon
	Skipping in testing mode
Chlamydomonas reinhardtii
	Skipping in testing mode
Elaeis guineensis
	Skipping in testing mode
Hordeum vulgare
	Skipping in testing mode
Marchantia polymorpha
	Skipping in testing mode
Micromonas commoda
Musa acuminata
	Skipping in testing mode
Oropetium thomaeum
	Skipping in testing mode
Oryza brachyantha
	Skipping in testing mode
Oryza sativa ssp. indica
	Skipping in testing mode
Oryza sativa ssp. japonica
	Skipping in testing mode
Phalaenopsis equestris
	Skipping in testing mode
Phyllostachys edulis
	Skipping in testing mode
Physcomitrella patens
	Skipping in testing mode
Picea abies
	Skipping in testing mode
Populus trichocarpa
	Skipping in testing mode
Selaginella moellendorffii
	Skipping in testing mode
Setaria italica
	Skipping in testing mode
Solanum lycopersicum
	Skipping in testing mode
Sorghu

In [43]:
# Report how many entries were collected in `genomes`; in testing mode this
# is at most one, because only `genome_for_testing` is kept.
print("\nGenomes found on PLAZA FTP: {}".format(len(genomes)))


Genomes found on PLAZA FTP: 1


## Write genomes to file

In [41]:
# Order the collected genome records by their key in `genomes`, render them
# through the Jinja2 template and write the result to genomes.yaml.
genome_list = [genomes[common_name] for common_name in sorted(genomes)]

with open('genomes.yaml', 'w') as f:
    template = Environment().from_string(genome_j2)
    f.write(template.render(genomes=genome_list))

## Get complete yaml file

In [42]:
!echo "\n" >> genomes.yaml # making sure there is a new line at the end
!cat genomes.yaml data_managers.yaml > genome_data_manager.yaml