# Get PLAZA information for Galaxy

Get all genomes and annotations available in PLAZA v4, formatted for use in Galaxy.
The genome annotations were curated so that they always match the genome information.

In [1]:
import json
import urllib
import urllib.request
from urllib.error import HTTPError, URLError

from jinja2 import Environment

In [2]:
# Testing switch: when True, only `genome_for_testing` is processed and the raw API responses are printed
debug = False # change to True to process only the test genome and print debug info
genome_for_testing = "Micromonas commoda"

## Template for genomes file

In [3]:
# Jinja2 template for the genomes file: a top-level `genomes:` key followed by
# one url/name/id list item per element of the `genomes` sequence passed to render().
# The rendered text does not end with a newline.
genome_j2 = """genomes:{% for genome in genomes %}
    - url: {{ genome.url }}
      name: {{ genome.name }}
      id: {{ genome.id }}{% endfor %}"""

## Getting genome information available for Galaxy through the PLAZA API

In [4]:
# Maps a label for each PLAZA instance to the URL of its
# `get_galaxy_information` API endpoint; every entry is queried in turn.
plaza_api_calls = {
    'monocots_v4': 'https://bioinformatics.psb.ugent.be/plaza/versions/plaza_v4_monocots/api/get_galaxy_information',
    'dicots_v4': 'https://bioinformatics.psb.ugent.be/plaza/versions/plaza_v4_dicots/api/get_galaxy_information',
}

In [5]:
# Query every PLAZA endpoint and collect the decoded JSON responses.
# `call_results` ends up with one parsed response per endpoint that could be
# fetched; endpoints that fail are reported and skipped.
call_results = []

for key, api_call in plaza_api_calls.items():
    print("Getting info for {}".format(key))
    try:
        with urllib.request.urlopen(api_call) as response:
            html = response.read()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
        # No response body was read: skip this endpoint instead of parsing an
        # undefined (or stale, from the previous iteration) `html`.
        continue
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
        continue

    # The response body is bytes; decode it before parsing the JSON payload.
    plaza_list = json.loads(html.decode('utf-8'))
    if debug:
        print(json.dumps(plaza_list, sort_keys=True, indent=4))
    call_results.append(plaza_list)

Getting info for monocots_v4
Getting info for dicots_v4


In [6]:
# Build `genomes`: common name -> {'url', 'id', 'name'} for every entry in the
# API responses that provides a genome FASTA location.
genomes = {}

for plaza_list in call_results:

    for item in plaza_list:
        common_name = item['common_name']
        print(common_name)

        # Testing mode: only the configured test genome is processed.
        if debug and common_name != genome_for_testing:
            print("\tSkipping in testing mode")
            continue

        # An entry with this common name was already stored (from this or an
        # earlier response); the first one wins.
        if common_name in genomes:
            print("\tGenome for {} already captured".format(common_name))
            continue

        try:
            # The eco type is part of the display name only when it is set.
            if item['eco_type'] is None:
                name = "{common_name} {version}".format(**item)
            else:
                name = "{common_name} {eco_type} {version}".format(**item)
            # The id is the display name with spaces replaced by underscores.
            gid = name.replace(' ', '_')
            url = item['fasta']['genome']['location']

            genomes[common_name] = {'url': url, 'id': gid, 'name': name}
        except TypeError:
            # Raised when a nested field is not a mapping, e.g. 'genome' is an
            # empty string, so indexing it with 'location' fails.
            print("\n!!! Not all necessary fields are provided !!!")
            print(json.dumps(item, sort_keys=True, indent=4))
            print("\n")

        except Exception:
            # Any other problem with this entry (e.g. a missing key) is
            # reported and the entry skipped. Unlike a bare `except:`, this
            # lets KeyboardInterrupt and SystemExit propagate.
            print("Error")
            print(json.dumps(item, sort_keys=True, indent=4))

Amborella trichopoda
Ananas comosus
Arabidopsis thaliana
Brachypodium distachyon
Chlamydomonas reinhardtii
Elaeis guineensis
Hordeum vulgare
Marchantia polymorpha
Micromonas commoda
Musa acuminata
Oropetium thomaeum
Oryza brachyantha
Oryza sativa ssp. indica
Oryza sativa ssp. japonica
Phalaenopsis equestris
Phyllostachys edulis
Physcomitrella patens
Picea abies
Populus trichocarpa
Selaginella moellendorffii
Setaria italica
Solanum lycopersicum
Sorghum bicolor
Spirodela polyrhiza
Triticum aestivum
Vitis vinifera
Zea mays
Zostera marina
Zoysia japonica ssp. nagirizaki
Actinidia chinensis
Amaranthus hypochondriacus

!!! Not all necessary fields are provided !!!
{
    "URL": "https://phytozome.jgi.doe.gov/pz/portal.html#!info?alias=Org_Ahypochondriacus_er",
    "common_name": "Amaranthus hypochondriacus",
    "eco_type": null,
    "fasta": {
        "cds": "",
        "genome": "",
        "proteome": ""
    },
    "gff": [],
    "pubmed_id": "28854926",
    "species": "ahy",
    "tax_id":

In [7]:
# Report how many entries ended up in `genomes` (one per unique common name).
print("\nGenomes found on PLAZA FTP: {}".format(len(genomes)))


Genomes found on PLAZA FTP: 70


## Write genomes to file

In [8]:
# Collect the genome records ordered alphabetically by their dictionary key
# (the common name), then render them through the template into genomes.yaml.
genome_list = [genomes[common_name] for common_name in sorted(genomes)]

with open('genomes.yaml', 'w') as yaml_file:
    template = Environment().from_string(genome_j2)
    yaml_file.write(template.render(genomes=genome_list))

## Get complete yaml file

In [9]:
# Make sure genomes.yaml ends with a newline before concatenating.
# printf is used because whether `echo "\n"` emits a newline or a literal
# backslash-n depends on the shell.
!printf '\n' >> genomes.yaml
# Concatenate the generated genome list with data_managers.yaml (which must
# already exist in the working directory) into the final file.
!cat genomes.yaml data_managers.yaml > genome_data_manager.yaml