In [3]:
import json, re, sys
from urllib.error import HTTPError

import numpy as np
import pandas as pd

from Bio.KEGG import REST


In [26]:
### Generate the genome CSV

genome_file = "raw_data/cazy_genomes_2021_05_15.json"

cellulose_degrader_file = "raw_data/cellulose_info.tsv"

# Maps the first TSV column (looked up later by genome name) to the second
# column (the cellulose-degrader annotation string).
genome_degrader = {}

# `with` guarantees the handle is closed even if parsing fails.
with open(cellulose_degrader_file, 'r') as degrader_handle:
    # The first line is a header row; the default avoids StopIteration on an
    # empty file.
    next(degrader_handle, None)

    for line in degrader_handle:
        fields = line.strip().split("\t")

        # Blank lines and rows without a second column cannot be mapped;
        # skip them instead of failing with IndexError.
        if len(fields) < 2:
            continue

        genome_degrader[fields[0]] = fields[1]

# Taxonomic ranks that are exported, listed from the broadest to the most
# specific; the parsing loop walks them in this order.
desired_rank = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]

# Accumulators filled while the genome file is parsed.
taxon_dict = {}
parent_son = set()
taxon_genome = set()

# Create (or truncate) every output CSV and write its header row. The data
# rows are appended to these files afterwards.
header_by_file = {
    "data_for_neo4j/taxon.csv": ["name", "taxid", "rank"],
    "data_for_neo4j/genome.csv": ["name", "cellulose"],
    "data_for_neo4j/taxon_connections.csv": ["from", "to"],
    "data_for_neo4j/genome_connections.csv": ["from", "to"],
    "data_for_neo4j/has_cazy.csv": ["from", "to", "amount"],
}

for csv_path, header_fields in header_by_file.items():
    with open(csv_path, "w") as output:
        output.write(",".join(header_fields) + "\n")

# Parse the genome dump (one JSON object per line). For every genome whose
# superkingdom is Bacteria or Archaea this:
#   * registers each taxon of the lineage in `taxon_dict` as [taxid, rank],
#   * records (parent taxon, child taxon) pairs in `parent_son`,
#   * registers the genome itself in `taxon_dict` with rank "genome",
#   * links the genome to the most specific taxon found via `taxon_genome`,
#   * appends one (genome, CAZy family, amount) row to has_cazy.csv.
#
# NOTE(review): genomes and taxa share `taxon_dict`, so a genome whose name
# equals a taxon name replaces that taxon's entry - confirm this cannot occur.
# NOTE(review): names are wrapped in double quotes without escaping embedded
# quotes - confirm the input never contains a '"' character.
#
# Both files are opened once for the whole loop instead of once per row, and
# are closed deterministically by the `with` statement.
with open(genome_file, 'r') as genome_handle, open("data_for_neo4j/has_cazy.csv", "a") as cazy_output:
    for line in genome_handle:
        # A blank line is not a JSON document; skip it rather than crash.
        if not line.strip():
            continue

        genome = json.loads(line)
        taxonomy = genome["taxonomy"]

        if "superkingdom" not in taxonomy or taxonomy["superkingdom"][0] not in ["Bacteria", "Archaea"]:
            continue

        taxid = genome["taxid"]
        name = genome["name"]

        # `parent` always holds the most specific taxon seen so far. The
        # superkingdom is guaranteed to exist here and comes first in
        # `desired_rank`, so it seeds the chain; missing ranks are skipped.
        parent = ""
        for rank in desired_rank:
            if rank not in taxonomy:
                continue

            taxon, taxon_taxid = taxonomy[rank]

            if taxon not in taxon_dict:
                taxon_dict[taxon] = [taxon_taxid, rank]

            if rank != "superkingdom":
                parent_son.add((parent, taxon))

            parent = taxon

        taxon_dict[name] = [taxid, "genome"]
        taxon_genome.add((parent, name))

        for cazy, amount in genome["cazy"].items():
            cazy_output.write(",".join([f'"{name}"', f'"{cazy}"', f'{amount}']) + "\n")




# Write the node files. Entries of `taxon_dict` with rank "genome" go to
# genome.csv, everything else goes to taxon.csv. Each file is opened once
# (instead of once per row) and `taxon_dict` is walked a single time; the row
# order inside each file is the same as before.
#
# The `cellulose` column of genome.csv encodes:
#   0 - listed in `genome_degrader` as "no cellulose degrader"
#   1 - listed in `genome_degrader` with any other annotation
#   2 - not listed in `genome_degrader`
with open("data_for_neo4j/taxon.csv", "a") as taxon_output, open("data_for_neo4j/genome.csv", "a") as genome_output:
    for name, (node_taxid, node_rank) in taxon_dict.items():
        if node_rank != "genome":
            taxon_output.write(",".join([f'"{name}"', f'"{node_taxid}"', f'"{node_rank}"']) + "\n")
            continue

        if name not in genome_degrader:
            cellulose_flag = '2'
        elif genome_degrader[name] == "no cellulose degrader":
            cellulose_flag = '0'
        else:
            cellulose_flag = '1'

        genome_output.write(",".join([f'"{name}"', cellulose_flag]) + "\n")

# Taxon -> taxon relationships (parent, child).
with open("data_for_neo4j/taxon_connections.csv", "a") as output:
    for parent in parent_son:
        output.write(",".join([f'"{parent[0]}"', f'"{parent[1]}"']) + "\n")

# Taxon -> genome relationships.
with open("data_for_neo4j/genome_connections.csv", "a") as output:
    for parent in taxon_genome:
        output.write(",".join([f'"{parent[0]}"', f'"{parent[1]}"']) + "\n")

In [24]:
## generate the cazy CSVs

info_file = "cazy_info_2021_05_12.json"

# Four dot-separated groups of digits, delimited by word boundaries.
rx_ec = re.compile(r'\b\d+\.\d+\.\d+\.\d+\b')

# Create (or truncate) the three output files and write their header rows;
# the data rows are appended afterwards.
for csv_path, header_fields in (
    ("cazy.csv", ["name", "activities", "clan", "mechanism", "catalytic"]),
    ("ec.csv", ["name", "sysname", "reaction"]),
    ("cazy_ec.csv", ["from", "to"]),
):
    with open(csv_path, "w") as output:
        output.write(",".join(header_fields) + "\n")

# Distinct EC numbers, collected while the CAZy info file is parsed.
unique_ec = set()

# Parse the CAZy family dump (one JSON object per line) and append:
#   * one row per family to cazy.csv,
#   * one (family, EC number) row to cazy_ec.csv for every entry of the
#     family's "ec" list that starts with an EC-number pattern,
# while collecting the distinct EC numbers in `unique_ec`.
#
# The family row is written for EVERY family. Previously the write was nested
# under the `"ec" in cazy_line` check, so a family without an "ec" key never
# reached cazy.csv.
#
# NOTE(review): values are wrapped in double quotes without escaping embedded
# quotes - confirm the input never contains a '"' character.
#
# All files are opened once for the whole loop and closed by the `with`.
with open(info_file, 'r') as info_handle, open("cazy.csv", "a") as cazy_output, open("cazy_ec.csv", "a") as cazy_ec_output:
    for line in info_handle:
        # A blank line is not a JSON document; skip it rather than crash.
        if not line.strip():
            continue

        cazy_line = json.loads(line)

        cazy = cazy_line["name"]
        # Optional attributes default to an empty string. The leading space
        # in " Activities in Family" is part of the key.
        activities = cazy_line.get(" Activities in Family", "")
        clan = cazy_line.get("Clan", "")
        mechanism = cazy_line.get("Mechanism", "")
        catalytic = cazy_line.get("Catalytic Nucleophile/Base", "")

        for ec in cazy_line.get("ec", []):
            # `match` anchors at the start of the string only.
            if rx_ec.match(ec):
                cazy_ec_output.write(",".join([f'"{cazy}"', f'"{ec}"']) + "\n")
                unique_ec.add(ec)

        cazy_output.write(",".join([f'"{cazy}"', f'"{activities}"', f'"{clan}"', f'"{mechanism}"', f'"{catalytic}"']) + "\n")

# Append one quoted EC number per line to ec.csv. Only the first of the three
# header columns is filled in here. The file is opened once, and the numbers
# are sorted so that the row order is the same on every run (plain set
# iteration order is not stable between interpreter sessions).
with open("ec.csv", "a") as output:
    for ec in sorted(unique_ec):
        output.write(f'"{ec}"' + "\n")

In [21]:
### generate the EC CSVs

# For every EC number listed in ec.csv, query KEGG and write
# name, SYSNAME and REACTION to ec_final.csv.
#
# Both files are opened once; ec_final.csv is created/truncated here and
# receives the header followed by one row per EC number.
with open("ec.csv") as ec_input, open("ec_final.csv", "w") as ec_output:
    ec_output.write(",".join(["name", "sysname", "reaction"]) + "\n")

    # Skip the header row of ec.csv.
    next(ec_input, None)

    for line in ec_input:
        name_field = line.strip()
        if not name_field:
            continue

        # ec.csv stores the number wrapped in double quotes; KEGG expects the
        # bare identifier, so the quotes are removed for the query only. The
        # name column of the output keeps the field exactly as it was read.
        ec_number = name_field.strip('"')

        sysname = ""
        reaction = ""

        try:
            kegg_handle = REST.kegg_get(f"ec:{ec_number}")
        except HTTPError as err:
            # An EC number unknown to KEGG is written with empty fields
            # instead of aborting the run; other HTTP errors propagate.
            if err.code != 404:
                raise
        else:
            with kegg_handle:
                for kegg_line in kegg_handle:
                    # Only lines that start with the keyword are captured;
                    # the last matching line wins.
                    if kegg_line.startswith("SYSNAME"):
                        sysname = kegg_line.replace("SYSNAME", "").strip()

                    if kegg_line.startswith("REACTION"):
                        reaction = kegg_line.replace("REACTION", "").strip()

        ec_output.write(",".join([name_field, f'"{sysname}"', f'"{reaction}"']) + "\n")

In [24]:
import requests, json

# GraphQL query: the taxon named "Formosa" with its rank and taxid, plus every
# genome attached to it and, for each genome, its CAZy families with amounts.
query = """query getGenomeCazy {
	taxons (where: { name: "Formosa" }) {
    name
    rank
    taxid
    genomes {
      name
      cazys {name, amount}
    }
  }
}"""

url = 'http://localhost:4000'

# The timeout keeps the cell from hanging forever when the server is down.
r = requests.post(url, json={'query': query}, timeout=30)

# Report a failed request together with the server's reply, instead of
# failing later with an unrelated JSON decoding error.
if not r.ok:
    raise RuntimeError(f"GraphQL request failed ({r.status_code}): {r.text}")

print (r.json())

{'data': {'taxons': [{'name': 'Formosa', 'rank': 'genus', 'taxid': '225842', 'genomes': [{'name': 'Formosa sediminum PS13', 'cazys': [{'name': 'CE14', 'amount': 1}, {'name': 'GH0', 'amount': 1}, {'name': 'GH3', 'amount': 6}, {'name': 'GT30', 'amount': 1}, {'name': 'GH31', 'amount': 3}, {'name': 'GT5', 'amount': 2}, {'name': 'GH107', 'amount': 1}, {'name': 'GH1', 'amount': 1}, {'name': 'CBM47', 'amount': 1}, {'name': 'PL26', 'amount': 1}, {'name': 'GT51', 'amount': 3}, {'name': 'GT0', 'amount': 2}, {'name': 'GT20', 'amount': 1}, {'name': 'CBM50', 'amount': 7}, {'name': 'CBM6', 'amount': 9}, {'name': 'GT28', 'amount': 1}, {'name': 'PL6', 'amount': 3}, {'name': 'CE11', 'amount': 1}, {'name': 'GH92', 'amount': 1}, {'name': 'GH171', 'amount': 3}, {'name': 'GH43', 'amount': 2}, {'name': 'CE9', 'amount': 1}, {'name': 'GH5', 'amount': 2}, {'name': 'GH25', 'amount': 1}, {'name': 'PL29', 'amount': 1}, {'name': 'PL33', 'amount': 1}, {'name': 'GT19', 'amount': 1}, {'name': 'GH97', 'amount': 1}, {'