In [24]:
import json, re, sys
import os

import numpy as np
import pandas as pd

#from Bio.KEGG import REST

In [25]:
# Downsize the dataset to Bacteroidetes genomes only

phylum = "Bacteroidetes"

input_json = "raw_data/cazy_genomes_2022_09_03.json"
output_json = "raw_data/bacteroidetes_2022_09_03.json"

# Stream the JSON-lines input and copy every record of the wanted phylum to the
# output as it is read. Both handles are closed by the `with` block, and no
# intermediate string holding the whole filtered dataset is built.
with open(input_json, "r", encoding="utf-8") as source, \
        open(output_json, "w", encoding="utf-8") as output:
    for line in source:
        # Tolerate blank lines (e.g. a trailing empty line) instead of
        # crashing inside json.loads.
        if not line.strip():
            continue

        data = json.loads(line)

        # The first element of the "phylum" taxonomy entry is compared with
        # the requested phylum name; records without a phylum are dropped.
        if "phylum" in data["taxonomy"] and data["taxonomy"]["phylum"][0] == phylum:
            output.write(json.dumps(data) + "\n")

In [26]:
### Generate the genome CSV

genome_file = "raw_data/bacteroidetes_2022_09_03.json"

cellulose_degrader_file = "raw_data/cellulose_info.tsv"

output_folder = "data_for_neo4j_bacteroidetes"

# First TSV column -> second TSV column. The keys are looked up by genome name
# and the values compared with "no cellulose degrader" when genome.csv is
# written further down in this cell.
genome_degrader = {}

with open(cellulose_degrader_file, 'r') as degrader_table:
    # The first line is a header; the default keeps an empty file from raising.
    next(degrader_table, None)

    for line in degrader_table:
        fields = line.strip().split("\t")

        # Blank or single-column rows carry no usable annotation; skip them
        # rather than failing with an IndexError on fields[1].
        if len(fields) < 2:
            continue

        genome_degrader[fields[0]] = fields[1]

# Taxonomic ranks that are exported, ordered from the root downwards.
desired_rank = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]

# name -> [taxid, rank]; genomes are stored here too, with rank "genome".
taxon_dict = {}

# (parent taxon, child taxon) pairs and (lowest taxon, genome name) pairs.
parent_son = set()
taxon_genome = set()

# Make the cell runnable on a fresh checkout where the folder does not exist.
os.makedirs(output_folder, exist_ok=True)

# Column headers of every CSV file produced by this cell. Each file is
# (re)created here with only its header; rows are appended later.
csv_headers = {
    "taxon.csv": ["name", "taxid", "rank"],
    "genome.csv": ["name", "cellulose"],
    "taxon_connections.csv": ["from", "to"],
    "genome_connections.csv": ["from", "to"],
    "has_cazy.csv": ["from", "to", "amount"],
}

for csv_name, columns in csv_headers.items():
    with open(f"{output_folder}/{csv_name}", "w") as output:
        output.write(",".join(columns) + "\n")

# Walk the genome JSON-lines file once, collecting taxa and lineage edges in
# taxon_dict / parent_son / taxon_genome and appending the CAZy family counts
# of every genome to has_cazy.csv. The output file is opened a single time
# instead of once per CAZy entry, and both handles are closed by the `with`.
#
# NOTE(review): taxon_dict is keyed by name only, so a genome whose name equals
# a taxon name replaces that taxon's entry (and a taxon seen after a same-named
# genome is not registered) - confirm this cannot happen in the data.
with open(genome_file, 'r') as genomes, \
        open(f"{output_folder}/has_cazy.csv", "a") as cazy_output:
    for line in genomes:
        genome = json.loads(line)
        taxonomy = genome["taxonomy"]

        # Only genomes with a Bacteria/Archaea superkingdom are exported.
        if "superkingdom" not in taxonomy or taxonomy["superkingdom"][0] not in ["Bacteria", "Archaea"]:
            continue

        taxid = genome["taxid"]
        name = genome["name"]

        # Name of the closest rank seen so far while descending the lineage.
        parent_taxon = ""
        for rank in desired_rank:
            if rank not in taxonomy:
                continue

            taxon, taxon_taxid = taxonomy[rank]

            if taxon not in taxon_dict:
                taxon_dict[taxon] = [taxon_taxid, rank]

            # The superkingdom is the root and therefore has no parent edge;
            # every other rank is linked to the previous rank that was present.
            if rank != "superkingdom":
                parent_son.add((parent_taxon, taxon))

            parent_taxon = taxon

        taxon_dict[name] = [taxid, "genome"]
        # The genome hangs below the lowest rank available for it.
        taxon_genome.add((parent_taxon, name))

        # Embedded double quotes are doubled so the quoted CSV fields stay valid.
        quoted_name = '"' + str(name).replace('"', '""') + '"'
        for cazy, amount in genome["cazy"].items():
            quoted_family = '"' + str(cazy).replace('"', '""') + '"'
            cazy_output.write(",".join([quoted_name, quoted_family, f'{amount}']) + "\n")




def _csv_quoted(value):
    """Return ``value`` as a double-quoted CSV field with embedded quotes doubled."""
    return '"' + str(value).replace('"', '""') + '"'


# Split taxon_dict into taxon.csv (real taxa) and genome.csv (entries whose rank
# is "genome"). Each file is opened once instead of once per row.
with open(f"{output_folder}/taxon.csv", "a") as taxon_output, \
        open(f"{output_folder}/genome.csv", "a") as genome_output:
    for name, (entry_taxid, entry_rank) in taxon_dict.items():
        if entry_rank != "genome":
            taxon_output.write(
                ",".join([_csv_quoted(name), _csv_quoted(entry_taxid), _csv_quoted(entry_rank)]) + "\n"
            )
            continue

        # Cellulose flag written to genome.csv:
        #   0 - listed in genome_degrader as "no cellulose degrader"
        #   1 - listed in genome_degrader with any other value
        #   2 - not listed in genome_degrader at all
        if name not in genome_degrader:
            cellulose = '2'
        elif genome_degrader[name] == "no cellulose degrader":
            cellulose = '0'
        else:
            cellulose = '1'

        genome_output.write(",".join([_csv_quoted(name), cellulose]) + "\n")

# Both edge files share the same two-column "from,to" layout.
for csv_name, edges in (("taxon_connections.csv", parent_son),
                        ("genome_connections.csv", taxon_genome)):
    with open(f"{output_folder}/{csv_name}", "a") as output:
        for parent in edges:
            output.write(",".join([_csv_quoted(parent[0]), _csv_quoted(parent[1])]) + "\n")

In [27]:
# extract two files from CAZy database: subfamilies and activities

import json
import re

input_file = "./raw_data/cazy_info_2022_09_03.json"

# Lines are collected in lists and joined once at the end instead of growing
# two strings with repeated concatenation.
relation_lines = ["from\tto\taction\n"]
activity_lines = []

with open(input_file, 'r', encoding="utf-8") as cazy_info:
    for line in cazy_info:
        data = json.loads(line)
        name = data["name"]

        # A name containing "_" is treated as a subfamily of the part before
        # the first underscore.
        if "_" in name:
            parent = name.split("_")[0]
            relation_lines.append(f'{name}\t{parent}\tIS_A\n')

        if "activity" in data:
            # Drops every <a ...>...</a> element together with its link text.
            activity = re.sub(r"<a.+?<\/a>", "", data["activity"])
            # Tabs and line breaks inside the text would split the TSV row,
            # so they are replaced by a single space.
            activity = re.sub(r"[\t\r\n]+", " ", activity)
            activity_lines.append(f"{name}\t{activity}\n")

relation = "".join(relation_lines)
cazy_activity = "".join(activity_lines)

# Both outputs are written as UTF-8 and closed by their `with` blocks.
with open("./raw_data/cazy_subfamily_relation.tsv", 'w', encoding="utf-8") as relation_output:
    relation_output.write(relation)

with open("./raw_data/cazy_activity.tsv", 'w', encoding="utf-8") as activity_output:
    activity_output.write(cazy_activity)


In [28]:
import sys

# Output: one-column list of term names, written at the end of this cell.
sugar_file = "./raw_data/substrate.tsv"
# Input: indented term hierarchy that is read line by line in this cell.
mesh_indent_file = "./raw_data/polysaccharides_mesh.txt"

def tab_level(astr):
    """Return the number of leading space characters in ``astr``.

    Only the space character is counted; a leading tab (or any other
    character) ends the count.
    """
    depth = 0
    for char in astr:
        if char != ' ':
            break
        depth += 1
    return depth

# Every term found in the hierarchy file (lower-cased, surrounding whitespace removed).
sugar_set = set()
# is_header = False
# for line in open(sugar_file, 'r'):
#     if is_header == False:
#         is_header = True
#     else:
#         sugar_set.add(line.strip())

# Width of one indentation level. The value below is only a fallback: it is
# replaced by the indent of the first indented line found in the file.
indent_space = 4
indent_detected = False

# indent width -> most recent term seen at that indent.
indent_text = {}

relation_lines = ["from\tto\n"]

with open(mesh_indent_file, 'r') as mesh_file:
    for line in mesh_file:
        if line.strip() == "":
            continue

        line = line.lower()
        term = line.strip()
        indent = tab_level(line)

        indent_text[indent] = term
        sugar_set.add(term)

        # Unindented terms are roots and have no parent.
        if indent == 0:
            continue

        # Take the level width from the first line that is actually indented.
        # Using the second non-blank line unconditionally gave a width of 0
        # when that line was another root, which made every later term its
        # own parent.
        if not indent_detected:
            indent_space = indent
            indent_detected = True

        parent_indent = indent - indent_space
        if parent_indent not in indent_text:
            raise ValueError(
                f"No parent at indent {parent_indent} for term {term!r} (indent {indent})"
            )

        relation_lines.append(f'{term}\t{indent_text[parent_indent]}' + "\n")

relation = "".join(relation_lines)

with open("./data_for_neo4j_bacteroidetes/polysaccharide_is_a.tsv", 'w') as f:
    f.write(relation)

# Sorted so that the file content is identical from one run to the next.
with open(sugar_file, 'w+') as f:
    f.write("name\n" + "\n".join(sorted(sugar_set)))

In [29]:
## generate the cazy CSVs

info_file = "raw_data/cazy_info_2022_09_03.json"

output_folder = "data_for_neo4j_bacteroidetes"
# Only referenced by the disabled EC export kept at the end of this cell.
rx_ec = re.compile(r'\b\d+\.\d+\.\d+\.\d+\b')

# cazy.csv is opened once, as UTF-8 for both the header and the rows, and the
# input handle is closed by the `with` block.
with open(info_file, 'r', encoding="utf-8") as cazy_info, \
        open(f"{output_folder}/cazy.csv", "w", encoding="utf-8") as output:
    output.write(",".join(["name", "activities"]) + "\n")

    for line in cazy_info:
        cazy_line = json.loads(line)

        cazy = cazy_line["name"]
        activities = ""

        if "activity" in cazy_line:
            # Removes the markup tags but keeps the text between them.
            activities = re.sub(r'<.+?>', '', cazy_line["activity"])

        # Embedded double quotes are doubled so the quoted fields stay valid CSV.
        quoted_fields = ['"' + str(value).replace('"', '""') + '"' for value in (cazy, activities)]
        output.write(",".join(quoted_fields) + "\n")

# --- Disabled EC-number export, kept for reference ---------------------------
# NOTE(review): the paths below lack a "/" after {output_folder}.
#
# with open("ec.csv", "w") as output:
#     output.write(",".join(["name", "sysname", "reaction"]) + "\n")

# with open("cazy_ec.csv", "w") as output:
#     output.write(",".join(["from", "to"]) + "\n")

#unique_ec = set()

# (inside the per-record loop)
#     if "ec" in cazy_line:
#         ecs = cazy_line["ec"]

#         for ec in ecs:
#             match_ec = rx_ec.match(ec)

#             if match_ec:
#                 with open(f"{output_folder}cazy_ec.csv", "a") as output:
#                     output.write(",".join([f'"{cazy}"', f'"{ec}"']) + "\n")

#                 unique_ec.add(ec)

# for ec in unique_ec:
#     with open(f"{output_folder}ec.csv", "a") as output:
#         output.write(",".join([f'"{ec}"']) + "\n")