In [9]:
import csv
import re

import numpy as np
import pandas as pd
import requests
from pyphy import pyphy

In [10]:
# Load the Virus-Host DB table (tab-separated).
virushostdb = pd.read_csv('virushostdb/virushostdb.tsv', sep="\t")

# Keep only the rows that have a host taxid, and only the three columns used
# by the later cells. A single .loc selection followed by .copy() yields an
# independent frame, so the dtype assignment below does not write into a
# slice of `virushostdb` (which would trigger SettingWithCopyWarning).
virus_host_connection = virushostdb.loc[
    virushostdb["host tax id"].notnull(),
    ["virus tax id", "host tax id", "virus name"],
].copy()

# With the null rows removed the host taxid column can be cast to an integer dtype.
virus_host_connection["host tax id"] = virus_host_connection["host tax id"].astype("int32")



In [11]:
# KEGG REST "get" URL for BRITE entry br08620; its text is downloaded and
# scanned for "[TAX:<id>]" entry lines in the following cells.
brite_virus = "https://rest.kegg.jp/get/br:br08620"

In [12]:
# Download the BRITE listing. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of saving (and later parsing) an error page as if it were data.
response = requests.get(brite_virus, timeout=60)
response.raise_for_status()
detail = response.text

# Keep a local copy of the raw text. The encoding is explicit so the write does
# not depend on the platform's default codec.
with open("brite/brite_virus.txt", "w", encoding="utf-8") as output:
    output.write(detail)

In [13]:
# Matches a listing line of the form "<digits> <text>[TAX:<digits>]":
#   group 1 - the leading number (used as the taxid key by the parsing loop)
#   group 2 - the text between that number and the TAX tag (the virus name;
#             the parsing loop strips any remaining bracketed text from it)
#   group 3 - the number inside the TAX tag
rx_entry = re.compile(r'(\d+)\s+(.+)\[TAX:(\d+)\]')

In [14]:
# taxid (int) -> virus name, filled from every line of the downloaded listing
# that matches rx_entry.
taxid_name = {}

virus_type = ""

for raw_line in detail.split("\n"):
    entry = rx_entry.search(raw_line)
    if entry is None:
        # Not an entry line (no "<number> <name>[TAX:<number>]" pattern).
        continue

    # Remove any bracketed text left in the captured name, then trim whitespace.
    cleaned_name = re.sub(r'\[.+\]', '', entry.group(2)).strip()
    taxid_name[int(entry.group(1))] = cleaned_name



In [15]:
# Display the parsed taxid -> name mapping (the whole dict, so the output is long).
taxid_name

{12639: 'Duck hepatitis B virus',
 259931: "Ross's goose hepatitis B virus",
 259898: 'Sheldgoose hepatitis B virus',
 89623: 'Snow goose hepatitis B virus',
 28300: 'Heron hepatitis B virus',
 1128118: 'Parrot hepatitis B virus',
 2018685: 'Tinamou hepatitis B virus',
 2169919: 'Tibetan frog hepatitis B virus',
 2169918: 'Bluegill hepatitis B virus',
 2163996: 'Capuchin monkey hepatitis B virus',
 2107574: 'Domestic cat hepadnavirus',
 10406: 'Ground squirrel hepatitis virus',
 10407: 'Hepatitis B virus',
 2050037: 'Long-fingered bat hepatitis B virus',
 2049933: 'Pomona bat hepatitis B virus',
 1508710: 'Roundleaf bat hepatitis B virus',
 1508711: 'Horseshoe bat hepatitis B virus',
 1508712: 'Tent-making bat hepatitis B virus',
 35269: 'Woodchuck hepatitis virus',
 68416: 'Woolly monkey hepatitis B virus',
 1690672: 'White sucker hepatitis B virus',
 1512278: 'Aglaonema bacilliform virus',
 328670: 'Banana streak GF virus',
 1016853: 'Banana streak IM virus',
 1476909: 'Banana streak

In [16]:
# Back-fill names for viruses that occur in virus_host_connection but were not
# picked up from the BRITE listing. The missing rows are selected once and
# reduced to the first row per virus taxid, instead of rescanning the whole
# frame for every missing taxid.
missing_names = virus_host_connection.loc[
    ~virus_host_connection["virus tax id"].isin(taxid_name.keys()),
    ["virus tax id", "virus name"],
].drop_duplicates(subset="virus tax id", keep="first")

# .tolist() yields native Python scalars, so the keys added here are the same
# kind of object as the int keys created by the BRITE parse (not numpy scalars).
for taxid, virus_name in zip(
    missing_names["virus tax id"].tolist(),
    missing_names["virus name"].tolist(),
):
    # setdefault never overwrites a name that is already present.
    taxid_name.setdefault(taxid, virus_name)

In [17]:
# Spot check: the name recorded for taxid 554168.
taxid_name[554168]

'Acanthamoeba castellanii mamavirus'

In [18]:
# Ranks kept in the exported hierarchy, listed from root to leaf. "genome" is
# not taken from the lineage itself: it is added below for the taxid being
# processed, so every virus becomes the leaf of its own path.
desired_rank = ["superkingdom", "clade", "phylum", "class", "order", "family", "genus", "species", "genome"]

# (parent taxid, child taxid) pairs between consecutive kept ranks.
parent_son = set()

# taxid -> [name, rank] for every node that ends up in the hierarchy.
taxid_taxon = {}

for taxid in taxid_name:
    dict_path = pyphy.getDictPathByTaxid(taxid)
    dict_path["genome"] = taxid

    # Taxid of the closest kept ancestor seen so far on this lineage. None until
    # the first kept rank is found, so a lineage without a "superkingdom" entry
    # no longer produces an edge whose parent is the empty string.
    parent = None

    for rank in desired_rank:
        if rank not in dict_path:
            continue

        node = dict_path[rank]

        if rank != "superkingdom":
            if node == parent:
                # Same taxid as the previous kept rank (e.g. the virus taxid is
                # itself the species): nothing new to record.
                continue
            if parent is not None:
                parent_son.add((parent, node))

        if node not in taxid_taxon:
            # Look the name up only for nodes that are not recorded yet.
            taxid_taxon[node] = [pyphy.getNameByTaxid(node), rank]

        parent = node

In [19]:
# Export every hierarchy node as one CSV row: taxid, name, rank.
# The file is opened once for the whole export instead of being reopened in
# append mode for every row.
with open("brite/taxon.csv", "w", encoding="utf-8") as output:
    # The header line is written unquoted, exactly as before.
    output.write(",".join(["taxid", "name", "rank"]) + "\n")

    # QUOTE_ALL keeps every field wrapped in double quotes; unlike manual
    # f-string quoting, csv.writer also doubles any '"' inside a name so the
    # row stays parseable. lineterminator="\n" keeps the original line endings.
    writer = csv.writer(output, quoting=csv.QUOTE_ALL, lineterminator="\n")
    for taxid, (name, rank) in taxid_taxon.items():
        writer.writerow([str(taxid), str(name), str(rank)])

In [20]:
# Export the parent -> child edges as CSV rows ("from", "to").
# The file is opened once for the whole export instead of being reopened in
# append mode for every edge.
with open("brite/taxon_connections.csv", "w") as output:
    output.write(",".join(["from", "to"]) + "\n")

    # Earlier column layout kept for reference: ["~id", "~from", "~to", "~label"]
    for parent in parent_son:
        output.write(",".join([f'"{parent[0]}"', f'"{parent[1]}"']) + "\n")

In [21]:
# Inspect the rank -> taxid lineage dict that pyphy returns for taxid 259931.
pyphy.getDictPathByTaxid(259931)

{'no rank': 1,
 'species': 12639,
 'genus': 10437,
 'family': 10404,
 'order': 2732515,
 'class': 2732514,
 'phylum': 2732409,
 'kingdom': 2732397,
 'clade': 2559587,
 'superkingdom': 10239}