In [1]:
import shutil
import urllib.request as request
from contextlib import closing
import pandas as pd

In [2]:
def ftp_download(url, timeout=None):
    """Download ``url`` into the current working directory.

    The local file name is the last ``/``-separated component of ``url``.
    Data is streamed into ``<filename>.part`` first and renamed to
    ``<filename>`` only after the transfer completed, so an interrupted
    download never leaves a truncated file under the final name.

    Parameters
    ----------
    url : str
        URL to fetch; it is handed to ``urllib.request.urlopen``.
    timeout : float, optional
        Timeout in seconds forwarded to ``urlopen``. ``None`` (the default)
        does not pass a timeout at all, which keeps urllib's own default.

    Returns
    -------
    str or None
        The local file name on success, ``None`` if the transfer timed out
        (in which case ``"Timed out."`` is printed).

    Raises
    ------
    urllib.error.URLError
        For any failure reported by urllib that is not a timeout.
    """
    import os
    import socket
    from urllib.error import URLError

    filename = url.split("/")[-1]
    partial = filename + ".part"
    open_kwargs = {} if timeout is None else {"timeout": timeout}

    try:
        with closing(request.urlopen(url, **open_kwargs)) as r:
            with open(partial, 'wb') as f:
                shutil.copyfileobj(r, f)
        # Publish the file only once it is complete.
        os.replace(partial, filename)
        return filename
    except (TimeoutError, socket.timeout):
        # socket.timeout is only an alias of TimeoutError from Python 3.10 on.
        print("Timed out.")
        return None
    except URLError as err:
        # urllib wraps connection-level failures; unwrap to spot timeouts.
        if isinstance(err.reason, (TimeoutError, socket.timeout)):
            print("Timed out.")
            return None
        raise
    finally:
        # Drop whatever an unfinished transfer left behind.
        if os.path.exists(partial):
            os.remove(partial)

In [3]:
def get_refseq_accn(url):
    """Fetch an assembly report and return a RefSeq accession from it.

    The report is downloaded from
    ``<url>/<last path component of url>_assembly_report.txt``. The first
    line containing ``RefSeq-Accn`` is treated as the tab-separated column
    header, and the value in that column on the line immediately after it
    is returned.

    Parameters
    ----------
    url : str
        URL of the assembly directory; a trailing ``/`` is ignored.

    Returns
    -------
    str
        The ``RefSeq-Accn`` field of the first row after the header.

    Raises
    ------
    IndexError
        If no header line is found, or no row follows it.
    ValueError
        If ``RefSeq-Accn`` occurs in the header line but not as a column.
    """
    file_ext = "assembly_report.txt"
    key = 'RefSeq-Accn'

    # Build the report URL from the directory name.
    base = url.rstrip('/')
    file_prefix = base.split('/')[-1]
    report_url = base + "/" + file_prefix + "_" + file_ext

    # Download the page content and release the connection right away.
    with closing(request.urlopen(report_url)) as response:
        raw = response.read()

    # Split on the real line endings (CRLF or LF) and decode each line,
    # instead of parsing the repr() of the bytes object.
    lines = [chunk.decode("utf-8", errors="replace") for chunk in raw.splitlines()]

    # Locate the header line in a single pass.
    idx = next((i for i, line in enumerate(lines) if key in line), None)
    if idx is None:
        raise IndexError("no line containing {!r} in {}".format(key, report_url))
    if idx + 1 >= len(lines):
        raise IndexError("no data row after the header in {}".format(report_url))

    # Extract the RefSeq accession.
    header = lines[idx].split('\t')
    values = lines[idx + 1].split('\t')
    return values[header.index(key)]

In [9]:
def build_virus_host_pairs(pair_summary, refseq_summary, output_name):
    """Append ``virus refseq id <TAB> host refseq accession`` lines to a file.

    Parameters
    ----------
    pair_summary : str
        Path of a tab-separated table providing the columns ``refseq id``,
        ``host tax id`` and ``host lineage``. Only rows whose host lineage
        contains ``"Bacteria"`` are processed.
    refseq_summary : str
        Path of a tab-separated table whose first line is skipped and which
        provides the columns ``species_taxid``, ``refseq_category`` and
        ``ftp_path``.
    output_name : str
        File that receives one line per matched pair. It is opened in append
        mode, so existing content is kept.

    Notes
    -----
    For every virus row the host is looked up among the assemblies whose
    ``refseq_category`` contains ``"representative"``. Rows without such an
    assembly, or without a host tax id, are skipped. When several assemblies
    match, a warning is printed and the first one is used. The row index of
    every written pair is printed as a progress indicator.
    """
    ref  = pd.read_csv(refseq_summary, delimiter='\t', low_memory=False, skiprows=1)
    pair = pd.read_csv(pair_summary, delimiter='\t')
    pair = pair[pair['host lineage'].str.contains("Bacteria", na=False)]

    # The category filter does not depend on the row, so apply it once.
    # na=False keeps missing categories from poisoning the boolean mask.
    representative = ref[ref["refseq_category"].str.contains("representative", na=False)]

    # Many viruses share a host; fetch each host's assembly report only once.
    accn_by_ftp_path = {}

    for index, row in pair.iterrows():
        virus_refseq = row['refseq id']
        host_taxid   = row['host tax id']

        # A missing host id cannot be converted to int; nothing to match.
        if pd.isna(host_taxid):
            continue

        record = representative[representative["species_taxid"] == int(host_taxid)]

        if not len(record):
            continue
        elif len(record) > 1:
            print("Suspicious! {}".format(host_taxid))

        ftp_path = record['ftp_path'].to_numpy()[0]
        if ftp_path not in accn_by_ftp_path:
            accn_by_ftp_path[ftp_path] = get_refseq_accn(ftp_path)
        host_refseq = accn_by_ftp_path[ftp_path]

        # Append per row so finished pairs survive an interrupted run.
        with open(output_name, "a") as f:
            f.write("{}\t{}\n".format(virus_refseq, host_refseq))

        print(index)

In [11]:
""" Virus-Host Pair Summary """
pair_summary = ftp_download("ftp://ftp.genome.jp/pub/db/virushostdb/virushostdb.daily.tsv")

""" RefSeq Summary """
refseq_summary = ftp_download("https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt")

""" Output Filename """
output_name = "Virus_Host_DB.txt"

# ftp_download returns None when a transfer timed out; stop here with a
# clear message instead of handing None to the table reader.
if pair_summary is None or refseq_summary is None:
    raise RuntimeError("Download of an input summary file timed out; re-run this cell.")

""" Build pair file """
# The output file is opened in append mode, so re-running this cell adds
# to whatever Virus_Host_DB.txt already contains.
build_virus_host_pairs(pair_summary, refseq_summary, output_name)

35
36
37
38
39
40
41
42
43
44
45
65
66
67
68
69
70
71
73
74
75
77
78
79
80
81
82
83
84
85
86
87
89
90
91
92
93
94
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
114
116
117
118
123
124
125
126
127
130
131
132
144
186
187
188
189
190
191
192
194
195
196
197
201
202
203
204
205
206
208
209
212
213
214
215
216
217
218
219
220
327
328
329
445
473
648
652
653
658
662
663
665
668
676
679
682
684
686
688
692
698
701
704
706
893
904
906
909
910
912
914
915
918
919
920
921
922
924
926
927
928
929
930
932
933
935
936
937
939
941
943
945
946
947
948
949
951
952
953
956
959
960
962
964
967
969
970
971
972
974
979
980
981
982
983
984
985
987
988
990
991
992
993
994
995
997
1001
1002
1003
1005
1007
1009
1010
1012
1015
1021
1022
1023
1025
1027
1028
1031
1032
1033
1034
1035
1037
1038
1042
1043
1046
1047
1050
1053
1055
1057
1058
1063
1066
1067
1070
1071
1074
1076
1079
1080
1081
1083
1086
1087
1088
1334
1335
1336
1860
1861
2004
2005
2006
2007
2008
2009
2189
2190
2191
2192
2193
2194
2195