In [40]:
import pandas as pd
import numpy as np

# Loading the reference-name-to-taxon-ID mapping file

In [4]:
# Tab-separated, header-less map with two fields per line: the reference
# sequence name and the taxonomy id it is assigned to.
ref = "/mnt/scratch2/avi/meta-map/kraken/KrakenDB/seqid2taxid.map"
ref_columns = ["refName", "refId"]
with open(ref) as map_file:
    refId2TaxId = pd.read_csv(
        map_file,
        sep="\t",
        header=None,
        names=ref_columns,
    )

In [5]:
# Preview the first rows of the reference-name -> taxonomy-id table.
refId2TaxId.head()

Unnamed: 0,refName,refId
0,gi|9791176|ref|NC_002180.1|,1986029
1,gi|10803547|ref|NC_001869.1|,64091
2,gi|10954488|ref|NC_001732.1|,243232
3,gi|10954532|ref|NC_001733.1|,243232
4,gi|10954552|ref|NC_001773.1|,272844


# Loading Taxonomy Tree
### Taxon ID, Parent Taxon ID, Taxon Rank

In [31]:
# Parse the taxonomy node dump. Each line holds fields separated by "\t|\t"
# and ends in "\t|\n"; only the first three fields are used here:
# taxon id, parent taxon id, rank.
tf = "/mnt/scratch2/avi/meta-map/kraken/KrakenDB/taxonomy/nodes.dmp"
id2rank = {}  # taxon id -> rank label
taxa = []     # one [taxon id, parent id, rank] record per node
with open(tf) as nodes_file:
    for raw_line in nodes_file:
        fields = raw_line.rstrip("\t|\n").split("\t|\t")
        node_id = int(fields[0])
        parent_id = int(fields[1])
        node_rank = fields[2]
        taxa.append([node_id, parent_id, node_rank])
        id2rank[node_id] = node_rank

# Same records as a DataFrame, with named columns.
taxa_df = pd.DataFrame.from_records(taxa)
taxa_df.columns = ["taxaId", "pid", "rank"]

In [32]:
# Preview the first rows of the taxonomy table (taxaId, pid, rank).
taxa_df.head()

Unnamed: 0,taxaId,pid,rank
0,1,1,no rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species


# List of Leaves

Right join of **pid** (parent ID) onto **taxaId** (child ID).
Then keep every taxon that has never been a parent, i.e. whose joined **pid** is **NaN**.

In [74]:
# Right join: every taxon is kept; 'pid' stays NaN for a taxon whose id never
# occurs in the parent-id column.
leaves = taxa_df[['pid']].merge(
    taxa_df[['taxaId', 'rank']],
    how='right',
    left_on='pid',
    right_on='taxaId',
)

In [75]:
# A missing joined parent id means no node lists this taxon as its parent.
never_a_parent = leaves['pid'].isna()
leaves = leaves.loc[never_a_parent, ['taxaId', 'rank']]
print(len(leaves))
leaves.head()

1528122


Unnamed: 0,taxaId,rank
1669515,25,species
1669516,27,species
1669517,28,species
1669518,38,species
1669519,45,species


# List of Roots
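Left join of **pid** (parent ID) onto **taxaId**, attaching to each node the record of its parent. The original idea was to look for any parent that is never a child (joined **taxaId** = **NaN**); the cell below instead selects the rows whose parent is its own parent (**taxaId** == **pid_y**).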

In [81]:
# Left join: attach to every node (pid_x, rank) the record of its parent
# (taxaId, pid_y).
roots = taxa_df[['pid', 'rank']].merge(
    taxa_df[['taxaId', 'pid']],
    how='left',
    left_on='pid',
    right_on='taxaId',
)
# An earlier attempt filtered on a missing parent record (NaN taxaId) and is
# left out; the filter used here keeps rows whose parent record points to itself.
parent_is_own_parent = roots['taxaId'] == roots['pid_y']
roots[parent_is_own_parent]

Unnamed: 0,pid_x,rank,taxaId,pid_y
0,1,no rank,1,1
8284,1,superkingdom,1,1
9909,1,superkingdom,1,1
9931,1,no rank,1,1
12110,1,no rank,1,1
101269,1,no rank,1,1


In [78]:
# Build two lookups from the `taxa` records ([taxon id, parent id, rank]) that
# the loading cell already parsed, instead of opening and tokenising nodes.dmp
# a second time:
#   c2pid : child taxon id -> parent taxon id
#   c2p   : child rank     -> set of ranks seen on the direct parents of
#                             nodes with that rank
c2p = {}
c2pid = {}
for child_id, parent_id, _rank in taxa:
    c2pid[child_id] = parent_id
    # Ranks are looked up through id2rank for both ends of the edge; a parent
    # id that has no node record of its own raises KeyError.
    c2p.setdefault(id2rank[child_id], set()).add(id2rank[parent_id])

# Child -> Parent

In [24]:
# Dump, for each child rank, the set of ranks found on its parents.
for child_rank, parent_ranks in c2p.items():
    heading = '{} --> parent list:'.format(child_rank)
    print(heading)
    print(parent_ranks)
    print('\n')

no rank --> parent list:
{'genus', 'no rank', 'superorder', 'species subgroup', 'superclass', 'infraorder', 'superfamily', 'tribe', 'superkingdom', 'parvorder', 'subtribe', 'suborder', 'cohort', 'subspecies', 'species', 'phylum', 'infraclass', 'order', 'varietas', 'subphylum', 'subfamily', 'subclass', 'family', 'subkingdom', 'class', 'forma', 'species group', 'subgenus', 'kingdom'}


superkingdom --> parent list:
{'no rank'}


genus --> parent list:
{'no rank', 'suborder', 'family', 'class', 'phylum', 'infraclass', 'order', 'superkingdom', 'subtribe', 'superfamily', 'tribe', 'subfamily', 'subphylum', 'subclass'}


species --> parent list:
{'genus', 'no rank', 'suborder', 'family', 'class', 'species', 'phylum', 'species subgroup', 'species group', 'order', 'subgenus', 'superfamily', 'tribe', 'subfamily', 'subclass'}


order --> parent list:
{'no rank', 'superorder', 'cohort', 'class', 'phylum', 'infraclass', 'subphylum', 'superkingdom', 'subclass'}


family --> parent list:
{'no rank', 

# ranks

In [12]:
# Distinct rank labels present in the taxonomy table; print how many there
# are, then display them.
rank_labels = taxa_df['rank'].unique()
print(len(rank_labels))
rank_labels

30


array(['no rank', 'superkingdom', 'genus', 'species', 'order', 'family',
       'subspecies', 'subfamily', 'tribe', 'phylum', 'class', 'forma',
       'suborder', 'subclass', 'varietas', 'kingdom', 'subphylum',
       'superfamily', 'infraorder', 'infraclass', 'superorder', 'subgenus',
       'superclass', 'parvorder', 'superphylum', 'species group',
       'species subgroup', 'cohort', 'subtribe', 'subkingdom'], dtype=object)

# Does the reference list contain both a parent taxon and one of its children?
## yes!

In [87]:
# Inner join of each reference with the taxonomy record of its taxon id.
ref2taxaLevel = refId2TaxId.merge(
    taxa_df,
    left_on="refId",
    right_on="taxaId",
)
# Row counts before and after the join, one per line.
print(len(refId2TaxId), len(ref2taxaLevel), sep="\n")
ref2taxaLevel.head()

4815
4815


Unnamed: 0,refName,refId,taxaId,pid,rank
0,gi|9791176|ref|NC_002180.1|,1986029,1986029,40274,species
1,gi|10803547|ref|NC_001869.1|,64091,64091,2242,no rank
2,gi|15789340|ref|NC_002607.1|,64091,64091,2242,no rank
3,gi|16119979|ref|NC_002608.1|,64091,64091,2242,no rank
4,gi|10954488|ref|NC_001732.1|,243232,243232,2190,no rank


In [27]:
# Distinct taxon ids that have at least one reference sequence.
a = set(ref2taxaLevel.taxaId)

In [29]:
# Distinct parent ids of the reference taxa, looked up through c2pid.
b = {c2pid[taxon_id] for taxon_id in ref2taxaLevel.taxaId}

List of reference taxa that are parents of some children in the same list

In [30]:
# Reference taxon ids that are also the parent id of some reference taxon.
a & b

{316,
 340,
 813,
 1148,
 1604,
 1639,
 2110,
 29447,
 40041,
 91891,
 107806,
 190650,
 192222,
 196627,
 208964,
 523796,
 543891,
 869727,
 907287,
 941967,
 1001534,
 1071763,
 1263406,
 1283330,
 1306414}

# Reference Rank Distribution
### Note: Not all the "no rank"s are the same!!

In [42]:
# Group the references by the rank of their taxon and count the non-null
# entries of every other column within each rank.
ref2taxaLevel.groupby('rank').count()

Unnamed: 0_level_0,refName,refId,taxaId
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no rank,4336,4336,4336
species,465,465,465
subspecies,14,14,14


# Analysis of the truth 

In [None]:
# Path to the truth file; nothing in the visible cells reads it yet.
# NOTE(review): presumably holds the ground-truth taxon assignments for the
# simulated reads - confirm the file format before parsing.
truth = '/mnt/scratch2/avi/meta-map/kraken/meta/truth.txt'
