In [2]:
import numpy as np
import pandas as pd
import taxonomy as t
import re
from bitstring import BitArray, BitStream

# Path of the PAF alignment file that the loading cell reads.
paf = "contig.paf"

In [9]:
# Load the PAF alignments, one row per alignment, indexed by contig (query) name
# for quick lookup. The mqua, tp, cm, opt1 and opt2 fields are not read.
paf_fields = ['qname','qlen','qstart','qend','strand','tname','tlen','tstart','tend','match_bp','mapping_bp','mqua','tp','cm','score','opt1','opt2']
kept_fields = [field for field in paf_fields if field not in ('mqua', 'tp', 'cm', 'opt1', 'opt2')]

df = pd.read_csv(
    paf,
    sep='\t',
    header=None,
    names=paf_fields,
    usecols=kept_fields,
    index_col=['qname'],
)

# Strip the 's1:i:' tag prefix so the score is numeric, and mirror the index
# into a regular 'ctg' column so it can be used as a sort / de-duplication key.
df = df.assign(
    score=df['score'].str.replace('s1:i:','').astype(int),
    ctg=df.index,
)

In [4]:
# Distinct contig names; the qname index repeats a name once per alignment row.
df.index.unique()

Index(['11153_001_HostRemoved_074501', '11153_001_HostRemoved_107444',
       '11153_001_HostRemoved_029610', '11153_001_HostRemoved_075706',
       '11153_001_HostRemoved_046140', '11153_001_HostRemoved_043003',
       '11153_001_HostRemoved_037615', '11153_001_HostRemoved_101400',
       '11153_001_HostRemoved_056945', '11153_001_HostRemoved_041076',
       ...
       '11153_001_HostRemoved_103500', '11153_001_HostRemoved_031612',
       '11153_001_HostRemoved_014908', '11153_001_HostRemoved_067848',
       '11153_001_HostRemoved_019428', '11153_001_HostRemoved_028855',
       '11153_001_HostRemoved_020332', '11153_001_HostRemoved_052392',
       '11153_001_HostRemoved_084248', '11153_001_HostRemoved_038702'],
      dtype='object', name='qname', length=3707)

In [5]:
# Total number of alignment rows loaded into df.
print(len(df))

# Every alignment row recorded for one example contig.
df.loc['11153_001_HostRemoved_074501']

20001


Unnamed: 0_level_0,qlen,qstart,qend,strand,tname,tlen,tstart,tend,match_bp,mapping_bp,score,ctg
qname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11153_001_HostRemoved_074501,939,16,895,+,NC_021252.1,8948591,7603144,7604005,144,879,140,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,16,895,-,NZ_CP008953.1,8961318,1017093,1017954,144,879,140,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,16,895,+,NC_018266.1,10246920,8793356,8794217,139,879,135,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,16,895,+,NC_022116.1,10246864,8793300,8794161,139,879,135,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,16,895,+,NC_017186.1,10236779,8783442,8784303,139,879,135,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,16,895,+,NC_014318.1,10236715,8783383,8784244,139,879,135,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,7,887,+,NZ_CP009110.1,7237391,5968403,5969268,119,880,116,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,7,895,+,NC_013093.1,8248144,7392108,7392981,113,888,110,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,7,895,+,NZ_CP023445.1,8131572,7259170,7260043,113,888,110,11153_001_HostRemoved_074501


In [7]:
# Several targets can align to the same query interval of a contig.
# Order the rows by contig and score (both descending) so the best-scoring
# alignment comes first, then keep only that first row for every
# (ctg, qstart, qend) interval.
df = (
    df.sort_values(by=['ctg', 'score'], ascending=False)
      .drop_duplicates(subset=['ctg','qstart','qend'], keep='first')
)

df.loc['11153_001_HostRemoved_074501']

Unnamed: 0_level_0,qlen,qstart,qend,strand,tname,tlen,tstart,tend,match_bp,mapping_bp,score,ctg
qname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11153_001_HostRemoved_074501,939,16,895,+,NC_021252.1,8948591,7603144,7604005,144,879,140,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,7,887,+,NZ_CP009110.1,7237391,5968403,5969268,119,880,116,11153_001_HostRemoved_074501
11153_001_HostRemoved_074501,939,7,895,+,NC_013093.1,8248144,7392108,7392981,113,888,110,11153_001_HostRemoved_074501


In [117]:
def get_extra_regions(qstart, qend, qlen):
    """
    Return the parts of an alignment that are not yet covered on the contig.

    The module-level ``mask`` (a BitArray with one bit per contig base) records
    which query positions earlier alignments already covered. This function
    reports the positions of [qstart, qend) that are still unset in ``mask``
    and then marks the whole interval as covered.

    Parameters
    ----------
    qstart : int
        Query start of the alignment, 0-based and inclusive (PAF convention).
    qend : int
        Query end of the alignment, 0-based and exclusive (PAF convention).
    qlen : int
        Length of the contig; must equal ``len(mask)``.

    Returns
    -------
    list of (int, int)
        0-based, half-open (start, end) spans of newly covered positions,
        in ascending order. Empty when the alignment adds nothing new.

    Side effects
    ------------
    Rebinds the global ``mask`` to ``mask | <this alignment>``.
    """
    global mask
    qstart, qend, qlen = int(qstart), int(qend), int(qlen)

    # PAF query coordinates are half-open, so the alignment spans exactly
    # qend - qstart positions and the bitmap is exactly qlen bits long
    # (bitwise operators on bitstrings require equal lengths).
    add = BitArray("0b%s%s%s" % ("0"*qstart, "1"*(qend-qstart), "0"*(qlen-qend)))

    # add & (mask ^ add) keeps the bits set in `add` but not in `mask`.
    add_mask = add&(mask^add)
    mask = mask|add

    return [match.span() for match in re.finditer('1+', add_mask.bin)]

def aggregate_ctg(p, tname, qlen, qstart, qend, match_bp, mapping_bp):
    """
    Add one alignment to the per-rank, per-taxon statistics of a contig.

    The target accession is resolved to a lineage through the ``taxonomy``
    module (``t.acc2taxid`` and ``t.taxid2lineageDICT``); ``lineage[rank]`` is
    read as a mapping with 'name' and 'taxid' entries for every rank listed
    below. NOTE(review): confirm that the lineage always contains all eight
    ranks, otherwise ``lineage[rank]`` raises KeyError.

    Parameters
    ----------
    p : dict
        Accumulator, updated in place: ``p[rank][name]`` holds 'TAXID',
        'pcov' (BitArray of covered query positions), 'TOL_HIT_LEN',
        'NUM_HIT', 'TOL_MISM' and 'AVG_IDT'.
    tname : str
        Target (reference) accession of the alignment.
    qlen : int
        Contig length.
    qstart, qend : int
        Query interval of the alignment, 0-based half-open (PAF convention).
    match_bp : int
        Number of matching bases in the alignment.
    mapping_bp : int
        Alignment block length.

    Returns
    -------
    None
    """
    ranks = ["superkingdom","phylum","class","order","family","genus","species","strain"]
    taxid  = t.acc2taxid(tname)
    lineage = t.taxid2lineageDICT(taxid, 1, 1)

    qlen, qstart, qend = int(qlen), int(qstart), int(qend)

    # Positions covered by this alignment. It is the same for every rank, so
    # build it once; half-open coordinates give exactly qlen bits in total.
    hit = BitArray("0b%s%s%s" % ("0"*qstart, "1"*(qend-qstart), "0"*(qlen-qend)))

    for rank in ranks:
        name = lineage[rank]['name']

        # Create the rank / taxon entries before anything is stored in them.
        taxa = p.setdefault(rank, {})
        if name not in taxa:
            taxa[name] = {
                'pcov':        BitArray("0b%s" % ("0"*qlen)),
                'TOL_HIT_LEN': 0,
                'NUM_HIT':     0,
                'AVG_IDT':     0,
                'TOL_MISM':    0,  # running integer total, not a list
            }

        stats = taxa[name]
        stats['TAXID']        = lineage[rank]['taxid']
        stats['pcov']         = stats['pcov'] | hit
        stats['TOL_HIT_LEN'] += mapping_bp # total mapped bp
        stats['NUM_HIT']     += 1 # number of hits
        stats['TOL_MISM']    += mapping_bp - match_bp # total non-matching alignment columns

        # Leave AVG_IDT untouched while nothing has been aligned, instead of dividing by zero.
        if stats['TOL_HIT_LEN']:
            stats['AVG_IDT'] = (stats['TOL_HIT_LEN'] - stats['TOL_MISM'])/stats['TOL_HIT_LEN']

def genome_seq_cov(ctg_df):
    """
    Work out, for each alignment of one contig, which query regions it newly covers.

    Alignments are processed in row order. ``get_extra_regions`` reads and
    updates the module-level ``mask``, so this function resets ``mask`` to an
    all-zero bitmap of the contig length before the first row.

    Parameters
    ----------
    ctg_df : pandas.DataFrame or pandas.Series
        The alignment rows of a single contig, with at least the 'qlen',
        'qstart' and 'qend' columns. A Series (what ``df.loc[name]`` returns
        for a contig with exactly one alignment) is treated as a one-row frame.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of the input with an added 'extra_regions' column holding the
        list returned by ``get_extra_regions`` for that row, and the
        per-rank summary dict ``p``.
    """
    global mask

    if isinstance(ctg_df, pd.Series):
        ctg_df = ctg_df.to_frame().T

    # Work on a copy so the caller's frame (usually a slice of df) is not modified.
    regions = ctg_df.copy()
    p = {}

    if regions.empty:
        regions['extra_regions'] = []
        return regions, p

    # All rows belong to the same contig, so the first row's qlen is the contig length.
    qlen = int(regions['qlen'].iloc[0])
    mask = BitArray("0b%s" % ("0"*qlen))

    regions['extra_regions'] = [
        get_extra_regions(int(qstart), int(qend), qlen)
        for qstart, qend in zip(regions['qstart'], regions['qend'])
    ]

    # NOTE(review): nothing in this function populates p, so the loop below is
    # a no-op for now; presumably aggregate_ctg is meant to fill p per row. TODO confirm.
    for rank in p:
        for name in p[rank]:
            cov_string = str(p[rank][name]['pcov'].bin)
            del p[rank][name]['pcov']
            p[rank][name]['LINEAR_LEN'] = cov_string.count('1')

    return regions, p


genome_seq_cov(df.loc['11153_001_HostRemoved_074501'])

In [None]:
def accCov(acc_cov, id, region):
    """
    Claim the still-unassigned positions of a region in an accumulated-coverage string.

    ``acc_cov`` has one character per contig position; '0' marks a position
    that no label has claimed yet. Every '0' inside ``region`` is replaced by
    ``id``; positions that already carry another label are left untouched.

    Parameters
    ----------
    acc_cov : str
        Accumulated coverage string.
    id : str
        Single-character label to write.
    region : (int, int)
        (qs, qe) interval, 1-based and inclusive on both ends. A 0-based
        half-open span (start, end) must be passed as (start + 1, end).
        The interval is clipped to the length of ``acc_cov``.

    Returns
    -------
    str
        The updated coverage string, same length as ``acc_cov``.

    Raises
    ------
    ValueError
        If ``id`` is not exactly one character, since that would change the
        length of the coverage string.
    """
    if len(id) != 1:
        raise ValueError("id must be a single character")

    (qs, qe) = region

    # Convert the 1-based inclusive interval to a Python slice, clipped to the string.
    lo = max(int(qs), 1) - 1
    hi = min(int(qe), len(acc_cov))
    if hi <= lo:
        return acc_cov

    # Only the uncovered ('0') positions inside the region take the new label.
    claimed = acc_cov[lo:hi].replace("0", id)
    return acc_cov[:lo] + claimed + acc_cov[hi:]

def accCovSummary(acc_cov, map):
    """
    Summarise an accumulated-coverage string as regions per label.

    The string is split into runs of identical characters. Each run is
    translated to a label through ``map[ord(character)]`` and reported as a
    1-based inclusive "start..end" region. Consecutive runs that translate to
    the same label and touch each other are merged into one region.

    Parameters
    ----------
    acc_cov : str
        Accumulated coverage string, one character per contig position.
    map : dict
        Lookup from character code (``ord`` of the character) to label.

    Returns
    -------
    dict
        ``{label: ["start..end", ...]}`` with regions in ascending order.

    Raises
    ------
    KeyError
        If a character of ``acc_cov`` has no entry in ``map``.
    """
    spans_by_tax = {}

    for run in re.finditer(r'(.)\1*', acc_cov, flags=re.DOTALL):
        # Match offsets are 0-based half-open; regions are 1-based inclusive.
        qs, qe = run.start() + 1, run.end()
        tax = map[ord(run.group(1))]

        spans = spans_by_tax.setdefault(tax, [])
        if spans and spans[-1][1] + 1 == qs:
            # Directly adjacent to the previous region of this label: extend it.
            spans[-1][1] = qe
        else:
            spans.append([qs, qe])

    return {
        tax: ["%d..%d" % (start, end) for start, end in spans]
        for tax, spans in spans_by_tax.items()
    }

NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1
NC_021252.1
NZ_CP008953.1
NC_018266.1
NC_022116.1
NC_017186.1
NC_014318.1
NZ_CP009110.1
NC_013093.1
NZ_CP023445.1


In [None]:

            foreach my $region ( keys %{$seq->{$pname}->{$taxid}} ){
                my ($qs, $qe) = $region =~ /^(\d+)\.\.(\d+)$/;
                my $end = length($pcov);
                my $nm = $seq->{$pname}->{$taxid}->{$region};
                
                #update linear length
                my $str = "0"x($qs-1) . "1"x($qe-$qs+1) . "0"x($end-$qe);
                $pcov = $pcov | $str;
                #total mapped
                $p->{$rank}->{$name}->{TOL_HIT_LEN} ||= 0;
                $p->{$rank}->{$name}->{TOL_HIT_LEN} += $qe-$qs+1;
                #number of hits
                $p->{$rank}->{$name}->{NUM_HIT} ||= 0;
                $p->{$rank}->{$name}->{NUM_HIT}++;

                #distance
                $p->{$rank}->{$name}->{TOL_MISM} ||= 0;
                $p->{$rank}->{$name}->{TOL_MISM} += $nm;
            }
            $p->{$rank}->{$name}->{AVG_IDT} = ($p->{$rank}->{$name}->{TOL_HIT_LEN} - $p->{$rank}->{$name}->{TOL_MISM})/$p->{$rank}->{$name}->{TOL_HIT_LEN};
            $p->{$rank}->{$name}->{LINEAR_LEN} = $pcov;
        }

        # 
        foreach my $name ( keys %{$p->{$rank}} ){
            $p->{$rank}->{$name}->{LINEAR_LEN} =~ s/0//g;
            my $sum = length($p->{$rank}->{$name}->{LINEAR_LEN});

            $p->{$rank}->{$name}->{LINEAR_LEN} = $sum;
            $r->{$rank}->{TOL_LINEAR_LEN} ||= 0;
            $r->{$rank}->{TOL_LINEAR_LEN} += $sum;
        }

        #accumulated coverage
        my $acc_cov;
        $acc_cov = "0"x$length->{$pname};
        my $map;
        my $map->{48}="unclassified";
        my $mid_ascii = 49;

        foreach my $cnt ( sort {$a<=>$b} keys %{$cov->{$pname}} )
        {
            foreach my $taxid ( keys %{$cov->{$pname}->{$cnt}} )
            {
                my $name = taxid2rank($taxid, $rank);    
                #upper taxa
                my $upname = taxid2rank($taxid, $upper_level);
                $upname = "NA" unless $upname;
                $name = "$upname $rank" unless $name;

                unless( defined $map->{$name} ){
                    $map->{$name} = $mid_ascii;
                    $map->{$mid_ascii} = $name;
                    $mid_ascii++;
                }

                my $region = $cov->{$pname}->{$cnt}->{$taxid};
                $acc_cov = &accCov($acc_cov, chr($map->{$name}), $region);
            }
        }
        my $csum = &accCovSummary($acc_cov, $map);
        foreach my $name ( keys %$csum )
        {
            $p->{$rank}->{$name}->{ACC_COV_RGN} = join ";", @{$csum->{$name}};
            my $len=0;
            foreach my $rgn ( @{$csum->{$name}} )
            {
                my ($qs,$qe) = $rgn =~ /(\d+)\.\.(\d+)/;
                $len += $qe-$qs+1;
            }
            $p->{$rank}->{$name}->{ACC_COV_LEN} = $len;
        }

        $upper_level = $rank;
    }