Author: Dan Shea  
Date: 2019.11.01  
#### Checking for Presence / Absence of Genes using `stringtie` transcripts

We have a resistant ($R$) cultivar WRC17 (aka RGA) and several susceptible ($S$) lines. Aligning RNA-seq reads to the resistant cultivar and assembling transcripts using `stringtie`, we can examine which genes are absent from all of the $S$ lines. These genes are then considered candidates for conferring resistance.

Collapsing overlapping transcripts is essentially what `bedtools merge` does, so the collapsing step below follows the same behavior. A distance parameter, `margin`, controls merging: transcripts that overlap, or that bookend each other within `margin` bp, are merged into a single interval. See the `bedtools merge` documentation (https://bedtools.readthedocs.io/en/latest/content/tools/merge.html) for a graphical explanation of this distance parameter.

In [1]:
import pandas as pd
from collections import OrderedDict

In [2]:
# GTF assembly for the resistant cultivar.
Res_file = 'WRC17.gtf'
# GTF assemblies for the susceptible lines, one file per line.
Sus_files = [
    'Hitome.gtf',
    'RIL33.gtf',
    'RIL36.gtf',
    'RIL48.gtf',
    'RIL52.gtf',
    'RIL54.gtf',
    'RIL71.gtf',
]
# Sample label = the file name up to (not including) its first '.'.
samples = [file_name.partition('.')[0] for file_name in Sus_files]

In [3]:
# Column names applied to every GTF file read below (the files are read without a header row).
gtf_cols = [
    'seqname', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attribute',
]

# Resistant cultivar: tab-separated, with everything after a '#' treated as a comment.
Res_df = pd.read_csv(Res_file, names=gtf_cols, sep='\t', comment='#')

# Susceptible lines: one DataFrame per sample label, kept in the order of `samples`.
Sus_dfs = OrderedDict(
    (label, pd.read_csv(path, names=gtf_cols, sep='\t', comment='#'))
    for label, path in zip(samples, Sus_files)
)

In [4]:
# Preview the resistant-cultivar table as loaded; the output shows both 'transcript' and 'exon' rows.
Res_df

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,1,10201,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
1,flattened_line_13513_10201,StringTie,exon,1,10201,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; ex..."
2,flattened_line_10970_16115,StringTie,transcript,1,16115,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
3,flattened_line_10970_16115,StringTie,exon,1,16115,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; ex..."
4,flattened_line_2206_75763,StringTie,transcript,1,75763,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
...,...,...,...,...,...,...,...,...,...
1375,flattened_line_2840_66107,StringTie,exon,6,66107,1000,.,.,"gene_id ""STRG.681""; transcript_id ""STRG.681.1""..."
1376,flattened_line_2964_64641,StringTie,transcript,1,64624,1000,.,.,"gene_id ""STRG.682""; transcript_id ""STRG.682.1""..."
1377,flattened_line_2964_64641,StringTie,exon,1,64624,1000,.,.,"gene_id ""STRG.682""; transcript_id ""STRG.682.1""..."
1378,flattened_line_3055_63100,StringTie,transcript,1,63100,1000,.,.,"gene_id ""STRG.683""; transcript_id ""STRG.683.1""..."


In [5]:
# Keep only the rows whose 'feature' is 'transcript'; copy so later edits do not touch the loaded table.
Res_df = Res_df[Res_df['feature'].eq('transcript')].copy()

In [6]:
# Preview the resistant-cultivar table after restricting it to 'transcript' rows.
Res_df

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,1,10201,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_10970_16115,StringTie,transcript,1,16115,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_2206_75763,StringTie,transcript,1,75763,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_8_385887,StringTie,transcript,1,385782,1000,.,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
8,flattened_line_12321_12674,StringTie,transcript,15,12674,1000,.,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
1370,flattened_line_2722_67824,StringTie,transcript,1,67633,1000,.,.,"gene_id ""STRG.679""; transcript_id ""STRG.679.1""..."
1372,flattened_line_1090_103447,StringTie,transcript,60,103447,1000,.,.,"gene_id ""STRG.680""; transcript_id ""STRG.680.1""..."
1374,flattened_line_2840_66107,StringTie,transcript,6,66107,1000,.,.,"gene_id ""STRG.681""; transcript_id ""STRG.681.1""..."
1376,flattened_line_2964_64641,StringTie,transcript,1,64624,1000,.,.,"gene_id ""STRG.682""; transcript_id ""STRG.682.1""..."


In [7]:
# Apply the same 'transcript'-only filter to each susceptible sample's table.
for sample in samples:
    per_sample = Sus_dfs[sample]
    Sus_dfs[sample] = per_sample[per_sample['feature'].eq('transcript')].copy()

In [8]:
from IPython.display import display

# Render one table per susceptible sample, in the order given by `samples`.
display(*(Sus_dfs[sample_name] for sample_name in samples))

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,21,2973,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_13513_10201,StringTie,transcript,9697,10201,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_13513_10201,StringTie,transcript,4526,5990,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_2206_75763,StringTie,transcript,1344,6509,1000,.,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
8,flattened_line_2206_75763,StringTie,transcript,6750,11494,1000,.,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
5946,flattened_line_2840_66107,StringTie,transcript,50412,64384,1000,.,.,"gene_id ""STRG.2578""; transcript_id ""STRG.2578...."
5948,flattened_line_2964_64641,StringTie,transcript,254,64031,1000,.,.,"gene_id ""STRG.2579""; transcript_id ""STRG.2579...."
5950,flattened_line_3055_63100,StringTie,transcript,193,32140,1000,.,.,"gene_id ""STRG.2580""; transcript_id ""STRG.2580...."
5952,flattened_line_3055_63100,StringTie,transcript,32451,37810,1000,-,.,"gene_id ""STRG.2581""; transcript_id ""STRG.2581...."


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,1,2045,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_13513_10201,StringTie,transcript,9374,10172,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_13513_10201,StringTie,transcript,4443,5991,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_2206_75763,StringTie,transcript,1374,11500,1000,.,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
8,flattened_line_2206_75763,StringTie,transcript,11789,13098,1000,-,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
3700,flattened_line_2840_66107,StringTie,transcript,65037,66081,1000,.,.,"gene_id ""STRG.1626""; transcript_id ""STRG.1626...."
3702,flattened_line_2840_66107,StringTie,transcript,386,50109,1000,.,.,"gene_id ""STRG.1627""; transcript_id ""STRG.1627...."
3704,flattened_line_2840_66107,StringTie,transcript,50412,64448,1000,.,.,"gene_id ""STRG.1628""; transcript_id ""STRG.1628...."
3706,flattened_line_2964_64641,StringTie,transcript,62,64613,1000,.,.,"gene_id ""STRG.1629""; transcript_id ""STRG.1629...."


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,9,10201,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_2206_75763,StringTie,transcript,1344,11500,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_2206_75763,StringTie,transcript,11789,13558,1000,-,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
7,flattened_line_2206_75763,StringTie,transcript,15339,20508,1000,.,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
9,flattened_line_2206_75763,StringTie,transcript,20710,24682,1000,.,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
3451,flattened_line_2840_66107,StringTie,transcript,50412,64471,1000,.,.,"gene_id ""STRG.1537""; transcript_id ""STRG.1537...."
3453,flattened_line_2964_64641,StringTie,transcript,229,64456,1000,.,.,"gene_id ""STRG.1538""; transcript_id ""STRG.1538...."
3455,flattened_line_3055_63100,StringTie,transcript,157,32142,1000,.,.,"gene_id ""STRG.1539""; transcript_id ""STRG.1539...."
3457,flattened_line_3055_63100,StringTie,transcript,32451,36696,1000,-,.,"gene_id ""STRG.1540""; transcript_id ""STRG.1540...."


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,1,10201,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_10970_16115,StringTie,transcript,1,16115,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_2206_75763,StringTie,transcript,9,75763,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_8_385887,StringTie,transcript,7877,8135,1000,+,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
9,flattened_line_8_385887,StringTie,transcript,8384,17261,1000,-,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
3943,flattened_line_3055_63100,StringTie,transcript,138,21020,1000,.,.,"gene_id ""STRG.1751""; transcript_id ""STRG.1751...."
3945,flattened_line_3055_63100,StringTie,transcript,21259,32140,1000,.,.,"gene_id ""STRG.1752""; transcript_id ""STRG.1752...."
3947,flattened_line_3055_63100,StringTie,transcript,32451,37816,1000,-,.,"gene_id ""STRG.1753""; transcript_id ""STRG.1753...."
3950,flattened_line_3055_63100,StringTie,transcript,43137,53011,1000,+,.,"gene_id ""STRG.1754""; transcript_id ""STRG.1754...."


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,1,2068,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_13513_10201,StringTie,transcript,4443,6027,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_13513_10201,StringTie,transcript,9165,10190,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_2206_75763,StringTie,transcript,1354,6481,1000,.,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
8,flattened_line_10970_16115,StringTie,transcript,1,16112,1000,.,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
3846,flattened_line_2722_67824,StringTie,transcript,59520,67629,1000,+,.,"gene_id ""STRG.1716""; transcript_id ""STRG.1716...."
3849,flattened_line_1090_103447,StringTie,transcript,1296,46656,1000,-,.,"gene_id ""STRG.1717""; transcript_id ""STRG.1717...."
3852,flattened_line_2840_66107,StringTie,transcript,9,66107,1000,.,.,"gene_id ""STRG.1718""; transcript_id ""STRG.1718...."
3854,flattened_line_2964_64641,StringTie,transcript,229,64031,1000,.,.,"gene_id ""STRG.1719""; transcript_id ""STRG.1719...."


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,9,10201,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_2206_75763,StringTie,transcript,1344,6482,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_2206_75763,StringTie,transcript,6742,11499,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_2206_75763,StringTie,transcript,11789,13041,1000,-,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
9,flattened_line_2206_75763,StringTie,transcript,14736,24679,1000,.,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
4131,flattened_line_2840_66107,StringTie,transcript,50412,64362,1000,.,.,"gene_id ""STRG.1837""; transcript_id ""STRG.1837...."
4133,flattened_line_2964_64641,StringTie,transcript,196,63794,1000,.,.,"gene_id ""STRG.1838""; transcript_id ""STRG.1838...."
4135,flattened_line_3055_63100,StringTie,transcript,3129,32140,1000,.,.,"gene_id ""STRG.1839""; transcript_id ""STRG.1839...."
4137,flattened_line_3055_63100,StringTie,transcript,32451,37829,1000,-,.,"gene_id ""STRG.1840""; transcript_id ""STRG.1840...."


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_13513_10201,StringTie,transcript,1,1585,1000,.,.,"gene_id ""STRG.1""; transcript_id ""STRG.1.1""; co..."
2,flattened_line_13513_10201,StringTie,transcript,4452,6023,1000,.,.,"gene_id ""STRG.2""; transcript_id ""STRG.2.1""; co..."
4,flattened_line_2206_75763,StringTie,transcript,1382,11500,1000,.,.,"gene_id ""STRG.3""; transcript_id ""STRG.3.1""; co..."
6,flattened_line_2206_75763,StringTie,transcript,11789,13547,1000,-,.,"gene_id ""STRG.4""; transcript_id ""STRG.4.1""; co..."
9,flattened_line_2206_75763,StringTie,transcript,15018,24655,1000,.,.,"gene_id ""STRG.5""; transcript_id ""STRG.5.1""; co..."
...,...,...,...,...,...,...,...,...,...
4127,flattened_line_2964_64641,StringTie,transcript,205,64641,1000,.,.,"gene_id ""STRG.1840""; transcript_id ""STRG.1840...."
4129,flattened_line_3055_63100,StringTie,transcript,134,32140,1000,.,.,"gene_id ""STRG.1841""; transcript_id ""STRG.1841...."
4131,flattened_line_3055_63100,StringTie,transcript,32451,36696,1000,-,.,"gene_id ""STRG.1842""; transcript_id ""STRG.1842...."
4134,flattened_line_3055_63100,StringTie,transcript,62371,63061,1000,.,.,"gene_id ""STRG.1843""; transcript_id ""STRG.1843...."


In [9]:
# Stack every susceptible sample's transcripts into one table with a single concat call.
# Concatenating inside a loop re-copies the accumulated rows on every pass, and seeding
# the result with Sus_dfs[samples[0]] would alias that sample's frame if there were only
# one sample. A single pd.concat always yields a new frame, with the rows in `samples` order.
Sus_df = pd.concat([Sus_dfs[sample_name] for sample_name in samples])

In [10]:
# Replace the per-sample row labels carried over by the concat with a fresh 0..n-1 index.
Sus_df = Sus_df.reset_index(drop=True)

In [11]:
# Order rows by contig name, then by start coordinate.
Sus_df = Sus_df.sort_values(by=['seqname', 'start'], kind='mergesort')

In [12]:
# Renumber rows 0..n-1 so that row labels follow the sorted order.
Sus_df = Sus_df.reset_index(drop=True)

In [13]:
# Collapse the transcripts into the largest ones, removing any transcript inside of another transcript.
def collapse(df, margin=1):
    """Merge overlapping / nearby intervals per ``seqname`` and return a keep-mask.

    Rows are walked in their existing order. A row joins the current merged
    interval when it has the same ``seqname`` and lies within ``margin`` of it
    (``start <= merged_end + margin`` and ``end >= merged_start - margin``).
    With the default ``margin=1`` and inclusive coordinates this merges
    overlapping and directly adjacent intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``seqname``, ``start`` and ``end`` columns. Rows are
        expected to be sorted by ``seqname`` and then ``start``; only
        consecutive rows are compared, so unsorted input is not fully merged.
        **Modified in place**: the first row of each merged group receives the
        group's overall ``start`` and ``end``.
    margin : int, default 1
        Maximum distance at which two intervals are still merged.

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` marks the first row of each
        merged group (the row that now carries the merged coordinates);
        ``False`` marks rows absorbed into an earlier row. An empty frame
        yields an empty list.
    """
    n_rows = df.shape[0]
    mask = [False] * n_rows
    if n_rows == 0:
        return mask

    # Work on plain Python lists and address rows by position, so the result
    # does not depend on the frame's index labels.
    seqnames = df['seqname'].tolist()
    starts = df['start'].tolist()
    ends = df['end'].tolist()
    start_col = df.columns.get_loc('start')
    end_col = df.columns.get_loc('end')

    def _close_group(pos, merged_start, merged_end):
        # Keep the group's first row and write the merged bounds back to df.
        mask[pos] = True
        if merged_start != starts[pos]:
            df.iat[pos, start_col] = merged_start
        if merged_end != ends[pos]:
            df.iat[pos, end_col] = merged_end

    # The running bounds live in local variables so that each comparison sees
    # the *extended* interval, not the stale coordinates of the group's first row.
    group_pos, group_start, group_end = 0, starts[0], ends[0]
    for pos in range(1, n_rows):
        same_seq = seqnames[pos] == seqnames[group_pos]
        within_margin = (starts[pos] <= group_end + margin
                         and ends[pos] >= group_start - margin)
        if same_seq and within_margin:
            # Contained rows change nothing; overhanging rows only ever widen the group.
            group_start = min(group_start, starts[pos])
            group_end = max(group_end, ends[pos])
        else:
            _close_group(group_pos, group_start, group_end)
            group_pos, group_start, group_end = pos, starts[pos], ends[pos]
    _close_group(group_pos, group_start, group_end)
    return mask
        

In [14]:
# Collapse the pooled susceptible transcripts. collapse() returns a per-row keep-mask and
# also rewrites 'start'/'end' of kept rows in Sus_df in place; it expects the rows to be
# ordered by seqname and start, as prepared in the cells above.
my_mask = collapse(Sus_df)

In [15]:
# Keep only the rows flagged by the collapse mask, as an independent copy.
Sus_collapsed = Sus_df.loc[my_mask].copy()

In [16]:
# Preview the collapsed susceptible transcripts (row labels are the positions kept from Sus_df).
Sus_collapsed

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_0_459513,StringTie,transcript,12,459513,1000,.,.,"gene_id ""STRG.801""; transcript_id ""STRG.801.1""..."
101,flattened_line_10039_18685,StringTie,transcript,3,18685,1000,.,.,"gene_id ""STRG.1518""; transcript_id ""STRG.1518...."
108,flattened_line_10075_18573,StringTie,transcript,1,18573,1000,.,.,"gene_id ""STRG.1513""; transcript_id ""STRG.1513...."
115,flattened_line_10079_18556,StringTie,transcript,1,18556,1000,.,.,"gene_id ""STRG.790""; transcript_id ""STRG.790.1""..."
122,flattened_line_1008_106699,StringTie,transcript,1,106697,1000,.,.,"gene_id ""STRG.2136""; transcript_id ""STRG.2136...."
...,...,...,...,...,...,...,...,...,...
12982,flattened_line_9885_19285,StringTie,transcript,7960,19100,1000,.,.,"gene_id ""STRG.986""; transcript_id ""STRG.986.1""..."
12986,flattened_line_992_107284,StringTie,transcript,3,26104,1000,.,.,"gene_id ""STRG.1472""; transcript_id ""STRG.1472...."
12993,flattened_line_992_107284,StringTie,transcript,26328,102666,1000,.,.,"gene_id ""STRG.1781""; transcript_id ""STRG.1781...."
12999,flattened_line_992_107284,StringTie,transcript,103025,106640,1000,.,.,"gene_id ""STRG.1561""; transcript_id ""STRG.1561...."


In [17]:
# Order the resistant transcripts by contig name, then by start coordinate.
Res_df = Res_df.sort_values(by=['seqname', 'start'], kind='mergesort')

In [18]:
# Renumber rows 0..n-1 so that row labels follow the sorted order.
Res_df = Res_df.reset_index(drop=True)

In [19]:
# Collapse the resistant transcripts the same way. Note that this reuses (overwrites) the
# `my_mask` name from the susceptible collapse, and that collapse() edits Res_df in place.
my_mask = collapse(Res_df)

In [20]:
# Keep only the rows flagged by the collapse mask, as an independent copy.
Res_collapsed = Res_df.loc[my_mask].copy()

In [21]:
# Preview the collapsed resistant transcripts.
Res_collapsed

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,flattened_line_0_459513,StringTie,transcript,6,459513,1000,.,.,"gene_id ""STRG.374""; transcript_id ""STRG.374.1""..."
1,flattened_line_10039_18685,StringTie,transcript,38,18685,1000,.,.,"gene_id ""STRG.560""; transcript_id ""STRG.560.1""..."
2,flattened_line_10075_18573,StringTie,transcript,1,18568,1000,.,.,"gene_id ""STRG.570""; transcript_id ""STRG.570.1""..."
3,flattened_line_10079_18556,StringTie,transcript,13,18535,1000,.,.,"gene_id ""STRG.293""; transcript_id ""STRG.293.1""..."
4,flattened_line_1008_106699,StringTie,transcript,1,106699,1000,.,.,"gene_id ""STRG.573""; transcript_id ""STRG.573.1""..."
...,...,...,...,...,...,...,...,...,...
679,flattened_line_976_107875,StringTie,transcript,1,107875,1000,.,.,"gene_id ""STRG.656""; transcript_id ""STRG.656.1""..."
680,flattened_line_9855_19308,StringTie,transcript,1,19308,1000,.,.,"gene_id ""STRG.20""; transcript_id ""STRG.20.1""; ..."
681,flattened_line_9885_19285,StringTie,transcript,6,19175,1000,.,.,"gene_id ""STRG.409""; transcript_id ""STRG.409.1""..."
682,flattened_line_992_107284,StringTie,transcript,17,107284,1000,.,.,"gene_id ""STRG.659""; transcript_id ""STRG.659.1""..."


In [22]:
# Resistant transcripts whose seqname does not occur among the collapsed susceptible transcripts.
# NOTE(review): the comparison uses 'seqname' only; start/end coordinates are not compared
# here. Confirm that contig-level presence/absence is the intended criterion.
Res_collapsed.loc[~Res_collapsed['seqname'].isin(Sus_collapsed['seqname'].unique())]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
656,flattened_line_9263_21178,StringTie,transcript,1,21178,1000,.,.,"gene_id ""STRG.396""; transcript_id ""STRG.396.1""..."
