In [1]:
import pandas as pd
import pyranges as pr
import cerberus

In [88]:
gtf_files = ['../../../data/iq/raw/GM22300_1.gtf',
             '../../../data/iq/raw/GM18906_1.gtf']


gb_cols = ['Chromosome', 'Strand', 'ic']

ic_df = pd.DataFrame()
source_ic_df = pd.DataFrame()
for f in gtf_files:

    # get info about which each ic was detected in
    analysis = f.split('/data/')[1].split('/')[0]

    tech_rep = f.rsplit('/', maxsplit=1)[1].split('.')[0]
    source = f'{analysis}_{tech_rep}'

    gtf_df = pr.read_gtf(f, duplicate_attr=True)
    gtf_df = gtf_df.df
    gtf_df = pr.PyRanges(gtf_df)
    df = cerberus.get_ic(gtf_df)
    df['source'] = source

    # remove monoxonic
    df = df.loc[df.ic!='-']

    # agg. sources; groupby and add commas
    source_ic_df = pd.concat([source_ic_df, df[gb_cols+['source']]],
                             axis=0)
    source_ic_df = source_ic_df.groupby(gb_cols).agg({'source': ','.join}).reset_index()


    # merge to get starts for each sample-level thing
    tss_df = gtf_df.features.tss().df
    tes_df = gtf_df.features.tes().df

    tss_df = tss_df[['transcript_id', 'Start']].rename({'Start':'tss'}, axis=1)
    tes_df = tes_df[['transcript_id', 'Start']].rename({'Start':'tes'}, axis=1)

    df = df[gb_cols+['transcript_id']]
    df = df.merge(tss_df, how='left', on='transcript_id')
    df = df.merge(tes_df, how='left', on='transcript_id')
    df = df.drop('transcript_id', axis=1)

    # concat w/ original ic df
    ic_df = pd.concat([df, ic_df], axis=0)
    ic_df.drop_duplicates(inplace=True)

    # keep the longest for each
    fwd, rev = cerberus.get_stranded_gtf_dfs(ic_df)

    fwd = fwd.groupby(gb_cols, observed=True).agg(tss=("tss", "min"),
                                                          tes=("tes", "max")).reset_index()

    rev = rev.groupby(gb_cols, observed=True).agg(tss=("tss", "max"),
                                                          tes=("tes", "min")).reset_index()
    ic_df = pd.concat([fwd, rev], axis=0)

# merge in sources
ic_df = ic_df.merge(source_ic_df, on=gb_cols, how='left')
ic_df.to_csv(ofile, sep='\t', index=False)


In [89]:
ic_df.head()

Unnamed: 0,Chromosome,Strand,ic,tss,tes,source
0,GL000195.1,+,138140-140386-140448-140547,137939,141362,"iq_GM22300_1,iq_GM18906_1"
1,GL000195.1,+,138140-140386-140516-140705,137925,141788,"iq_GM22300_1,iq_GM18906_1"
2,GL000195.1,+,138140-140386-140516-142142-142240-149088,137940,149493,iq_GM18906_1
3,GL000195.1,+,138140-140390-140516-142142-142240-149088,138006,149493,iq_GM18906_1
4,GL000195.1,+,138140-140413,137913,141788,"iq_GM22300_1,iq_GM18906_1"


In [85]:
source_ic_df.head()
source_ic_df.loc[source_ic_df.source.str.contains(',')].head()

Unnamed: 0,ic,source
4,100038316-100049908-100050004-100058665-100058...,"iq_GM22300_1,iq_GM18906_1"
8,100049841-100056910-100057405-100057697-100057...,"iq_GM22300_1,iq_GM18906_1"
11,100065511-100084686-100084855-100097480-100097...,"iq_GM22300_1,iq_GM18906_1"
12,100065511-100084686-100084855-100097480-100097...,"iq_GM22300_1,iq_GM18906_1"
14,100081634-100077433-100077302-100075630-100075...,"iq_GM22300_1,iq_GM22300_1"


In [63]:
ic_df.loc[ic_df.ic.duplicated(keep=False)].sort_values(by='ic')

Unnamed: 0,Chromosome,Strand,transcript_id,ic,tss_start,tes_start
9010,chr14,+,transcript3712.chr14.nic,100065511-100084686-100084855-100097480-100097...,100065358,100108105
9184,chr14,+,transcript3397.chr14.nic,100065511-100084686-100084855-100097480-100097...,100065418,100108099
25212,chr7,-,ENST00000424697.5,100081634-100077433-100077302-100075630-100075...,100081763,100069971
25254,chr7,-,ENST00000449785.5,100081634-100077433-100077302-100075630-100075...,100081738,100071929
29432,chr9,+,transcript2092.chr9.nic,100099416-100104497-100104627-100126382,100099198,100126868
...,...,...,...,...,...,...
28521,chr9,+,transcript2094.chr9.nic,99906706-99915177-99915362-99928777-99928843-9...,99906653,99951481
21496,chr4,-,transcript2038.chr4.nic,99946438-99930621-99930449-99924815-99924759-9...,99946652,99923016
22135,chr4,-,transcript1910.chr4.nic,99946438-99930621-99930449-99924815-99924759-9...,99946586,99923016
22134,chr4,-,transcript1908.chr4.nic,99946438-99930621-99930449-99928617-99928541-9...,99946616,99927344
