In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
import sklearn
from snakemake.io import expand
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *
from proc_revisions.ss_sj_utils import *

In [2]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [62]:
talon_gtf = od+expand(config['lr']['talon']['gtf'], species='human')[0]
talon_ab = od+expand(config['lr']['talon']['ab'], species='human')[0]
talon_filt_ab = od+expand(config['lr']['talon']['filt_ab'], species='human')[0]
ref_gtf = od+expand(config['ref']['gtf'], species='human')[0]
ref_ics = od+expand(config['ref']['cerberus']['ics'], species='human')[0]

In [103]:
def fix_talon_known_fusion_transcripts(talon_ab,
                                       talon_filt_ab,
                                       talon_gtf,
                                       ref_ics,
                                       ref_gtf,
                                       wc,
                                       ofile_gtf,
                                       ofile_ab): 
    """
    Fix gene assignments to novel transcripts that have splice sites
    that intersect with known readthrough transcripts (v29 or vM21). 
    Update gene counts accordingly.
    * Limit to novel transcripts from known genes
    * Limit to novel transcripts that share splice sites with only 2 genes
        (one readthrough, one non readthrough)
    * Compute what % of splice sites are supported in the novel transcript
        by the best match transcript from each of the two annotated genes
    * Compute what % of splice sites in each of the two annotated genes
        are supported by the novel transcript
    * Compute how far away the nearest TSS / TES is for each of the 
        two annotated genes
    * Used a decision tree classifier + manual labels I gave from inspecting 
        loci to determine whether the gene assignment should be made 
        to the readthrough gene or the non-readthrough gene
    """
    if wc['species'] == 'human':
        ver = 'v40_cerberus'
    elif wc['species'] == 'mouse':
        ver = 'vM25_cerberus'

    gtf_df, _, _ = get_gtf_info(ver=ver, how='gene', add_stable_gid=True)
    gtf_df = gtf_df[['gid_stable', 'gname']]
    gtf_df.head()

    talon_df = pd.read_csv(talon_filt_ab, sep='\t')
    talon_df['gid'] = cerberus.get_stable_gid(talon_df, 'annot_gene_id')

    tids = talon_df.loc[(talon_df.transcript_novelty!='Known')&(talon_df.gene_novelty=='Known'), 'annot_transcript_id'].tolist()

    df = pr.read_gtf(talon_gtf, rename_attr=True, duplicate_attr=True)
    df = cerberus.get_ic(df)
    df.rename({'transcript_id':'Name', 'ic': 'Coordinates'}, axis=1, inplace=True)
    df['source'] = 'lapa'
    
    # get gids from annotated fusion genes
    rt_df, _, _ = get_gtf_info(ver=ver, how='iso')
    rt_df = rt_df.loc[rt_df.readthrough_transcript==True]
    rt_df['gid_stable'] = cerberus.get_stable_gid(rt_df, 'gid')
    fusion_gids = rt_df.gid_stable.tolist()
    
    # get reference transcript id to ic name map :'(
    ref_df = pr.read_gtf(talon_gtf, rename_attr=True, duplicate_attr=True)
    ref_df = cerberus.get_ic(ref_df)
    ref_df.head()
    
    known_df = pd.read_csv(ref_ics, sep='\t')
    known_df['gene_id'] = known_df.Name.str.split('_', expand=True)[0]
    known_df['source'] = 'lapa'
    
    ref_df.rename({'ic':'Coordinates'}, axis=1, inplace=True)
    print(len(ref_df.index))
    ref_df= ref_df.merge(known_df[['Coordinates', 'gene_id', 'Chromosome', 'Name']], 
                 how='left', 
                 on=['Coordinates', 'gene_id', 'Chromosome'])
    print(len(ref_df.index))
    
    known_df = pd.read_csv(ref_ics, sep='\t')
    known_df['gene_id'] = known_df.Name.str.split('_', expand=True)[0]
    known_df['source'] = 'lapa'

    _, f_ss_ic_df = get_ss_from_ic(df, ['lapa'])
    f_ss_ic_df.drop(['source', 'novelty'], axis=1, inplace=True)

    _, k_ss_ic_df = get_ss_from_ic(known_df, ['lapa'])
    k_ss_ic_df.drop(['source', 'novelty'], axis=1, inplace=True)
    
    f_ss_ic_df['ss_id'] = f_ss_ic_df.Name+f_ss_ic_df.Chromosome+f_ss_ic_df.Strand+f_ss_ic_df.Start.astype(str)
    k_ss_ic_df['ss_id'] = k_ss_ic_df.Name+k_ss_ic_df.Chromosome+k_ss_ic_df.Strand+k_ss_ic_df.Start.astype(str)
    
    f_ss_ic_df = f_ss_ic_df.merge(k_ss_ic_df, how='left', on=['Chromosome', 'Strand', 'Start', 'ss_type'],
                          suffixes=('', '_known'))
    f_ss_ic_df['known'] = False
    f_ss_ic_df.loc[f_ss_ic_df.Name_known.notnull(), 'known'] = True
    
    # 1. get all transcript + gene combinations
    df = f_ss_ic_df[['gene_id', 'Name', 'gene_id_known', 'known']].drop_duplicates().copy(deep=True)
    
    # 2. compute number of splice sites / transcript
    temp = f_ss_ic_df[['Name', 'ss_id']].groupby('Name').nunique()
    temp = temp.reset_index().rename({'ss_id': 'n_ss'}, axis=1)
    df = df.merge(temp, how='left', on='Name')
    
    # 3. add in the novelty of the talon gene and transcript
    df = df.merge(talon_df[['transcript_novelty', 'gene_novelty', 'annot_transcript_id']], 
                  how='left', 
                  left_on='Name',
                  right_on='annot_transcript_id')
    df.drop('annot_transcript_id', axis=1, inplace=True)
    
    # 4. subset for novel transcripts that belong to annotated genes
    df = df.loc[(df.transcript_novelty!='Known')&\
                (df.gene_novelty=='Known')]
    
    # 5. get all transcripts that have >= 1 splice site shared w/ a readthrough gene
    tids = df.loc[df.gene_id_known.isin(fusion_gids), 'Name'].unique().tolist()
    df = df.loc[df.Name.isin(tids)]
    
    # 6. get how many unique genes share splice sites with this transcript
    temp = df.loc[df.known==True].copy(deep=True)
    temp = temp[['Name', 'gene_id_known']].groupby('Name').nunique().reset_index()
    temp.rename({'gene_id_known': 'n_genes'}, axis=1, inplace=True)
    df = df.merge(temp, how='left', on='Name')
    
    # 7. get how many unique genes share splice sites with this transcript
    temp = df.loc[df.known==True].copy(deep=True)
    temp = temp.loc[~temp.gene_id_known.isin(fusion_gids)]
    temp = temp[['Name', 'gene_id_known']].groupby('Name').nunique().reset_index()
    temp.rename({'gene_id_known': 'n_genes_no_rt'}, axis=1, inplace=True)
    df = df.merge(temp, how='left', on='Name')
    
    # 8. limit to transcripts w/ n_genes_no_rt < n_genes
    # and n_genes_no_rt == 1 (only one choice)
    # where the gene that was merged with is the non-rt one
    df = df.loc[(df.n_genes_no_rt==1)&(df.n_genes>df.n_genes_no_rt)]
    df = df.loc[~df.gene_id_known.isin(fusion_gids)]
    df.head()
    
    # 9. get the number of splice sites supported by the transcript w/ the most
    # shared splice sites for each transcript + annotated gene combination, 
    # then merge in based on transcript from obs. transcripts
    temp = f_ss_ic_df.copy(deep=True)
    temp = temp[['Name', 'ss_id', 'gene_id_known', 'Name_known']].drop_duplicates()
    temp = temp.groupby(['Name', 'gene_id_known', 'Name_known']).nunique().reset_index()
    temp.rename({'ss_id': 'n_supp_ss',
                 'Name_known': 'Name_supp'}, axis=1, inplace=True)
    temp = temp.sort_values(by='n_supp_ss', ascending=False)
    temp = temp.drop_duplicates(subset=['Name', 'gene_id_known'], keep='first')
    df = df.merge(temp, how='left', on=['Name', 'gene_id_known'])
    
    # 9.5. get the number of splice sites supported by the transcript w/ the most
    # shared splice sites for each transcript + annotated gene combination, 
    # then merge in based on transcript from obs. transcripts
    temp = f_ss_ic_df.copy(deep=True)
    temp = temp[['Name', 'ss_id', 'gene_id_known', 'Name_known']].drop_duplicates()
    temp = temp.loc[temp.gene_id_known.isin(fusion_gids)]
    temp = temp.groupby(['Name', 'gene_id_known', 'Name_known']).nunique().reset_index()
    temp.rename({'ss_id': 'n_supp_ss_rt',
                 'Name_known': 'Name_supp_rt',
                 'gene_id_known': 'gene_id_known_rt'}, axis=1, inplace=True)
    temp = temp.sort_values(by='n_supp_ss_rt', ascending=False)
    temp = temp.drop_duplicates(subset=['Name'], keep='first')
    df = df.merge(temp, how='left', on=['Name'])
    
    # 10. remove entries where talon gene id is already
    # the other choice
    df['gid'] = cerberus.get_stable_gid(df, 'gene_id')
    df = df.loc[df.gid!='gene_id_known']
    
    # 11. get the total # of splice sites per annotated transcript
    temp = k_ss_ic_df[['Name', 'ss_id']].groupby('Name').nunique().reset_index()
    temp = temp.rename({'ss_id': 'n_supp_total_ss',
                                      'Name': 'Name_supp'}, axis=1)
    df = df.merge(temp, how='left', on='Name_supp')
    temp = temp.rename({'n_supp_total_ss': 'n_supp_total_ss_rt',
                        'Name_supp': 'Name_supp_rt'}, axis=1)
    df = df.merge(temp, how='left', on='Name_supp_rt')
    
    # 12. limit to obs. transcripts that were annotated to readthrough genes
    df = df.loc[df.gid.isin(fusion_gids)]
    print(len(df.index))
    
    df['perc_supp_ss'] = (df.n_supp_ss/df.n_ss)*100
    df['perc_supp_ss_rt'] = (df.n_supp_ss_rt/df.n_ss)*100
    df['perc_supp_annot_ss'] = (df.n_supp_ss/df.n_supp_total_ss)*100
    df['perc_supp_annot_ss_rt'] = (df.n_supp_ss_rt/df.n_supp_total_ss_rt)*100
    
    temp = df.loc[df.gid.isin(fusion_gids)]
    print(len(temp.Name.unique()))
    
    df['perc_supp_diff'] = df['perc_supp_ss']-df['perc_supp_ss_rt']
    df['perc_supp_annot_diff'] = df['perc_supp_annot_ss']-df['perc_supp_annot_ss_rt']
    df = df.loc[df.perc_supp_annot_ss.notnull()]
    
    # 14. compute distances from transcript start and end to closest start and end from any transcript in
    talon_df = pr.read_gtf(talon_gtf, rename_attr=True, duplicate_attr=True)
    talon_df = talon_df.df
    talon_df = talon_df.loc[talon_df.Feature=='transcript']
    
    known_df = pr.read_gtf(ref_gtf, rename_attr=True, duplicate_attr=True)
    known_df = known_df.df
    known_df = known_df.loc[known_df.Feature=='transcript']
    
    known_df['gene_id'] = cerberus.get_stable_gid(known_df, 'gene_id')
    # add coords to novel thing (df)
    temp = talon_df[['transcript_id', 'Start', 'End', 'Chromosome', 'Strand']]
    temp.rename({'transcript_id':'Name'}, axis=1, inplace=True)
    df = df.merge(temp, how='left', on='Name')
    
    df = df.drop_duplicates()
    temp3 = pd.DataFrame()
    temp3['Name'] = df.Name.tolist()
    for c in ['gene_id_known', 'gene_id_known_rt']:
        for feat in ['End', 'Start']:
            temp2 = pd.DataFrame() 
            for g in df[c].unique().tolist():
                if feat == 'Start':
                    other_feat = 'End'
                elif feat == 'End':
                    other_feat = 'Start'
                temp_nov = df.loc[df[c] == g].copy(deep=True)
                temp_nov = temp_nov[['Name', 'Chromosome', feat, 'Strand']]
                temp_nov[other_feat] = temp_nov[feat]+1 

                temp_known = known_df.loc[known_df.gene_id == g].copy(deep=True)
                temp_known = temp_known[['Chromosome', feat, 'Strand']]
                temp_known[other_feat] = temp_known[feat]+1 


                temp_nov = pr.PyRanges(temp_nov)
                temp_known = pr.PyRanges(temp_known)

                # pyranges join and get closest; concat in w/ temp2s
                temp_nov = temp_nov.k_nearest(temp_known, k=1, 
                                   overlap=True,
                                   how=None, suffix='_known').df
                try:
                    temp_nov = temp_nov[['Name', 'Distance']].drop_duplicates()
                except:
                    import pdb; pdb.set_trace()
                d_col = f'{c}_{feat}_dist'
                temp_nov.rename({'Distance': d_col}, axis=1, inplace=True)        
                # if len(temp_nov.loc[temp_nov[d_col].isnull()].index) > 0:
                #     import pdb; pdb.set_trace()
                temp2 = pd.concat([temp2, temp_nov], axis=0)
            temp3 = temp3.merge(temp2, on='Name', how='left') 
            
    df = df.merge(temp3, how='left', on='Name')  
    
    # add abs. values
    cols = ['gene_id_known_End_dist',
            'gene_id_known_Start_dist',
            'gene_id_known_rt_End_dist',
            'gene_id_known_rt_Start_dist']
    for c in cols:
        new_col = f'{c}_abs'
        df[new_col] = df[c].abs()
    
    # implement DecisionTree classifier in pandas format w/ thresholds
    df['fix'] = False

    # start of closest rt transcript must be > 510 bp away
    inds = df.loc[df.gene_id_known_rt_Start_dist_abs > 510].index
    df.loc[inds, 'fix'] = True

    # end of the closest rt transcript must be > 45 kbp away
    inds = df.loc[df.gene_id_known_rt_End_dist_abs > 45000].index
    df.loc[inds, 'fix'] = True

    # % of sss annotated in best matching non-rt transcripts
    # must be >94
    # % of sss annotated in actual transcript mush be > 84%
    inds = df.loc[(df.perc_supp_annot_ss > 94)&(df.perc_supp_ss > 84)].index
    df.loc[inds, 'fix'] = True

    # df[['fix_needed', 'fix', 'Name']].groupby(['fix_needed', 'fix']).count().reset_index()
    
    gtf_df = pr.read_gtf(talon_gtf, duplicate_attr=True, rename_attr=True)
    gtf_df = gtf_df.df
    gtf_df['gid_stable'] = cerberus.get_stable_gid(gtf_df, 'gene_id')

    ref_gtf_df = pr.read_gtf(ref_gtf, duplicate_attr=True, rename_attr=True)
    ref_gtf_df = ref_gtf_df.df
    ref_gtf_df['gid_stable'] = cerberus.get_stable_gid(ref_gtf_df, 'gene_id')
    
    # update the genes that need fixing to the genes that were non-rt that they 
    # intersected with
    gene_cols = ['Source', 'gene_id', 'gene_name', 'gene_status', 'gene_type', 'talon_gene', 'havana_gene', 'level',
                 'antisense_gene', 'gene_antisense_to_IDs', 'intergenic_novel', 'fusion_novel']
    talon_gene_cols = ['gene_status', 'talon_gene', 'antisense_gene', 'gene_antisense_to_IDs', 'intergenic_novel', 'fusion_novel']
    temp = df.loc[df.fix == True]
    temp3 = temp.copy(deep=True)
    for ind, entry in temp.iterrows():
        t = entry.Name
        inds = gtf_df.loc[gtf_df.transcript_id==t].index
        g = entry.gene_id_known
        dummy_gene_entry = gtf_df.loc[(gtf_df.gid_stable==g)&(gtf_df.Feature=='gene')]
        if len(dummy_gene_entry.index) == 0:
            # have to pull from reference gtf instead and add corresponding gene entry
            dummy_gene_entry = ref_gtf_df.loc[(ref_gtf_df.gid_stable==g)&(ref_gtf_df.Feature=='gene')]
            dummy_gene_entry[talon_gene_cols] = np.nan
            gtf_df = pd.concat([gtf_df, dummy_gene_entry], axis=0)
        try:
            assert len(dummy_gene_entry.index) == 1
        except:
            import pdb; pdb.set_trace()
        for c in gene_cols:
            gtf_df.loc[inds, c] = dummy_gene_entry[c].values[0]
        
    # drop stable gid, sort, update ends, and dump
    gtf_df.drop('gid_stable', axis=1, inplace=True)
    gtf_df = cerberus.sort_gtf(gtf_df)
    # mainly ripped out of cerberus code
    gtf_temp = gtf_df.copy(deep=True)
    for mode in ['tss', 'tes']:
        fwd, rev = cerberus.get_stranded_gtf_dfs(gtf_temp)
        df = pd.DataFrame()
        for strand, temp in zip(['+', '-'], [fwd, rev]):

            # fix gene boundaries
            temp = cerberus.update_gene_ends(temp, mode, strand)
            df = pd.concat([df, temp], ignore_index=True)

        gtf_temp = df.copy(deep=True)
    pr.PyRanges(gtf_temp).to_gtf(ofile_gtf)
    
    # also update the unfiltered abundance table
    import pdb; pdb.set_trace()
    temp = temp3[['Name', 'gene_id_known']]
    assert len(temp3.loc[(temp3.perc_supp_ss==100)&(temp3.perc_supp_annot_ss==100)].index) == 0
    temp.rename({'Name': 'annot_transcript_id',
                 'gene_id_known': 'gid_stable'},
                axis=1, inplace=True)
    
    df = pd.read_csv(talon_ab, sep='\t')
    print(len(df.index))
    df['gid_stable'] = cerberus.get_stable_gid(df, 'annot_gene_id')
    df[df.columns[:11]].head()
    gene_cols = ['gene_ID', 'annot_gene_id', 'annot_gene_name', 'gene_novelty', 'gid_stable']
    gene_info = df[gene_cols].drop_duplicates()
    gene_info.head()
    temp = temp.merge(gene_info, how='left', on='gid_stable')
    temp.drop('gid_stable', axis=1, inplace=True)    
    df = df.merge(temp, how='left', on='annot_transcript_id', suffixes=('', '_fix'))
    fix_inds = df.loc[df.gene_ID_fix.notnull()].index
    gene_cols = [g for g in gene_cols if g!='gid_stable']
    for g in gene_cols:
        fix_col = f'{g}_fix'
        df.loc[fix_inds, g] = df.loc[fix_inds, fix_col]
        df.drop(fix_col, axis=1, inplace=True)
    df.to_csv(ofile_ab, sep='\t', index=False)
    
    

## 231218 try with my function-ized version

In [104]:
ofile_gtf = 'test.gtf'
ofile_ab = 'test_unfiltered_abundance.tsv'
fix_talon_known_fusion_transcripts(talon_ab,
                                       talon_filt_ab,
                                       talon_gtf,
                                       ref_ics,
                                       ref_gtf,
                                       {'species': 'human'},
                                       ofile_gtf,
                                       ofile_ab)

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'
Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'
191401
191401
253
218
Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'
Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'
> [0;32m<ipython-input-103-23fa204640f2>[0m(301)[0;36mfix_talon_known_fusion_transcripts[0;34m()[0m
[0;32m    299 [0;31m    [0;31m# also update the unfiltered abundance table[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    300 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 301 [0;31m    [0mtemp[0m [0;34m=[0m [0mtemp3[0m[0;34m[[0m[0;34m[[0m[0;34m'Name'[0m[0;34m,[0m [0;34m'gene_id_known'[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    302 [0;31m    

ipdb>  n


> [0;32m<ipython-input-103-23fa204640f2>[0m(302)[0;36mfix_talon_known_fusion_transcripts[0;34m()[0m
[0;32m    300 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    301 [0;31m    [0mtemp[0m [0;34m=[0m [0mtemp3[0m[0;34m[[0m[0;34m[[0m[0;34m'Name'[0m[0;34m,[0m [0;34m'gene_id_known'[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 302 [0;31m    [0;32massert[0m [0mlen[0m[0;34m([0m[0mtemp[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0;34m([0m[0mtemp[0m[0;34m.[0m[0mperc_supp_ss[0m[0;34m==[0m[0;36m100[0m[0;34m)[0m[0;34m&[0m[0;34m([0m[0mtemp[0m[0;34m.[0m[0mperc_supp_annot_ss[0m[0;34m==[0m[0;36m100[0m[0;34m)[0m[0;34m][0m[0;34m.[0m[0mindex[0m[0;34m)[0m [0;34m==[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    303 [0;31m    temp.rename({'Name': 'annot_transcript_id',
[0m[0;32m    304 [0;31m       

ipdb>  temp.head()


               Name    gene_id_known
0   TALONT001122495  ENSG00000143543
11  TALONT000930061  ENSG00000116783
12  TALONT000930071  ENSG00000116783
13  TALONT000930083  ENSG00000116783
14  TALONT000983329  ENSG00000156875


ipdb>  temp3.loc[(temp3.perc_supp_ss==100)&(temp3.perc_supp_annot_ss==100)]


Empty DataFrame
Columns: [gene_id, Name, gene_id_known, known, n_ss, transcript_novelty, gene_novelty, n_genes, n_genes_no_rt, Name_supp, n_supp_ss, gene_id_known_rt, Name_supp_rt, n_supp_ss_rt, gid, n_supp_total_ss, n_supp_total_ss_rt, perc_supp_ss, perc_supp_ss_rt, perc_supp_annot_ss, perc_supp_annot_ss_rt, perc_supp_diff, perc_supp_annot_diff, Start, End, Chromosome, Strand, gene_id_known_End_dist, gene_id_known_Start_dist, gene_id_known_rt_End_dist, gene_id_known_rt_Start_dist, gene_id_known_End_dist_abs, gene_id_known_Start_dist_abs, gene_id_known_rt_End_dist_abs, gene_id_known_rt_Start_dist_abs, fix]
Index: []

[0 rows x 36 columns]


ipdb>  temp['perc_supp_ss'] = 0
ipdb>  temp['perc_supp_annot_ss'] = 0
ipdb>  c


5395598


In [4]:
gtf_df, _, _ = get_gtf_info(ver='v40_cerberus', how='gene', add_stable_gid=True)
gtf_df = gtf_df[['gid_stable', 'gname']]
gtf_df.head()

talon_df = pd.read_csv(talon_filt_ab, sep='\t')
talon_df['gid'] = cerberus.get_stable_gid(talon_df, 'annot_gene_id')

tids = talon_df.loc[(talon_df.transcript_novelty!='Known')&(talon_df.gene_novelty=='Known'), 'annot_transcript_id'].tolist()

df = pr.read_gtf(talon_gtf, rename_attr=True, duplicate_attr=True)
df = cerberus.get_ic(df)
df.rename({'transcript_id':'Name', 'ic': 'Coordinates'}, axis=1, inplace=True)
df['source'] = 'lapa'

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'


In [5]:
# get gids from annotated fusion genes
rt_df, _, _ = get_gtf_info(ver='v40_cerberus', how='iso')
rt_df = rt_df.loc[rt_df.readthrough_transcript==True]
rt_df['gid_stable'] = cerberus.get_stable_gid(rt_df, 'gid')
fusion_gids = rt_df.gid_stable.tolist()

In [6]:
# get reference transcript id to ic name map :'(
ref_df = pr.read_gtf(talon_gtf, rename_attr=True, duplicate_attr=True)
ref_df = cerberus.get_ic(ref_df)
ref_df.head()

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'


Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,ic
0,ERCC-00002,-,TALONT001166745,TALONG000193759,-
1,ERCC-00002,+,tSpikein_ERCC-00002,gSpikein_ERCC-00002,-
2,ERCC-00003,+,tSpikein_ERCC-00003,gSpikein_ERCC-00003,-
3,ERCC-00004,+,tSpikein_ERCC-00004,gSpikein_ERCC-00004,-
4,ERCC-00009,-,TALONT000206952,TALONG000058910,-


In [7]:
known_df = pd.read_csv(ref_ics, sep='\t')
known_df['gene_id'] = known_df.Name.str.split('_', expand=True)[0]
known_df['source'] = 'lapa'

In [8]:
ref_df.rename({'ic':'Coordinates'}, axis=1, inplace=True)
print(len(ref_df.index))
ref_df= ref_df.merge(known_df[['Coordinates', 'gene_id', 'Chromosome', 'Name']], 
             how='left', 
             on=['Coordinates', 'gene_id', 'Chromosome'])
print(len(ref_df.index))

191401
191401


In [9]:
known_df = pd.read_csv(ref_ics, sep='\t')
known_df['gene_id'] = known_df.Name.str.split('_', expand=True)[0]
known_df['source'] = 'lapa'

_, f_ss_ic_df = get_ss_from_ic(df, ['lapa'])
f_ss_ic_df.drop(['source', 'novelty'], axis=1, inplace=True)

_, k_ss_ic_df = get_ss_from_ic(known_df, ['lapa'])
k_ss_ic_df.drop(['source', 'novelty'], axis=1, inplace=True)

In [10]:
f_ss_ic_df['ss_id'] = f_ss_ic_df.Name+f_ss_ic_df.Chromosome+f_ss_ic_df.Strand+f_ss_ic_df.Start.astype(str)
k_ss_ic_df['ss_id'] = k_ss_ic_df.Name+k_ss_ic_df.Chromosome+k_ss_ic_df.Strand+k_ss_ic_df.Start.astype(str)

In [11]:
f_ss_ic_df = f_ss_ic_df.merge(k_ss_ic_df, how='left', on=['Chromosome', 'Strand', 'Start', 'ss_type'],
                          suffixes=('', '_known'))
f_ss_ic_df['known'] = False
f_ss_ic_df.loc[f_ss_ic_df.Name_known.notnull(), 'known'] = True

In [12]:
# 1. get all transcript + gene combinations
df = f_ss_ic_df[['gene_id', 'Name', 'gene_id_known', 'known']].drop_duplicates().copy(deep=True)

In [13]:
# 2. compute number of splice sites / transcript
temp = f_ss_ic_df[['Name', 'ss_id']].groupby('Name').nunique()
temp = temp.reset_index().rename({'ss_id': 'n_ss'}, axis=1)
df = df.merge(temp, how='left', on='Name')

In [14]:
# 3. add in the novelty of the talon gene and transcript
df = df.merge(talon_df[['transcript_novelty', 'gene_novelty', 'annot_transcript_id']], 
              how='left', 
              left_on='Name',
              right_on='annot_transcript_id')
df.drop('annot_transcript_id', axis=1, inplace=True)

In [15]:
# 4. subset for novel transcripts that belong to annotated genes
df = df.loc[(df.transcript_novelty!='Known')&\
            (df.gene_novelty=='Known')]

In [16]:
# 5. get all transcripts that have >= 1 splice site shared w/ a readthrough gene
tids = df.loc[df.gene_id_known.isin(fusion_gids), 'Name'].unique().tolist()
df = df.loc[df.Name.isin(tids)]

In [17]:
# 6. get how many unique genes share splice sites with this transcript
temp = df.loc[df.known==True].copy(deep=True)
temp = temp[['Name', 'gene_id_known']].groupby('Name').nunique().reset_index()
temp.rename({'gene_id_known': 'n_genes'}, axis=1, inplace=True)
df = df.merge(temp, how='left', on='Name')

In [18]:
# 7. get how many unique genes share splice sites with this transcript
temp = df.loc[df.known==True].copy(deep=True)
temp = temp.loc[~temp.gene_id_known.isin(fusion_gids)]
temp = temp[['Name', 'gene_id_known']].groupby('Name').nunique().reset_index()
temp.rename({'gene_id_known': 'n_genes_no_rt'}, axis=1, inplace=True)
df = df.merge(temp, how='left', on='Name')

In [19]:
# 8. limit to transcripts w/ n_genes_no_rt < n_genes
# and n_genes_no_rt == 1 (only one choice)
# where the gene that was merged with is the non-rt one
df = df.loc[(df.n_genes_no_rt==1)&(df.n_genes>df.n_genes_no_rt)]
df = df.loc[~df.gene_id_known.isin(fusion_gids)]
df.head()

Unnamed: 0,gene_id,Name,gene_id_known,known,n_ss,transcript_novelty,gene_novelty,n_genes,n_genes_no_rt
0,ENSG00000008128.22,TALONT000291637,ENSG00000008128,True,12,ISM,Known,2,1.0
2,ENSG00000008128.22,TALONT000291641,ENSG00000008128,True,12,ISM,Known,2,1.0
4,ENSG00000008128.22,TALONT000291742,ENSG00000008128,True,2,ISM,Known,2,1.0
6,ENSG00000008128.22,TALONT000292018,ENSG00000008128,True,6,ISM,Known,2,1.0
8,ENSG00000215790.7,TALONT000292468,ENSG00000215790,True,12,NNC,Known,2,1.0


In [20]:
# 9. get the number of splice sites supported by the transcript w/ the most
# shared splice sites for each transcript + annotated gene combination, 
# then merge in based on transcript from obs. transcripts
temp = f_ss_ic_df.copy(deep=True)
temp = temp[['Name', 'ss_id', 'gene_id_known', 'Name_known']].drop_duplicates()
temp = temp.groupby(['Name', 'gene_id_known', 'Name_known']).nunique().reset_index()
temp.rename({'ss_id': 'n_supp_ss',
             'Name_known': 'Name_supp'}, axis=1, inplace=True)
temp = temp.sort_values(by='n_supp_ss', ascending=False)
temp = temp.drop_duplicates(subset=['Name', 'gene_id_known'], keep='first')
df = df.merge(temp, how='left', on=['Name', 'gene_id_known'])

In [21]:
# 9.5. get the number of splice sites supported by the transcript w/ the most
# shared splice sites for each transcript + annotated gene combination, 
# then merge in based on transcript from obs. transcripts
temp = f_ss_ic_df.copy(deep=True)
temp = temp[['Name', 'ss_id', 'gene_id_known', 'Name_known']].drop_duplicates()
temp = temp.loc[temp.gene_id_known.isin(fusion_gids)]
temp = temp.groupby(['Name', 'gene_id_known', 'Name_known']).nunique().reset_index()
temp.rename({'ss_id': 'n_supp_ss_rt',
             'Name_known': 'Name_supp_rt',
             'gene_id_known': 'gene_id_known_rt'}, axis=1, inplace=True)
temp = temp.sort_values(by='n_supp_ss_rt', ascending=False)
temp = temp.drop_duplicates(subset=['Name'], keep='first')
df = df.merge(temp, how='left', on=['Name'])

In [22]:
# 10. remove entries where talon gene id is already
# the other choice
df['gid'] = cerberus.get_stable_gid(df, 'gene_id')
df = df.loc[df.gid!='gene_id_known']

In [23]:
# 11. get the total # of splice sites per annotated transcript
temp = k_ss_ic_df[['Name', 'ss_id']].groupby('Name').nunique().reset_index()
temp = temp.rename({'ss_id': 'n_supp_total_ss',
                                  'Name': 'Name_supp'}, axis=1)
df = df.merge(temp, how='left', on='Name_supp')
temp = temp.rename({'n_supp_total_ss': 'n_supp_total_ss_rt',
                    'Name_supp': 'Name_supp_rt'}, axis=1)
df = df.merge(temp, how='left', on='Name_supp_rt')

In [24]:
# 12. limit to obs. transcripts that were annotated to readthrough genes
df = df.loc[df.gid.isin(fusion_gids)]
len(df.index)

253

In [25]:
df['perc_supp_ss'] = (df.n_supp_ss/df.n_ss)*100
df['perc_supp_ss_rt'] = (df.n_supp_ss_rt/df.n_ss)*100
df['perc_supp_annot_ss'] = (df.n_supp_ss/df.n_supp_total_ss)*100
df['perc_supp_annot_ss_rt'] = (df.n_supp_ss_rt/df.n_supp_total_ss_rt)*100

In [26]:
temp = df.loc[df.gid.isin(fusion_gids)]
len(temp.Name.unique())

218

In [27]:
df['perc_supp_diff'] = df['perc_supp_ss']-df['perc_supp_ss_rt']
df['perc_supp_annot_diff'] = df['perc_supp_annot_ss']-df['perc_supp_annot_ss_rt']
df = df.loc[df.perc_supp_annot_ss.notnull()]

In [28]:
# 14. compute distances from transcript start and end to closest start and end from any transcript in
talon_df = pr.read_gtf(talon_gtf, rename_attr=True, duplicate_attr=True)
talon_df = talon_df.df
talon_df = talon_df.loc[talon_df.Feature=='transcript']

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'


In [29]:
known_df = pr.read_gtf(ref_gtf, rename_attr=True, duplicate_attr=True)
known_df = known_df.df
known_df = known_df.loc[known_df.Feature=='transcript']

In [31]:
known_df['gene_id'] = cerberus.get_stable_gid(known_df, 'gene_id')
# add coords to novel thing (df)
temp = talon_df[['transcript_id', 'Start', 'End', 'Chromosome', 'Strand']]
temp.rename({'transcript_id':'Name'}, axis=1, inplace=True)
df = df.merge(temp, how='left', on='Name')

In [33]:
df = df.drop_duplicates()
temp3 = pd.DataFrame()
temp3['Name'] = df.Name.tolist()
for c in ['gene_id_known', 'gene_id_known_rt']:
    for feat in ['End', 'Start']:
        temp2 = pd.DataFrame() 
        for g in df[c].unique().tolist():
            if feat == 'Start':
                other_feat = 'End'
            elif feat == 'End':
                other_feat = 'Start'
            temp_nov = df.loc[df[c] == g].copy(deep=True)
            temp_nov = temp_nov[['Name', 'Chromosome', feat, 'Strand']]
            temp_nov[other_feat] = temp_nov[feat]+1 

            temp_known = known_df.loc[known_df.gene_id == g].copy(deep=True)
            temp_known = temp_known[['Chromosome', feat, 'Strand']]
            temp_known[other_feat] = temp_known[feat]+1 


            temp_nov = pr.PyRanges(temp_nov)
            temp_known = pr.PyRanges(temp_known)

            # pyranges join and get closest; concat in w/ temp2s
            temp_nov = temp_nov.k_nearest(temp_known, k=1, 
                               overlap=True,
                               how=None, suffix='_known').df
            try:
                temp_nov = temp_nov[['Name', 'Distance']].drop_duplicates()
            except:
                import pdb; pdb.set_trace()
            d_col = f'{c}_{feat}_dist'
            temp_nov.rename({'Distance': d_col}, axis=1, inplace=True)        
            # if len(temp_nov.loc[temp_nov[d_col].isnull()].index) > 0:
            #     import pdb; pdb.set_trace()
            temp2 = pd.concat([temp2, temp_nov], axis=0)
        temp3 = temp3.merge(temp2, on='Name', how='left')    

In [34]:
df = df.merge(temp3, how='left', on='Name')

In [35]:
# add abs. values
cols = ['gene_id_known_End_dist',
        'gene_id_known_Start_dist',
        'gene_id_known_rt_End_dist',
        'gene_id_known_rt_Start_dist']
for c in cols:
    new_col = f'{c}_abs'
    df[new_col] = df[c].abs()

In [37]:
# merge with my labels
temp = pd.read_csv('labelled_readthrough_fix_candidates.tsv', sep='\t')
df = df.merge(temp, how='left', on='Name')
df.to_csv('labelled_readthrough_fix_candidates_with_features.tsv', sep='\t', index=False)

In [38]:
df = pd.read_csv('labelled_readthrough_fix_candidates_with_features.tsv', sep='\t')

In [39]:
# implement DecisionTree classifier in pandas format w/ thresholds
df = pd.read_csv('labelled_readthrough_fix_candidates_with_features.tsv', sep='\t')
df['fix'] = False

# start of closest rt transcript must be > 510 bp away
inds = df.loc[df.gene_id_known_rt_Start_dist_abs > 510].index
df.loc[inds, 'fix'] = True

# end of the closest rt transcript must be > 45 kbp away
inds = df.loc[df.gene_id_known_rt_End_dist_abs > 45000].index
df.loc[inds, 'fix'] = True

# % of sss annotated in best matching non-rt transcripts
# must be >94
# % of sss annotated in actual transcript mush be > 84%
inds = df.loc[(df.perc_supp_annot_ss > 94)&(df.perc_supp_annot_ss > 84)].index
df.loc[inds, 'fix'] = True

df[['fix_needed', 'fix', 'Name']].groupby(['fix_needed', 'fix']).count().reset_index()

Unnamed: 0,fix_needed,fix,Name
0,could go either way,True,1
1,lost cause,False,5
2,lost cause,True,12
3,no,False,9
4,no,True,7
5,yes,True,41


In [40]:
tim_gene = 'ENSG00000156873'
df.loc[df.gene_id_known==tim_gene]

Unnamed: 0,gene_id,Name,gene_id_known,known,n_ss,transcript_novelty,gene_novelty,n_genes,n_genes_no_rt,Name_supp,...,gene_id_known_End_dist,gene_id_known_Start_dist,gene_id_known_rt_End_dist,gene_id_known_rt_Start_dist,gene_id_known_End_dist_abs,gene_id_known_Start_dist_abs,gene_id_known_rt_End_dist_abs,gene_id_known_rt_Start_dist_abs,fix_needed,fix
73,ENSG00000260899.1,TALONT000637494,ENSG00000156873,True,18,ISM,Known,2,1.0,ENSG00000156873_1,...,-2,112,-5986,-7491,2,112,5986,7491,yes,True
74,ENSG00000260899.1,TALONT000637495,ENSG00000156873,True,14,ISM,Known,2,1.0,ENSG00000156873_1,...,-2,64,-5986,-7539,2,64,5986,7539,,True
75,ENSG00000260899.1,TALONT000637497,ENSG00000156873,True,16,ISM,Known,2,1.0,ENSG00000156873_1,...,-2,10,-5986,-7593,2,10,5986,7593,yes,True
76,ENSG00000260899.1,TALONT000637508,ENSG00000156873,True,16,NIC,Known,2,1.0,ENSG00000156873_5,...,-2,10,-5986,-7593,2,10,5986,7593,yes,True
77,ENSG00000260899.1,TALONT000637511,ENSG00000156873,True,16,ISM,Known,2,1.0,ENSG00000156873_3,...,-2,10,-5986,-7593,2,10,5986,7593,yes,True
78,ENSG00000260899.1,TALONT000637622,ENSG00000156873,True,10,ISM,Known,2,1.0,ENSG00000156873_2,...,-502,0,-2674,-7735,502,0,2674,7735,,True
79,ENSG00000260899.1,TALONT000637792,ENSG00000156873,True,16,ISM,Known,2,1.0,ENSG00000156873_3,...,-2,0,-5986,-7735,2,0,5986,7735,yes,True
80,ENSG00000260899.1,TALONT000638355,ENSG00000156873,True,16,ISM,Known,2,1.0,ENSG00000156873_3,...,-2,0,-5986,-7884,2,0,5986,7884,yes,True


In [45]:
temp = df.loc[df.fix == True]
temp.to_csv('labelled_readthrough_genes_need_fixing.tsv', sep='\t', index=False)

In [54]:
temp = pd.read_csv('labelled_readthrough_genes_need_fixing.tsv', sep='\t')
assert len(temp.loc[(temp.perc_supp_ss==100)&(temp.perc_supp_annot_ss==100)].index) == 0

In [55]:
gtf_df = pr.read_gtf(talon_gtf, duplicate_attr=True, rename_attr=True)
gtf_df = gtf_df.df
gtf_df['gid_stable'] = cerberus.get_stable_gid(gtf_df, 'gene_id')

ref_gtf_df = pr.read_gtf(ref_gtf, duplicate_attr=True, rename_attr=True)
ref_gtf_df = ref_gtf_df.df
ref_gtf_df['gid_stable'] = cerberus.get_stable_gid(ref_gtf_df, 'gene_id')

Found attributes with reserved names: ['source'].
Renaming attributes with suffix '_attr'


In [56]:
temp.columns

Index(['gene_id', 'Name', 'gene_id_known', 'known', 'n_ss',
       'transcript_novelty', 'gene_novelty', 'n_genes', 'n_genes_no_rt',
       'Name_supp', 'n_supp_ss', 'gene_id_known_rt', 'Name_supp_rt',
       'n_supp_ss_rt', 'gid', 'n_supp_total_ss', 'n_supp_total_ss_rt',
       'perc_supp_ss', 'perc_supp_ss_rt', 'perc_supp_annot_ss',
       'perc_supp_annot_ss_rt', 'perc_supp_diff', 'perc_supp_annot_diff',
       'Start', 'End', 'Chromosome', 'Strand', 'gene_id_known_End_dist',
       'gene_id_known_Start_dist', 'gene_id_known_rt_End_dist',
       'gene_id_known_rt_Start_dist', 'gene_id_known_End_dist_abs',
       'gene_id_known_Start_dist_abs', 'gene_id_known_rt_End_dist_abs',
       'gene_id_known_rt_Start_dist_abs', 'fix_needed', 'fix'],
      dtype='object')

In [57]:
# update the genes that need fixing to the genes that were non-rt that they 
# intersected with
gene_cols = ['Source', 'gene_id', 'gene_name', 'gene_status', 'gene_type', 'talon_gene', 'havana_gene', 'level',
             'antisense_gene', 'gene_antisense_to_IDs', 'intergenic_novel', 'fusion_novel']
talon_gene_cols = ['gene_status', 'talon_gene', 'antisense_gene', 'gene_antisense_to_IDs', 'intergenic_novel', 'fusion_novel']
for ind, entry in temp.iterrows():
    t = entry.Name
    inds = gtf_df.loc[gtf_df.transcript_id==t].index
    g = entry.gene_id_known
    dummy_gene_entry = gtf_df.loc[(gtf_df.gid_stable==g)&(gtf_df.Feature=='gene')]
    if len(dummy_gene_entry.index) == 0:
        # have to pull from reference gtf instead and add corresponding gene entry
        dummy_gene_entry = ref_gtf_df.loc[(ref_gtf_df.gid_stable==g)&(ref_gtf_df.Feature=='gene')]
        dummy_gene_entry[talon_gene_cols] = np.nan
        gtf_df = pd.concat([gtf_df, dummy_gene_entry], axis=0)
    try:
        assert len(dummy_gene_entry.index) == 1
    except:
        import pdb; pdb.set_trace()
    for c in gene_cols:
        gtf_df.loc[inds, c] = dummy_gene_entry[c].values[0]

In [60]:
# drop stable gid, sort, update ends, and dump
gtf_df.drop('gid_stable', axis=1, inplace=True)
gtf_df = cerberus.sort_gtf(gtf_df)
# mainly ripped out of cerberus code
gtf_temp = gtf_df.copy(deep=True)
for mode in ['tss', 'tes']:
    fwd, rev = cerberus.get_stranded_gtf_dfs(gtf_temp)
    df = pd.DataFrame()
    for strand, temp in zip(['+', '-'], [fwd, rev]):

        # fix gene boundaries
        temp = cerberus.update_gene_ends(temp, mode, strand)
        df = pd.concat([df, temp], ignore_index=True)

    gtf_temp = df.copy(deep=True)
pr.PyRanges(gtf_temp).to_gtf(ofile)

In [59]:
ofile = 'annot_readthrough_corrected.gtf'
pr.PyRanges(gtf_temp).to_gtf(ofile)

## Also update the gene abundance matrix with the correct new gene assignments for these transcripts

In [79]:
temp = pd.read_csv('labelled_readthrough_genes_need_fixing.tsv', sep='\t')
assert len(temp.loc[(temp.perc_supp_ss==100)&(temp.perc_supp_annot_ss==100)].index) == 0
temp = temp[['Name', 'gene_id_known']]
temp.rename({'Name': 'annot_transcript_id',
             'gene_id_known': 'gid_stable'},
            axis=1, inplace=True)
temp.head()

Unnamed: 0,annot_transcript_id,gid_stable
0,TALONT001122495,ENSG00000143543
1,TALONT000930061,ENSG00000116783
2,TALONT000930071,ENSG00000116783
3,TALONT000930083,ENSG00000116783
4,TALONT000983329,ENSG00000156875


In [92]:
df = pd.read_csv(talon_ab, sep='\t')
print(len(df.index))

5395598


In [71]:
df['gid_stable'] = cerberus.get_stable_gid(df, 'annot_gene_id')
df[df.columns[:11]].head()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,ISM_subtype
0,3,4,ENSG00000278267.1,ENST00000619216.1,MIR6859-1,MIR6859-1-201,1,68,Known,Known,
1,4,6,ENSG00000243485.5,ENST00000469289.1,MIR1302-2HG,MIR1302-2HG-201,2,535,Known,Known,
2,6,8,ENSG00000237613.2,ENST00000417324.1,FAM138A,FAM138A-201,3,1187,Known,Known,
3,6,9,ENSG00000237613.2,ENST00000461467.1,FAM138A,FAM138A-202,2,590,Known,Known,
4,10,19,ENSG00000238009.6,ENST00000453576.2,AL627309.1,AL627309.1-204,2,336,Known,Known,


In [80]:
gene_cols = ['gene_ID', 'annot_gene_id', 'annot_gene_name', 'gene_novelty', 'gid_stable']
gene_info = df[gene_cols].drop_duplicates()
gene_info.head()

Unnamed: 0,gene_ID,annot_gene_id,annot_gene_name,gene_novelty,gid_stable
0,3,ENSG00000278267.1,MIR6859-1,Known,ENSG00000278267
1,4,ENSG00000243485.5,MIR1302-2HG,Known,ENSG00000243485
2,6,ENSG00000237613.2,FAM138A,Known,ENSG00000237613
4,10,ENSG00000238009.6,AL627309.1,Known,ENSG00000238009
5,12,ENSG00000233750.3,CICP27,Known,ENSG00000233750


In [81]:
temp = temp.merge(gene_info, how='left', on='gid_stable')
temp.head()

Unnamed: 0,annot_transcript_id,gid_stable,gene_ID,annot_gene_id,annot_gene_name,gene_novelty
0,TALONT001122495,ENSG00000143543,3272,ENSG00000143543.14,JTB,Known
1,TALONT000930061,ENSG00000116783,1978,ENSG00000116783.14,TNNI3K,Known
2,TALONT000930071,ENSG00000116783,1978,ENSG00000116783.14,TNNI3K,Known
3,TALONT000930083,ENSG00000116783,1978,ENSG00000116783.14,TNNI3K,Known
4,TALONT000983329,ENSG00000156875,2365,ENSG00000156875.13,MFSD14A,Known


In [82]:
temp.drop('gid_stable', axis=1, inplace=True)

In [83]:
df = df.merge(temp, how='left', on='annot_transcript_id', suffixes=('', '_fix'))
df.head()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,k562_3_2,lower_lobe_of_right_lung_1_1,ovary_2_1,pgp1_endo_1_1,right_cardiac_atrium_3_1,gid_stable,gene_ID_fix,annot_gene_id_fix,annot_gene_name_fix,gene_novelty_fix
0,3,4,ENSG00000278267.1,ENST00000619216.1,MIR6859-1,MIR6859-1-201,1,68,Known,Known,...,0,0,0,0,0,ENSG00000278267,,,,
1,4,6,ENSG00000243485.5,ENST00000469289.1,MIR1302-2HG,MIR1302-2HG-201,2,535,Known,Known,...,0,0,0,0,0,ENSG00000243485,,,,
2,6,8,ENSG00000237613.2,ENST00000417324.1,FAM138A,FAM138A-201,3,1187,Known,Known,...,0,0,0,0,0,ENSG00000237613,,,,
3,6,9,ENSG00000237613.2,ENST00000461467.1,FAM138A,FAM138A-202,2,590,Known,Known,...,0,0,0,0,0,ENSG00000237613,,,,
4,10,19,ENSG00000238009.6,ENST00000453576.2,AL627309.1,AL627309.1-204,2,336,Known,Known,...,0,0,0,0,0,ENSG00000238009,,,,


In [85]:
df.loc[df.gene_ID_fix.notnull()].head()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,k562_3_2,lower_lobe_of_right_lung_1_1,ovary_2_1,pgp1_endo_1_1,right_cardiac_atrium_3_1,gid_stable,gene_ID_fix,annot_gene_id_fix,annot_gene_name_fix,gene_novelty_fix
132226,37107,242447,ENSG00000250709.1,TALONT000242447,CCDC169-SOHLH2,TALONT000242447,4,1562,Known,ISM,...,1,0,0,0,0,ENSG00000250709,37108.0,ENSG00000242715.7,CCDC169,Known
133785,40722,244006,ENSG00000187951.11,TALONT000244006,AC091057.1,TALONT000244006,14,2783,Known,NIC,...,3,0,0,0,0,ENSG00000187951,40720.0,ENSG00000285035.1,AC091057.7,Known
158514,45142,268735,ENSG00000262304.2,TALONT000268735,AC027796.3,TALONT000268735,6,2939,Known,NIC,...,2,0,1,0,0,ENSG00000262304,45144.0,ENSG00000197417.7,SHPK,Known
161398,45149,271619,ENSG00000257950.3,TALONT000271619,P2RX5-TAX1BP3,TALONT000271619,12,2291,Known,ISM,...,1,0,0,0,2,ENSG00000257950,45152.0,ENSG00000083454.21,P2RX5,Known
193346,40992,303567,ENSG00000168970.22,TALONT000303567,JMJD7-PLA2G4B,TALONT000303567,20,5541,Known,ISM,...,1,4,5,0,0,ENSG00000168970,40993.0,ENSG00000243708.10,PLA2G4B,Known


In [89]:
fix_inds = df.loc[df.gene_ID_fix.notnull()].index
gene_cols = [g for g in gene_cols if g!='gid_stable']
for g in gene_cols:
    fix_col = f'{g}_fix'
    df.loc[fix_inds, g] = df.loc[fix_inds, fix_col]
    df.drop(fix_col, axis=1, inplace=True)

In [90]:
df.loc[df.transcript_ID==242447]

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,h9_panc_progen_1_2,hepg2_2_1,hl60_m1_12hr_1_2,hl60_m2_72hr_1_2,k562_3_2,lower_lobe_of_right_lung_1_1,ovary_2_1,pgp1_endo_1_1,right_cardiac_atrium_3_1,gid_stable
132226,37108.0,242447,ENSG00000242715.7,TALONT000242447,CCDC169,TALONT000242447,4,1562,Known,ISM,...,0,0,0,0,1,0,0,0,0,ENSG00000250709


In [91]:
len(df.index)

5395598

In [None]:
# ok, think this is working, now need to change the whole thing into a function
# to run systematically