In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *
from scripts.sm_utils import *

In [12]:
# load the project configuration; load_config is not defined in this notebook
# (presumably it comes from one of the scripts.* star imports -- confirm)
config = load_config()
# prefix that gets prepended to config-derived paths in later cells;
# left empty here so the config paths are used unchanged
od = ''

def proc_cfg(entry, od):
    """
    Rewrite a config path so it sits under a different directory prefix.

    Every occurrence of '../../' in `entry` is removed (not only a
    leading one) and `od` is then prepended to what is left.

    Parameters:
        entry (str): Path string taken from the config
        od (str): Prefix to put in front of the cleaned path

    Returns:
        str: `od` followed by `entry` without any '../../'
    """
    stripped = entry.replace('../../', '')
    return od + stripped

In [42]:
def make_hier_entry(df, how='t'):
    """
    Collapse GTF-style rows into one transcript-level or gene-level
    entry per group, spanning the smallest to the largest coordinate
    found among the rows of that group.

    Parameters:
        df (pandas DataFrame): Table with 'Start' and 'End' columns plus
            at least one of the identifier columns listed below
        how (str): {'g', 't'}. 'g' groups on the gene-level identifier
            columns, 't' on the transcript-level ones. Identifier
            columns that are absent from `df` are skipped.

    Returns:
        pandas DataFrame: One row per group holding the grouping
            columns (in the order they are declared below), 'Start',
            'End', and a 'Feature' column set to 'gene' or 'transcript'

    Raises:
        ValueError: If `how` is not 'g' or 't', or if `df` contains none
            of the identifier columns for the requested level

    Note: pandas groupby drops rows whose grouping keys contain NaN
    (default dropna=True), so such rows do not contribute to the output.
    """
    feature_by_level = {'t': 'transcript', 'g': 'gene'}
    gb_cols_by_level = {
        't': ['Chromosome', 'Strand', 'gene_name',
              'gene_id', 'transcript_id', 'transcript_name',
              'tss_id', 'tes_id',
              'new_transcript_id', 'original_transcript_id',
              'original_transcript_name', 'ag1', 'ag2'],
        'g': ['Chromosome', 'Strand', 'gene_name',
              'gene_id'],
    }

    # fail with a clear message instead of an unbound-variable error
    if how not in feature_by_level:
        raise ValueError(f"how must be 'g' or 't', got {how!r}")

    # keep only the identifier columns that exist, in declared order;
    # a set intersection here would make the output column order
    # depend on string hashing and differ between kernel sessions
    gb_cols = [c for c in gb_cols_by_level[how] if c in df.columns]
    if not gb_cols:
        raise ValueError('df has none of the columns needed to group '
                         f"with how={how!r}")

    # copy just the columns we need so the caller's frame is untouched
    t_df = df[gb_cols + ['Start', 'End']].copy()

    # take min / max across both columns so the result does not depend
    # on Start being the smaller coordinate of a row
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)

    t_df = (t_df[gb_cols + ['min_coord', 'max_coord']]
            .groupby(gb_cols, observed=True)
            .agg({'min_coord': 'min', 'max_coord': 'max'})
            .reset_index()
            .rename(columns={'min_coord': 'Start', 'max_coord': 'End'}))
    t_df['Feature'] = feature_by_level[how]

    return t_df

In [13]:
# relative path to the config.tsv table one directory up;
# not referenced by any of the cells visible in this notebook
meta_file = '../config.tsv'

In [14]:
# input annotation paths: config entries with the `od` prefix in front
ref_gtf_entry = config['ref']['gtf']
v47_file = od + ref_gtf_entry
lr_gtf_entry = config['lr']['gtf_filt_with_genes']
poder_file = od + lr_gtf_entry

In [35]:
# parse both annotation GTFs with pyranges, reference first
gc_df, p_df = (pr.read_gtf(gtf_path) for gtf_path in (v47_file, poder_file))

In [18]:
# get list of ics from each
gc_ics = cerberus.get_ic(gc_df)
p_ics = cerberus.get_ic(p_df)

In [32]:
# get uniq ICs (gene+chrom+ic)
# to poder that are not in gc
gc_ics['id'] = gc_ics['Chromosome']+'_'+\
               gc_ics['gene_id']+'_'+\
               gc_ics['ic']
p_ics['id'] = p_ics['Chromosome']+'_'+\
               p_ics['gene_id']+'_'+\
               p_ics['ic']
p_ics['in_gc'] = p_ics['id'].isin(gc_ics['id'].tolist())
p_ics[['in_gc', 'transcript_id']].groupby('in_gc').count()
nov_tids = p_ics.loc[~p_ics['id'].isin(gc_ics['id'])].transcript_id.tolist()

In [37]:
# filter poder gtf based on just novel transcripts
p_df = p_df.df
p_df = p_df.loc[p_df.transcript_id.isin(nov_tids)]
assert len(p_df.loc[p_df.transcript_id.notnull()].transcript_id.unique())==len(nov_tids)

In [39]:
# restrict to just transcript + exon entries
gc_df = gc_df.df
gc_df = gc_df.loc[gc_df.Feature.isin(['transcript', 'exon'])]
p_df = p_df.loc[p_df.Feature.isin(['transcript', 'exon'])]

In [43]:
# concatenate different dfs
df = pd.concat([gc_df, p_df], axis=0)

In [47]:
# drop gene name entries cause all they do is screw everything up
df.drop('gene_name', axis=1, inplace=True)

In [48]:
# # make sure that we don't have duplicates from gene name / gene id combos
# l1 = len(df[['gene_name', 'gene_id']].drop_duplicates())
# l2 = len(df.gene_id.unique())
# print(l1)
# print(l2)

In [49]:
# temp = df[['gene_name', 'gene_id']].drop_duplicates()
# temp.loc[temp.gene_id.duplicated(keep=False)].sort_values(by='gene_id').head()

In [50]:
# make new gene entries for everything
l1 = len(df.gene_id.unique().tolist())
# make gene entry
g_df = make_hier_entry(df, how='g')

g_df['Source'] = 'v47_poder'
g_df['Frame'] = '.'
g_df['Score'] = '.'
l2 = len(g_df.loc[g_df.Feature=='gene'].index)
assert l1 == l2

# concat them and then sort gtf
df = pd.concat([df, g_df], axis=0)
df = cerberus.sort_gtf(df)

In [51]:
# preview the first rows of the combined, sorted table
df.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl,samples
0,chrX,v47_poder,gene,100584935,100599885,.,+,.,ENSG00000000005.6,,...,,,,,,,,,,
1,chrX,HAVANA,transcript,100584935,100599885,.,+,.,ENSG00000000005.6,protein_coding,...,,1.0,OTTHUMT00000057481.2,HGNC:17757,OTTHUMG00000022001.2,,ENSP00000362122.4,CCDS14469.1,,
2,chrX,HAVANA,exon,100584935,100585066,.,+,.,ENSG00000000005.6,protein_coding,...,ENSE00001459371.5,1.0,OTTHUMT00000057481.2,HGNC:17757,OTTHUMG00000022001.2,,ENSP00000362122.4,CCDS14469.1,,
3,chrX,HAVANA,exon,100585230,100585362,.,+,.,ENSG00000000005.6,protein_coding,...,ENSE00000401061.1,1.0,OTTHUMT00000057481.2,HGNC:17757,OTTHUMG00000022001.2,,ENSP00000362122.4,CCDS14469.1,,
4,chrX,HAVANA,exon,100593894,100594035,.,+,.,ENSG00000000005.6,protein_coding,...,ENSE00000673400.1,1.0,OTTHUMT00000057481.2,HGNC:17757,OTTHUMG00000022001.2,,ENSP00000362122.4,CCDS14469.1,,


In [56]:
# make sure we've added the same number of transcripts
n_gc_t = len(gc_df.loc[gc_df.transcript_id.notnull()].transcript_id.unique())
n_nov_t = len(nov_tids)
n_gc_p_t = len(df.loc[df.transcript_id.notnull()].transcript_id.unique())

print(n_gc_t)
print(n_nov_t)
print(n_gc_p_t)
assert n_gc_t+n_nov_t == n_gc_p_t

In [59]:
# wrap the merged table in a PyRanges object and write it out as GTF.
# NOTE(review): `output` is not defined in any cell visible here -- it looks
# like a Snakemake-style object; confirm where it comes from, otherwise this
# cell raises NameError on a fresh kernel.
# NOTE(review): `df` is rebound to a PyRanges here, so re-running this cell
# passes a PyRanges (not a DataFrame) to pr.PyRanges -- verify that is safe.
df = pr.PyRanges(df)
df.to_gtf(output.gtf)

pandas.core.frame.DataFrame