In [45]:
import pyranges as pr
import pandas as pd
import cerberus

def make_hier_entry(df, how='t'):
    """
    kind {'g','t'}
    """
    agg_dict = {'min_coord': 'min', 'max_coord': 'max'}
    t_df = df.copy(deep=True)
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)
    if how == 't':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id', 'transcript_id', 'transcript_name',
                   'tss_id', 'tes_id',
                   'new_transcript_id', 'original_transcript_id',
                   'original_transcript_name', 'ag1', 'ag2']
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']
    gb_cols = list(set(gb_cols)&(set(t_df.columns)))

    cols = gb_cols + ['min_coord', 'max_coord']
    t_df = t_df[cols]
    t_df = t_df.groupby(gb_cols, observed=True).agg(agg_dict).reset_index()
    t_df.rename({'min_coord': 'Start', 'max_coord': 'End'}, axis=1, inplace=True)
    if how == 't':
        t_df['Feature'] = 'transcript'
    elif how == 'g':
        t_df['Feature'] = 'gene'

    return t_df

gtf = '../../data/transcripts_novel_gene_loci.gtf'
tsv = '../../data/240909merge_associatedgene2isoform_noambigousISM_FSM_genic.tsv'

df = pd.read_csv(tsv, sep='\t', header=None)
df.columns = ['tid', 'gid']
# remove "novelGenes" from sqanti mappings
df = pd.read_csv(tsv, sep='\t', header=None)
df.columns = ['tid', 'gid']
l1 = len(df.index)
df = df.loc[~df.gid.str.startswith('novelGene')]
l2 = len(df.index)
assert l1 != l2

gtf_df = pr.read_gtf(gtf).df

# novel genes separate, remove gene entries
nov_gene = gtf_df.loc[gtf_df.gene_id.notnull()].copy(deep=True)
nov_gene['gene_name'] = nov_gene['gene_id']
nov_gene['Source'] = 'ChatGPT'
nov_gene = nov_gene.loc[nov_gene.Feature!='gene']

# known genes
gtf_df = gtf_df.loc[gtf_df.gene_id.isnull()].copy(deep=True)

# filter based on whether transcript is even in the dataset
gtf_df = gtf_df.loc[gtf_df.transcript_id.isin(df.tid.tolist())]

# get dict mapping tid:gid
tg_dict = dict([(entry['tid'], entry['gid']) for ind, entry in df.iterrows()])
tg_dict[list(tg_dict.keys())[0]]

# add gene id just for known genes
gtf_df['gene_id'] = gtf_df.transcript_id.map(tg_dict)
gtf_df['gene_name'] = gtf_df['gene_id']
assert len(gtf_df.loc[gtf_df.gene_id.isnull()].index) == 0

# cat the novel and known thing together
gtf_df = pd.concat([gtf_df, nov_gene], axis=0)
assert len(gtf_df.loc[gtf_df.gene_id.isnull()].index) == 0

# add gene entries
df = gtf_df.copy(deep=True)
l1 = len(df.gene_id.unique().tolist())
g_df = make_hier_entry(df, how='g')
g_df['Source'] = 'ChatGPT'
g_df['Frame'] = '.'
g_df['Score'] = '.'
l2 = len(g_df.loc[g_df.Feature=='gene'].index)
assert l1 == l2

print(len(gtf_df.index))
df = pd.concat([gtf_df, g_df], axis=0)
print(len(df.index))

df = cerberus.sort_gtf(df)


assert len(df.loc[df.Feature=='gene']) == len(df.gene_id.unique())


# save
out_gtf = '../../data/novel_gene/transcripts_novel_gene_loci_filt_gene_name.gtf'
df = pr.PyRanges(df)
df.to_gtf(out_gtf)


In [47]:
len(gtf_df.gene_id.unique())

32830

In [49]:
len(g_df.index)

32830

2306419
2339249


32830

In [53]:
temp = df[['Feature', 'gene_id']]
temp = temp.loc[temp.Feature == 'gene']
temp.loc[temp.gene_id.duplicated(keep=False)].sort_values(by='gene_id').head()

Unnamed: 0,Feature,gene_id


In [34]:
temp = nov_gene[['Feature', 'gene_id']]
temp = temp.loc[temp.Feature == 'gene']
temp.loc[temp.gene_id.duplicated(keep=False)].sort_values(by='gene_id').head()

Unnamed: 0,Feature,gene_id


In [32]:
df.loc[df.gene_id=='LOC_000000000000']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,transcript_id,samples,exon_number,gene_id,gene_name
1652629,chr14,novel_gene,gene,76978623,76979303,.,-,.,,,,LOC_000000000000,LOC_000000000000
1652630,chr14,ChatGPT,gene,76978623,76979303,.,-,.,,,,LOC_000000000000,LOC_000000000000
1652631,chr14,novel_gene,transcript,76978623,76979303,.,-,.,transcript_250596,,,LOC_000000000000,LOC_000000000000
1652632,chr14,ChatGPT,exon,76979193,76979303,.,-,.,transcript_250596,espresso_GM10495_1,1.0,LOC_000000000000,LOC_000000000000
1652633,chr14,ChatGPT,exon,76978623,76978862,.,-,.,transcript_250596,espresso_GM10495_1,2.0,LOC_000000000000,LOC_000000000000


In [None]:
gtf_df.tail()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,transcript_id,samples,exon_number,gene_id,gene_name
2352934,chrY,ChatGPT,exon,10902209,10902292,.,-,.,transcript_380644,lyric_GM18489_1,4.0,LOC_000000004828,LOC_000000004828
2352935,chrY,novel_gene,gene,3218571,3220711,.,-,.,,,,LOC_000000007043,LOC_000000007043
2352936,chrY,novel_gene,transcript,3218571,3220711,.,-,.,transcript_380845,,,LOC_000000007043,LOC_000000007043
2352937,chrY,ChatGPT,exon,3220633,3220711,.,-,.,transcript_380845,"espresso_HG02261_1,espresso_GM18486_1",1.0,LOC_000000007043,LOC_000000007043
2352938,chrY,ChatGPT,exon,3218571,3220532,.,-,.,transcript_380845,"espresso_HG02261_1,espresso_GM18486_1",2.0,LOC_000000007043,LOC_000000007043


In [None]:
len(gtf_df.gene_id.unique())

32830

In [40]:
len(g_df.gene_id.unique())

32830

In [41]:
len(g_df)

32830

In [27]:
g_df['gene_id_new'] = g_df['gene_name']+'_'+\
                      g_df['Chromosome'].astype(str)+'_'+\
                      g_df.Strand.astype(str)

In [28]:
g_df.loc[g_df.gene_id.duplicated(keep=False)].sort_values(by='gene_id').head()

Unnamed: 0,gene_id,Chromosome,gene_name,Strand,Start,End,Feature,gene_id_new
15546,ENSG00000234394,chr9,ENSG00000234394,-,41100793,41109434,gene,ENSG00000234394_chr9_-
15547,ENSG00000234394,chr9,ENSG00000234394,+,68306527,68330436,gene,ENSG00000234394_chr9_+
19207,ENSG00000274487,chr17,ENSG00000274487,-,38153219,38257394,gene,ENSG00000274487_chr17_-
19208,ENSG00000274487,chr17,ENSG00000274487,+,38078261,38124770,gene,ENSG00000274487_chr17_+
21178,ENSG00000291150,chr1,ENSG00000291150,+,144715581,144758303,gene,ENSG00000291150_chr1_+


In [29]:
# is duplication happening to "novel" genes? 
g_df.loc[(g_df.gene_id.duplicated(keep=False))&\
         (g_df.gene_id.str.startswith('LOC'))].sort_values(by='gene_id').head()

Unnamed: 0,gene_id,Chromosome,gene_name,Strand,Start,End,Feature,gene_id_new


In [30]:
temp = g_df[['gene_id', 'Chromosome', 'Strand']].groupby('gene_id').nunique().reset_index()

In [31]:
# only dupes
temp = temp.loc[(temp.Chromosome>1)|(temp.Strand>1)]

In [32]:
len(temp.index)

29

In [33]:
temp.head()

Unnamed: 0,gene_id,Chromosome,Strand
15546,ENSG00000234394,1,2
19206,ENSG00000274487,1,2
21176,ENSG00000291150,2,2
21234,ENSG00000292327,2,1
21235,ENSG00000292331,2,1


In [None]:
g_df.loc[g_df.gene_id=='ENSG00000291150']

Unnamed: 0,gene_id,Chromosome,gene_name,Strand,Start,End,Feature,gene_id_new
21178,ENSG00000291150,chr1,ENSG00000291150,+,144715581,144758303,gene,ENSG00000291150_chr1_+
21179,ENSG00000291150,chr2,ENSG00000291150,-,91586930,91660038,gene,ENSG00000291150_chr2_-


In [None]:
# are there gene names or gene ids?
df = pd.read_csv(tsv, sep='\t', header=None)
df.columns = ['tid', 'gid']

In [None]:
df.loc[(~df.gid.str.startswith('ENSG'))&\
       (~df.gid.str.startswith('novelGene'))].head()

Unnamed: 0,tid,gid
1219,transcript_359,ISG15_ENSG00000303623
1332,transcript_472,DPH5-DT_SLC30A7
1661,transcript_801,RNPC3_AMY2B
1722,transcript_862,CENPS_CORT
1725,transcript_865,PEX14_CENPS-CORT


In [None]:
# what are these?
print(len(df.loc[df.gid.str.startswith('novelGene')]))
df.loc[df.gid.str.startswith('novelGene')].head()

19759


Unnamed: 0,tid,gid
39,transcript_190125,novelGene_1
40,transcript_190126,novelGene_2
216,transcript_190169,novelGene_3
273,transcript_161,novelGene_ENSG00000307088_AS
274,transcript_162,novelGene_5


In [None]:
tids = df.loc[df.gid=='ENSG00000291150', 'tid'].tolist()

In [20]:
gtf_df.loc[gtf_df.transcript_id.isin(tids)][['Chromosome', 'Strand']].drop_duplicates()

Unnamed: 0,Chromosome,Strand
20576,chr1,+
421396,chr2,-


In [21]:
temp.head()

Unnamed: 0,gene_id,Chromosome,Strand
15546,ENSG00000234394,1,2
19206,ENSG00000274487,1,2
21176,ENSG00000291150,2,2
21234,ENSG00000292327,2,1
21235,ENSG00000292331,2,1
