In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Put the parent of the current working directory on sys.path so that the
# project-local `scripts` package imported below can be resolved.
p = os.path.dirname(os.getcwd())
if p not in sys.path:  # keep re-runs of this cell from piling up duplicate entries
    sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# metadata
# get sampleId, indId, group, batch, run from metadata tables
meta_file = '../snakemake/mage/filereport_read_run_PRJNA851328_tsv.txt'
meta = pd.read_csv(meta_file, sep='\t')

# experiment_alias looks like '<sample>_<batch>_<rep>' (e.g. NA19704_batch11_rep1);
# split it once and reuse the pieces for the sample / individual ID and the batch.
alias_parts = meta.experiment_alias.str.split('_', expand=True)
meta['sampleId'] = meta['indId'] = alias_parts[0]
meta['batch'] = alias_parts[1]

# per-sample annotations: read as space-delimited with no header row,
# so the column names are assigned by hand
meta_file = '../snakemake/mage/sample_metadata_parsed.tsv'
df2 = pd.read_csv(meta_file, sep=' ', header=None)
df2.columns = ['sampleId', 'pop', 'continent', 'sex', 'sth', 'sth2']

# left join keeps every run from the run-level table, annotated where a sample matches
meta = meta.merge(df2, how='left',
              on='sampleId')
meta['group'] = 1 # all the same group because we don't want to call sqtls separately for anything
meta.head()

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,instrument_model,experiment_alias,fastq_bytes,...,bam_bytes,sampleId,indId,batch,pop,continent,sex,sth,sth2,group
0,SRR19762167,SAMN29221552,SRX15806983,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19704_batch11_rep1,1579585821;1583976496,...,,NA19704,NA19704,batch11,ASW,AFR,female,SAMN29221552,SRS13499000,1
1,SRR19762172,SAMN29221523,SRX15806979,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19332_batch14_rep1,1969985537;1972458971,...,,NA19332,NA19332,batch14,LWK,AFR,female,SAMN29221523,SRS13498996,1
2,SRR19762173,SAMN29221521,SRX15806977,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19317_batch15_rep1,1387073734;1388216521,...,,NA19317,NA19317,batch15,LWK,AFR,male,SAMN29221521,SRS13498994,1
3,SRR19762174,SAMN29221520,SRX15806976,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19312_batch05_rep1,2814206811;2808461053,...,,NA19312,NA19312,batch05,LWK,AFR,male,SAMN29221520,SRS13498993,1
4,SRR19762175,SAMN29221522,SRX15806978,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19323_batch11_rep1,1589064552;1592540862,...,,NA19323,NA19323,batch11,LWK,AFR,female,SAMN29221522,SRS13498995,1


## Enhanced GENCODE

### What % of our population-specific discovered transcripts are expressed?

In [22]:
# Load the kallisto transcript abundance matrix (tab-separated; TPM according to the file name).
df = pd.read_csv(
    '../data/mage/enh_v47_kallisto/matrix.abundance.tpm.tsv',
    delimiter='\t',
)

In [23]:
# Index the abundance matrix by transcript ID. The frame is reassigned rather
# than mutated in place, and the step is skipped when the index is already
# set, so re-running this cell does not raise a KeyError.
if df.index.name != 'transcript_id':
    df = df.set_index('transcript_id')

In [24]:
# Keep only the rows (transcripts) that reach the minimum abundance in at
# least one column, printing the row count before and after the filter.
min_cpm = 1
print(df.shape[0])
expressed = df.ge(min_cpm).any(axis=1)
df = df[expressed]
print(df.shape[0])

429241
172599


In [25]:
# Load the master table and flag population-specific transcripts.
mt_df = pd.read_csv(
    '../data/05_mastertable/poder_master_table_fixed_genics.tsv',
    sep='\t',
)
# NOTE: the filter == 'pass' subset below is currently not applied.
# mt_df = mt_df.loc[mt_df['filter']=='pass']

# pop_spec_t: population_sharing == 1 and sample_sharing > 1
# (presumably "seen in a single population but in more than one sample" --
# confirm against the master-table column definitions).
one_population = mt_df['population_sharing'].eq(1)
multi_sample = mt_df['sample_sharing'].gt(1)
mt_df['pop_spec_t'] = one_population & multi_sample

In [26]:
# Attach the population-specificity flag from the master table to each
# expressed transcript (left join: transcripts absent from mt_df get NaN).
# Columns left over from an earlier run of this cell are dropped first so a
# re-run does not produce suffixed duplicates, and transcript_id is moved from
# the index into a regular column so that it is carried through the merge.
expr_df = df.drop(columns=['isoform', 'pop_spec_t'], errors='ignore')
if expr_df.index.name == 'transcript_id':
    expr_df = expr_df.reset_index()
df = expr_df.merge(mt_df[['isoform', 'pop_spec_t']],
                   how='left',
                   left_on='transcript_id',
                   right_on='isoform')

In [27]:
# Share of the population-specific transcripts flagged in mt_df that are still
# present in df, i.e. that passed the expression filter.
# NOTE: min_cpm is applied to a TPM matrix in this notebook
# (matrix.abundance.tpm.tsv), so the message reports the threshold as TPM.
n_tot = int(mt_df['pop_spec_t'].eq(True).sum())
# pop_spec_t is NaN in df for transcripts missing from mt_df; eq(True) counts those as False
n = int(df['pop_spec_t'].eq(True).sum())
print(f'{(n/n_tot)*100:.2f}% ({n}/{n_tot}) LR-RNA-seq pop-spec. discovered transcripts expressed >= {min_cpm} TPM in >= 1 sample')

28.41% (644/2267) LR-RNA-seq pop-spec. discovered transcripts expressed => 1 CPM in >= 1 sample


### What about just novel transcripts?

In [29]:
# Attach the 'annotated' column of the master table to each transcript via the
# isoform ID (left join). An 'annotated' column left by a previous run of this
# cell is dropped first, so re-running does not create annotated_x /
# annotated_y and break the df.annotated lookup that follows.
df = (
    df.drop(columns='annotated', errors='ignore')
      .merge(mt_df[['isoform', 'annotated']],
             how='left',
             on='isoform')
)

In [33]:
# Share of the transcripts marked 'discovered' in mt_df that are still present
# in df, i.e. that passed the expression filter.
# NOTE: min_cpm is applied to a TPM matrix in this notebook
# (matrix.abundance.tpm.tsv), so the message reports the threshold as TPM.
n_tot = int(mt_df['annotated'].eq('discovered').sum())
# 'annotated' is NaN in df for transcripts missing from mt_df; eq() counts those as False
n = int(df['annotated'].eq('discovered').sum())
print(f'{(n/n_tot)*100:.2f}% ({n}/{n_tot}) LR-RNA-seq novel discovered transcripts expressed >= {min_cpm} TPM in >= 1 sample')

24.87% (36222/145620) LR-RNA-seq novel discovered transcripts expressed => 1 CPM in >= 1 sample
