In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Make the project-local `scripts` package importable: `p` is the parent of the
# current working directory (presumably the repository root when the notebook is
# launched from its own sub-folder — confirm), and it is appended to sys.path.
# NOTE: this depends on the kernel's working directory at launch time.
p = os.path.dirname(os.getcwd())
sys.path.append(p)

# Project helpers. These must come after the sys.path tweak above.
# NOTE(review): star imports hide which names each module contributes and can
# silently shadow one another; consider importing the needed names explicitly.
from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# metadata
# Build the per-run metadata table: sampleId, indId and batch are parsed from the
# run report's experiment_alias; pop / continent / sex are joined in from the
# parsed sample table; group is a constant.
meta_file = '../snakemake/mage/filereport_read_run_PRJNA851328_tsv.txt'
meta = pd.read_csv(meta_file, sep='\t')

# experiment_alias values look like '<sample>_<batch>_<rep>'
# (e.g. 'NA19704_batch11_rep1'). Split once and reuse the pieces rather than
# re-running the split for every derived column.
alias_parts = meta['experiment_alias'].str.split('_', expand=True)
meta['sampleId'] = alias_parts[0]
meta['indId'] = alias_parts[0]  # individual id is the same token as the sample id
meta['batch'] = alias_parts[1]

# Per-sample annotations. The file is read as space-delimited with no header row,
# so the column names are assigned by hand.
# NOTE(review): 'sth' / 'sth2' are placeholder names; the displayed values look
# like SAMN* / SRS* accessions — confirm and rename.
meta_file = '../snakemake/mage/sample_metadata_parsed.tsv'
df2 = pd.read_csv(meta_file, sep=' ', header=None)
df2.columns = ['sampleId', 'pop', 'continent', 'sex', 'sth', 'sth2']

# Left join: every run is kept even if it has no matching sample annotation.
meta = meta.merge(df2, how='left', on='sampleId')
meta['group'] = 1 # all the same group because we don't want to call sqtls separately for anything

# Only the last expression of a cell is rendered. The column-subset `.head()`
# that used to precede this line was evaluated and discarded, so it was removed.
meta.head()

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,instrument_model,experiment_alias,fastq_bytes,...,bam_bytes,sampleId,indId,batch,pop,continent,sex,sth,sth2,group
0,SRR19762167,SAMN29221552,SRX15806983,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19704_batch11_rep1,1579585821;1583976496,...,,NA19704,NA19704,batch11,ASW,AFR,female,SAMN29221552,SRS13499000,1
1,SRR19762172,SAMN29221523,SRX15806979,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19332_batch14_rep1,1969985537;1972458971,...,,NA19332,NA19332,batch14,LWK,AFR,female,SAMN29221523,SRS13498996,1
2,SRR19762173,SAMN29221521,SRX15806977,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19317_batch15_rep1,1387073734;1388216521,...,,NA19317,NA19317,batch15,LWK,AFR,male,SAMN29221521,SRS13498994,1
3,SRR19762174,SAMN29221520,SRX15806976,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19312_batch05_rep1,2814206811;2808461053,...,,NA19312,NA19312,batch05,LWK,AFR,male,SAMN29221520,SRS13498993,1
4,SRR19762175,SAMN29221522,SRX15806978,PRJNA851328,9606,Homo sapiens,ILLUMINA,Illumina NovaSeq 6000,NA19323_batch11_rep1,1589064552;1592540862,...,,NA19323,NA19323,batch11,LWK,AFR,female,SAMN29221522,SRS13498995,1


In [4]:
# regular gencode: abundance matrix from the v47_kallisto directory
df = pd.read_csv(
    '../data/mage/v47_kallisto/matrix.abundance.tsv',
    sep='\t',
)

# preview the first rows (transcript_id followed by one column per sample run)
df.head()

Unnamed: 0,transcript_id,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENST00000000233.10,2187.66,2483.13,1081.26,4053.85,2051.01,1898.63,1037.97,1923.43,945.648,...,2843.35,2398.14,1717.92,1313.38,1464.37,760.503,1878.09,2689.8,2031.42,3365.31
1,ENST00000000412.8,4203.78,5302.25,3306.58,6273.98,3338.97,3420.26,3246.81,3352.52,3296.86,...,4518.26,6763.18,3087.19,1953.76,3633.84,2991.08,3368.53,4747.02,3129.72,5863.28
2,ENST00000000442.11,335.718,603.101,307.345,729.524,310.898,441.57,396.765,378.063,170.085,...,788.457,770.24,395.389,311.986,427.172,72.0386,421.711,500.598,411.257,690.474
3,ENST00000001008.6,2064.24,1585.8,1706.69,3544.72,2224.32,3766.0,2172.18,1608.99,1572.02,...,2378.49,4240.4,2167.06,1171.16,2734.65,1239.95,1806.9,3532.98,2550.03,3521.23
4,ENST00000001146.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.50477,0.0,0.0,0.0,0.0,0.0,6.0
