In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Project configuration; load_config is presumably provided by the star imports
# from the scripts package above -- confirm.
config = load_config()
# Relative prefix that is passed to proc_cfg as the `od` argument in later cells.
od = '../'

def proc_cfg(entry, od):
    """Re-root a config path under `od`.

    Every occurrence of the substring '../../' is removed from `entry`,
    and the result is returned with `od` prepended by plain string
    concatenation (no path separator is inserted).

    Parameters
    ----------
    entry : str
        Path string taken from the config.
    od : str
        Prefix to put in front of the cleaned path.

    Returns
    -------
    str
        `od` followed by `entry` with all '../../' substrings stripped.
    """
    return od + entry.replace('../../', '')

In [6]:
# Sample to analyse; the part before the first underscore is the key used
# to look the sample up in the metadata table.
sample = 'HG00621_1'
sample_1 = sample.partition('_')[0]
assemblies = ['afr', 'hg38', 't2t']

# Population of this sample, taken from the first matching metadata row.
meta = load_meta()
is_this_sample = meta['sample'] == sample_1
pop = meta.loc[is_this_sample, 'population'].values[0]

# Plot colour assigned to that population (second return value is unused).
c_dict, _ = get_population_colors()
color = c_dict[pop]

'#4cb33e'

## UpSet plot showing (binary) which reads mapped to which assembly, starting from the FASTQ reads

In [7]:
# Resolve the MAPQ table path for each assembly from the same config
# pattern, re-rooted with proc_cfg; evaluated in the order t2t, hg38, afr.
mapq_paths = {
    assembly: proc_cfg(
        expand(config['lr']['map']['bam_mapqs'],
               sample=sample,
               assembly=assembly)[0],
        od)
    for assembly in ('t2t', 'hg38', 'afr')
}
t2t_file = mapq_paths['t2t']
hg38_file = mapq_paths['hg38']
afr_file = mapq_paths['afr']

In [8]:
# Read each assembly's tab-separated MAPQ table, label its rows with the
# assembly name, and check that every read_id appears only once per table.
hg38_mapqs = pd.read_csv(hg38_file, sep='\t').assign(assembly='hg38')
assert hg38_mapqs['read_id'].is_unique

df2 = pd.read_csv(t2t_file, sep='\t').assign(assembly='t2t')
assert df2['read_id'].is_unique

df3 = pd.read_csv(afr_file, sep='\t').assign(assembly='afr')
assert df3['read_id'].is_unique

# Stack the three tables row-wise (long format: one row per read per assembly).
df = pd.concat([hg38_mapqs, df2, df3], axis=0)
df.head()

Unnamed: 0,read_id,mapq,assembly
0,656f1899-097a-4d0e-a956-eb0f9dd040e2:0,20,hg38
1,39c58a50-2e13-4bd7-9484-008563de3916:0,27,hg38
2,c6e18f10-2b77-4112-a2ee-7157c0a8c8a0:0,27,hg38
3,8c3d82da-fc2c-4667-b5d3-82098f206e7f:0,0,hg38
4,28327cc7-7153-4190-8087-560d3ffccbab:0,0,hg38


In [11]:
# Path of the read-id listing configured under lr/fastq_reads for this sample.
fastq_pattern = config['lr']['fastq_reads']
fastq_file = proc_cfg(
    expand(fastq_pattern, sample=sample, assembly='afr')[0],
    od,
)

# The file has no header row; its single column is named read_id here.
df2 = pd.read_csv(fastq_file, sep='\t', names=['read_id'], header=None)

## UpSet plot showing which reads mapped to which assembly (binary: a read counts as mapped when its MAPQ > 0)

In [7]:
# Build the three per-assembly MAPQ table paths from one config pattern;
# the generator yields them in the order t2t, hg38, afr.
t2t_file, hg38_file, afr_file = (
    proc_cfg(
        expand(config['lr']['map']['bam_mapqs'],
               sample=sample,
               assembly=assembly)[0],
        od)
    for assembly in ('t2t', 'hg38', 'afr')
)

In [None]:
# Read each assembly's MAPQ table and give its mapq column an
# assembly-specific name so the tables can be joined side by side.
# Each table must list every read_id at most once.
df = pd.read_csv(hg38_file, sep='\t').rename(columns={'mapq': 'mapq_hg38'})
assert df['read_id'].is_unique

df2 = pd.read_csv(t2t_file, sep='\t').rename(columns={'mapq': 'mapq_t2t'})
assert df2['read_id'].is_unique

df3 = pd.read_csv(afr_file, sep='\t').rename(columns={'mapq': 'mapq_afr'})
assert df3['read_id'].is_unique

In [None]:
# Outer-join the three tables on read_id so a read present in any table is
# kept; columns from tables that lack the read are left missing.
df = (
    df
    .merge(df2, how='outer', on='read_id')
    .merge(df3, how='outer', on='read_id')
)

In [None]:
# Keep an independent deep copy of the merged table before later cells modify `df`.
df_back = df.copy(deep=True)

In [None]:
# Convert to binary: missing values become 0, read_id becomes the index,
# and every remaining cell is True only when its value is strictly positive.
df = (df.fillna(0).set_index('read_id') > 0)

In [None]:
# Move read_id back into a regular column, then index the rows by the three
# boolean per-assembly flags.
df = df.reset_index().set_index(['mapq_hg38', 'mapq_t2t', 'mapq_afr'])

In [None]:
# Build the UpSet figure from the flag-indexed table: bars are row counts,
# counts are printed on the bars, and the population colour is used as fill.
upset = upsetplot.UpSet(
    df,
    subset_size='count',
    show_counts=True,
    facecolor=color,
)
ax_dict = upset.plot()

In [None]:
## bar plot showing which reads mapped best (mapq)

In [None]:
## bar plot showing which reads mapped best (qcov)