In [7]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *
from scripts.sm_utils import *

In [8]:
config = load_config()
od = '../'

def proc_cfg(entry, od):
    entry = entry.replace('../../', '')
    entry = od+entry
    return entry

In [10]:
meta_file = '../config.tsv'
meta_file_2 = '../snakemake/map_personal/config.tsv'
genomes_file = '../snakemake/map_personal/genomes_config.tsv'

df = parse_config(meta_file)
df2 = pd.read_csv(meta_file_2, sep='\t')
df2['tech_rep'] = df2.cell_line_id+'_1'

# TODO test
# df2 = df2.loc[df2.cell_line_id == 'GM24385']

# get the genomes to download
g_df = pd.read_csv(genomes_file, sep='\t')

# maternal haplotypes
g_df['aws_mat_link'] = g_df['hap2_aws_fasta']
g_df = g_df.loc[g_df['aws_mat_link'].notnull()]
assert len(g_df.loc[g_df['aws_mat_link'].str.contains('maternal')].index) == len(g_df.index)

# paternal haplotypes
g_df['aws_pat_link'] = g_df['hap1_aws_fasta']
g_df = g_df.loc[g_df['aws_pat_link'].notnull()]
assert len(g_df.loc[g_df['aws_pat_link'].str.contains('paternal')].index) == len(g_df.index)


genome_cols = ['same_population_sample', 'european_sample',	'afr_sample']
g_df = g_df.loc[(g_df['sample'].isin(df2[genome_cols[0]]))|
                (g_df['sample'].isin(df2[genome_cols[1]]))|
                (g_df['sample'].isin(df2[genome_cols[2]]))]

# a little more df2 formatting
df2 = df2[['tech_rep', 'same_population_sample',
           'european_sample', 'afr_sample']].melt(id_vars='tech_rep')
df2 = df2.reset_index()
df2 = df2.rename({'variable':'assembly_status',
                  'value': 'assembly_sample'},
                  axis=1)

# limit just to the samples where we'll do this
df = df.loc[df.tech_rep.isin(df2.tech_rep.tolist())]

# get a key for assembly status, assembly sample, and actual sample
df2['dataset_key'] = df2.assembly_status+'_'+\
                     df2.assembly_sample+'_'+\
                     df2.tech_rep


In [17]:
sample = 'HG00621_1' # params.sample
sample_1 = sample.split('_')[0]
assemblies = ['same_population_sample',
              'european_sample',
              'afr_sample']
meta = load_meta()
pop = meta.loc[meta['sample'] == sample_1, 'population'].values[0]
c_dict, _ = get_population_colors()
color = c_dict[pop]

# TODO 
df2 = df2.loc[df2.tech_rep==sample]

In [24]:
files = expand(expand(proc_cfg(config['lr']['map_personal']['bam_mapqs'],od),
       zip,
       assembly_status=df2.assembly_status.tolist(),
       assembly_sample=df2.assembly_sample.tolist(),
       allow_missing=True),
       sample=sample)
files

['../data/map_personal/same_population_sample/HG00673_HG00621_1_mapqs.txt',
 '../data/map_personal/european_sample/HG002_HG00621_1_mapqs.txt',
 '../data/map_personal/afr_sample/HG02717_HG00621_1_mapqs.txt']

In [23]:
# get upset plot table
i = 0
for f, a in zip(files, assemblies):
    temp = pd.read_csv(f, sep='\t')
    # temp.rename({'mapq':a}, axis=1, inplace=True)
    assert len(temp.index) == len(temp.read_id.unique())
    temp['assembly']=a
    if i == 0:
        df = temp.copy(deep=True)
    else:
        df = pd.concat([df, temp], axis=0)
    import pdb; pdb.set_trace()
    i += 1

# assert min mapq
df = df.loc[df.mapq>thresh]

# groupby read id and mapq to find reads that map equally as well
df = df.groupby(['read_id', 'mapq']).agg({
    'assembly': lambda x: ','.join(x)})

# sort by mapq and dedupe by keeping max
df.reset_index(inplace=True)
df = df.sort_values(by='mapq', ascending=False)
df = df.drop_duplicates(subset=['read_id'], keep='first')

# process out using the upset plot stuff
df['assembly'] = df.assembly.str.split(',')
df = df.explode('assembly')
df['val'] = True
df = df.pivot(index='read_id', columns='assembly', values='val')
df = df.fillna(False)
df = df.reset_index()
df.set_index(assemblies, inplace=True)

# make the upset plot
ax_dict = upsetplot.UpSet(df, subset_size='count', facecolor=color, sort_by='cardinality', show_counts=False, show_percentages=True).plot()
# ax_dict = upsetplot.UpSet(df, subset_size='count',
#                   facecolor=color,
#                   sort_by='cardinality',
#                   show_counts=False,
#                   show_percentages=True).plot()
plt.suptitle(f'% of best-mapping {sample} reads w/ mapq>{thresh}')
plt.savefig(output.upset, dpi=500)

> [0;32m<ipython-input-23-5e9d05b6f4dc>[0m(13)[0;36m<module>[0;34m()[0m
[0;32m     11 [0;31m        [0mdf[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mdf[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     12 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 13 [0;31m    [0mi[0m [0;34m+=[0m [0;36m1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m[0;34m[0m[0m
[0m[0;32m     15 [0;31m[0;31m# assert min mapq[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  df.head()


                                  read_id  mapq                assembly
0  2d3a1bfa-c047-4a1c-a16b-2e0d56635468:0    60  same_population_sample
1  a61d2725-6667-4bb1-845d-b8be74dd644b:0    36  same_population_sample
2  c226fd2f-9701-47bd-b155-c50a3f8ed2d9:0    60  same_population_sample
3  db8cd70c-b9b6-405a-9cfb-9a30467f12b7:0    60  same_population_sample
4  8ba6f531-eaa2-4132-b031-feadf47ae941:0    60  same_population_sample


ipdb>  c


> [0;32m<ipython-input-23-5e9d05b6f4dc>[0m(12)[0;36m<module>[0;34m()[0m
[0;32m     10 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     11 [0;31m        [0mdf[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mdf[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 12 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m    [0mi[0m [0;34m+=[0m [0;36m1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m[0;34m[0m[0m
[0m


ipdb>  df.head()


                                  read_id  mapq                assembly
0  2d3a1bfa-c047-4a1c-a16b-2e0d56635468:0    60  same_population_sample
1  a61d2725-6667-4bb1-845d-b8be74dd644b:0    36  same_population_sample
2  c226fd2f-9701-47bd-b155-c50a3f8ed2d9:0    60  same_population_sample
3  db8cd70c-b9b6-405a-9cfb-9a30467f12b7:0    60  same_population_sample
4  8ba6f531-eaa2-4132-b031-feadf47ae941:0    60  same_population_sample


ipdb>  df.assembly.unique()


array(['same_population_sample', 'european_sample'], dtype=object)


ipdb>  c


> [0;32m<ipython-input-23-5e9d05b6f4dc>[0m(13)[0;36m<module>[0;34m()[0m
[0;32m     11 [0;31m        [0mdf[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mdf[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     12 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 13 [0;31m    [0mi[0m [0;34m+=[0m [0;36m1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m[0;34m[0m[0m
[0m[0;32m     15 [0;31m[0;31m# assert min mapq[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


NameError: name 'thresh' is not defined