In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Add the directory two levels above the current working directory to the
# module search path so the `scripts.*` imports below can be resolved.
# NOTE(review): this depends on the kernel's working directory — presumably the
# notebook is always launched from its own folder; confirm.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *
from scripts.sm_utils import *

In [4]:
# Load the project configuration; `load_config` is not defined in this notebook,
# so it presumably comes from one of the `scripts.*` star imports — confirm.
config = load_config()
# Path prefix; presumably meant to be passed as `od` to proc_cfg — confirm.
od = '../../'

def proc_cfg(entry, od):
    """
    Re-root a path string taken from the config.

    Parameters:
        entry (str): Path string; every occurrence of '../../' in it is removed
        od (str): Prefix placed in front of the stripped path

    Returns:
        (str): `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [12]:
# Input tables for this analysis.
meta_file = '../config.tsv'
meta_file_2 = 'config.tsv'
genomes_file = 'genomes_config.tsv'

df = parse_config(meta_file)
df2 = pd.read_csv(meta_file_2, sep='\t')
# tech_rep id is the cell line id with a '_1' suffix
df2['tech_rep'] = df2.cell_line_id+'_1'

# TODO test
# df2 = df2.loc[df2.cell_line_id == 'GM24385']

# get the genomes to download
g_df = pd.read_csv(genomes_file, sep='\t')

# maternal haplotypes
g_df['aws_mat_link'] = g_df['hap2_aws_fasta']
# .copy() so that adding the paternal column below writes to an independent
# frame instead of a filtered slice (avoids pandas' SettingWithCopyWarning)
g_df = g_df.loc[g_df['aws_mat_link'].notnull()].copy()
assert len(g_df.loc[g_df['aws_mat_link'].str.contains('maternal')].index) == len(g_df.index), \
    'Some hap2_aws_fasta links do not contain "maternal"'

# paternal haplotypes
g_df['aws_pat_link'] = g_df['hap1_aws_fasta']
g_df = g_df.loc[g_df['aws_pat_link'].notnull()]
assert len(g_df.loc[g_df['aws_pat_link'].str.contains('paternal')].index) == len(g_df.index), \
    'Some hap1_aws_fasta links do not contain "paternal"'

# columns of df2 that name the assembly sample to use for each cell line
genome_cols = ['same_population_sample', 'european_sample', 'afr_sample']
assemblies = genome_cols

# keep only genomes that are referenced by at least one of those columns
in_assembly_col = pd.Series(False, index=g_df.index)
for col in genome_cols:
    in_assembly_col |= g_df['sample'].isin(df2[col])
g_df = g_df.loc[in_assembly_col]

# a little more df2 formatting: long format, one row per (tech_rep, assembly column)
df2 = df2[['tech_rep'] + genome_cols].melt(id_vars='tech_rep')
# reset_index() adds an 'index' column holding the melted row number
df2 = df2.reset_index()
df2 = df2.rename({'variable':'assembly_status',
                  'value': 'assembly_sample'},
                  axis=1)

# limit just to the samples where we'll do this
df = df.loc[df.tech_rep.isin(df2.tech_rep.tolist())]

# get a key for assembly status, assembly sample, and actual sample
df2['dataset_key'] = df2.assembly_status+'_'+\
                     df2.assembly_sample+'_'+\
                     df2.tech_rep



In [9]:
# Preview the first rows of df2.
df2.head()

Unnamed: 0,index,tech_rep,assembly_status,assembly_sample,dataset_key
0,0,GM24385_1,same_population_sample,HG002,same_population_sample_HG002_GM24385_1
1,1,GM18906_1,same_population_sample,NA19240,same_population_sample_NA19240_GM18906_1
2,2,GM19240_1,same_population_sample,NA18906,same_population_sample_NA18906_GM19240_1
3,3,HG00621_1,same_population_sample,HG00673,same_population_sample_HG00673_HG00621_1
4,4,HG01928_1,same_population_sample,HG01952,same_population_sample_HG01952_HG01928_1


In [7]:
# Fill the 'max_mapq_tsv_summary' path template from the config with
# thresh values 0 and 10.
expand(config['lr']['map_personal']['max_mapq_tsv_summary'],
               thresh=[0, 10])

['../../data/map_personal/max_mapq_0_bool_summary.tsv',
 '../../data/map_personal/max_mapq_10_bool_summary.tsv']

In [10]:
# Two-stage expansion of the 'mapq_tsv' path template:
#   inner expand: zips the per-row df2 values together (one path per df2 row);
#                 allow_missing=True lets it run without a value for every
#                 wildcard in the template, so {thresh} is presumably left
#                 unfilled at this stage
#   outer expand: fills in thresh=0
expand(expand(config['lr']['map_personal']['mapq_tsv'],
                               zip,
                               assembly_status=df2.assembly_status.tolist(),
                               assembly_sample=df2.assembly_sample.tolist(),
                               sample=df2.tech_rep.tolist(),
                               allow_missing=True),
                               thresh=0)

['../../data/map_personal/GM24385_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM18906_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM19240_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG00621_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG01928_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG01952_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM24385_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM18906_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM19240_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG00621_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG01928_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG01952_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM24385_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM18906_1/mapq_0_bool.tsv',
 '../../data/map_personal/GM19240_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG00621_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG01928_1/mapq_0_bool.tsv',
 '../../data/map_personal/HG01952_1/mapq_0_bool.tsv']

['../../data/map_personal/same_population_sample/HG002_GM24385_1_mapqs.txt',
 '../../data/map_personal/european_sample/HG002_GM24385_1_mapqs.txt',
 '../../data/map_personal/afr_sample/HG02717_GM24385_1_mapqs.txt']

In [25]:
def get_df_val(df, col1, col_dict, uniq_val=True):
    """
    Look up the values of one column in the rows that match a set of
    equality filters.

    Parameters:
        df (pandas DataFrame): Table to search; it is not modified
        col1 (str): Column to return values from
        col_dict (dict): {column name: required value}; a row is kept only
            if it matches every entry. An empty dict keeps all rows
        uniq_val (bool): If True, require exactly one distinct value among
            the matching rows and return it. If False, return every matching
            value (duplicates kept, in row order)

    Returns:
        The single distinct value if `uniq_val`, otherwise a list of values

    Raises:
        AssertionError: If `uniq_val` is True and the matching rows hold
            zero or more than one distinct value of `col1`
    """
    # boolean .loc filtering yields new objects, so no defensive copy of the
    # caller's DataFrame is needed
    temp = df

    for key, item in col_dict.items():
        temp = temp.loc[temp[key] == item]

    if uniq_val:
        val = temp[col1].unique()
        assert len(val) == 1, \
            f'Expected exactly 1 unique value of {col1!r} for {col_dict}, found {len(val)}'
        return val[0]

    return temp[col1].tolist()

In [29]:
# `sample` has to be assigned in this cell: it was otherwise only set in a
# later cell, so this cell raised NameError on a fresh kernel (Restart & Run All).
sample = 'GM24385_1'

# Scratch comparison of the direct .loc lookup with get_df_val(..., uniq_val=False),
# which returns one entry per df2 row for this tech_rep.
# Only the last expression is displayed as the cell output.
df2.loc[df2.tech_rep==sample].assembly_status.tolist()
get_df_val(df2, 'assembly_status', {'tech_rep':sample}, uniq_val=False)
get_df_val(df2, 'assembly_sample', {'tech_rep':sample}, uniq_val=False)
get_df_val(df2, 'tech_rep', {'tech_rep':sample}, uniq_val=False)

['GM24385_1', 'GM24385_1', 'GM24385_1']

In [None]:
# Fill the 'bam_mapqs' path template for a single sample: the three wildcard
# lists are zipped, giving one path per df2 row that belongs to `sample`.
sample = 'GM24385_1'
expand(config['lr']['map_personal']['bam_mapqs'],
       zip,
       # wildcard name -> df2 column that supplies its values
       **{wildcard: get_df_val(df2, column, {'tech_rep': sample}, uniq_val=False)
          for wildcard, column in (('assembly_status', 'assembly_status'),
                                   ('assembly_sample', 'assembly_sample'),
                                   ('sample', 'tech_rep'))})