In [9]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Directory two levels above the current working directory.
p = os.path.dirname(os.path.dirname(os.getcwd()))
# Put it on the import path; presumably this is what lets the `scripts`
# package imported just below resolve -- confirm against the repo layout.
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *
from scripts.sm_utils import *

In [10]:
# Project configuration; later cells index it like a nested dict.
# load_config is not defined in this notebook -- presumably it comes from one
# of the `scripts.*` star imports.
config = load_config()
# Path prefix; presumably intended as the `od` argument of proc_cfg below.
od = '../../'

def proc_cfg(entry, od):
    """
    Re-root a config path under a different output directory.

    Every occurrence of the literal '../../' is removed from `entry`
    and `od` is prepended to what remains.

    Parameters:
        entry (str): Path string from the config
        od (str): Prefix to prepend

    Returns:
        (str): `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [15]:

# Input tables. meta_file goes through parse_config; the other three are
# tab-separated files read directly with pandas.
meta_file = '../config.tsv'
meta_file_2 = 'config.tsv'
genomes_file = 'genomes_config.tsv'
gtfs_file = 'gtfs_config.tsv'

df = parse_config(meta_file)
df2 = pd.read_csv(meta_file_2, sep='\t')
# tech_rep is the cell line id with a fixed '_1' suffix
df2['tech_rep'] = df2.cell_line_id+'_1'
# expose pangenome_code under the name used in genome_cols below
df2['same_sample'] = df2['pangenome_code']

# TODO test
# df2 = df2.loc[df2.cell_line_id == 'GM24385']

# get the genomes to download
g_df = pd.read_csv(genomes_file, sep='\t')

# maternal haplotypes
g_df['aws_mat_link'] = g_df['hap2_aws_fasta']
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise SettingWithCopyWarning when rows were dropped
g_df = g_df.loc[g_df['aws_mat_link'].notnull()].copy()
assert g_df['aws_mat_link'].str.contains('maternal', na=False).all(), \
    "every hap2_aws_fasta link is expected to contain 'maternal'"

# paternal haplotypes
g_df['aws_pat_link'] = g_df['hap1_aws_fasta']
g_df = g_df.loc[g_df['aws_pat_link'].notnull()]
assert g_df['aws_pat_link'].str.contains('paternal', na=False).all(), \
    "every hap1_aws_fasta link is expected to contain 'paternal'"

genome_cols = ['same_population_sample', 'european_sample', 'afr_sample', 'same_sample']
assemblies = genome_cols
# g_df = g_df.loc[(g_df['sample'].isin(df2[genome_cols[0]]))|
#                 (g_df['sample'].isin(df2[genome_cols[1]]))|
#                 (g_df['sample'].isin(df2[genome_cols[2]]))|
#                 (g_df['sample'].isin(df2[genome_cols[3]]))]

# a little more df2 formatting:
# wide -> long, one row per (tech_rep, assembly column)
df2 = df2[['tech_rep']+assemblies].melt(id_vars='tech_rep')
# NOTE: this keeps the melted row number as an 'index' column
df2 = df2.reset_index()
df2 = df2.rename({'variable':'assembly_status',
                  'value': 'assembly_sample'},
                  axis=1)

# # limit just to the samples where we'll do this
# df = df.loc[df.tech_rep.isin(df2.tech_rep.tolist())]

# get a key for assembly status, assembly sample, and actual sample
df2['dataset_key'] = (df2.assembly_status + '_' +
                      df2.assembly_sample + '_' +
                      df2.tech_rep)

# long format: one row per (sample, haplotype) with its download link
g_df = g_df[['sample', 'aws_mat_link', 'aws_pat_link']]
g_df = pd.melt(g_df, id_vars=['sample'],
               value_vars=['aws_mat_link', 'aws_pat_link'],
               var_name='haplotype', value_name='link')

# Map 'haplotype' column to 'maternal' or 'paternal' based on the column name
g_df['haplotype'] = g_df['haplotype'].map({
    'aws_mat_link': 'maternal',
    'aws_pat_link': 'paternal'
})
g_df.head()

Unnamed: 0,sample,haplotype,link
0,HG00438,maternal,s3://human-pangenomics/working/HPRC/HG00438/as...
1,HG00621,maternal,s3://human-pangenomics/working/HPRC/HG00621/as...
2,HG00673,maternal,s3://human-pangenomics/working/HPRC/HG00673/as...
3,HG00735,maternal,s3://human-pangenomics/working/HPRC/HG00735/as...
4,HG00741,maternal,s3://human-pangenomics/working/HPRC/HG00741/as...


In [17]:
# add the gtf info
gtf_df = pd.read_csv(gtfs_file, sep='\t')
gtf_df.head()

Unnamed: 0,path,annot_ver,assembly_sample,assembly_haplotype
0,/gpfs/projects/bsc83/Projects/pantranscriptome...,PODER,HG002,maternal
1,/gpfs/projects/bsc83/Projects/pantranscriptome...,PODER,HG02717,maternal


In [19]:
# Fill the merged TPM matrix path template from the config for each gtf_df row.
# Passing `zip` pairs the two wildcard lists element-wise (row by row), so the
# result has one path per row of gtf_df.
expand(config['lr']['personal_kallisto']['quant']['merge_matrix_tpm_tsv'],
               zip,
               assembly_sample=gtf_df['assembly_sample'].tolist(),
               assembly_haplotype=gtf_df['assembly_haplotype'].tolist())

['../../data/personal_kallisto_quant/HG002_maternal/matrix.abundance.tpm.tsv',
 '../../data/personal_kallisto_quant/HG02717_maternal/matrix.abundance.tpm.tsv']

In [20]:

def get_df_val(df, col1, col_dict, uniq_val=True):
    """
    Look up the value(s) of one column in the rows matching a set of
    column == value conditions.

    Parameters:
        df (pandas DataFrame): Table to search; it is not modified
        col1 (str): Column whose value(s) are returned
        col_dict (dict): {column name: required value}; a row must
            satisfy every entry to be kept
        uniq_val (bool): Needs to return a unique value rather
            than a list

    Returns:
        If uniq_val, the single distinct value of `col1` among the
        matching rows; otherwise a list with the `col1` value of every
        matching row (possibly empty, duplicates kept).

    Raises:
        AssertionError: If uniq_val is set and the matching rows do not
            hold exactly one distinct value in `col1`
    """
    # Each filtering step yields a new frame, so the caller's frame is
    # never altered and no up-front deep copy is needed.
    temp = df
    for key, item in col_dict.items():
        temp = temp.loc[temp[key] == item]

    if uniq_val:
        val = temp[col1].unique()
        assert len(val) == 1, (
            f"expected exactly one unique value in '{col1}' for "
            f"{col_dict}, found {len(val)}"
        )
        return val[0]
    else:
        return temp[col1].tolist()

In [22]:
# Example lookup: the single GTF path recorded in gtf_df for the
# HG002 maternal assembly (uniq_val defaults to True, so exactly one
# distinct path must match).
assembly_sample = 'HG002'
assembly_haplotype = 'maternal'
get_df_val(gtf_df, 'path',
                                    {'assembly_sample': assembly_sample,
                                     'assembly_haplotype': assembly_haplotype})

'/gpfs/projects/bsc83/Projects/pantranscriptome/novelannotations/liftoff/PODER/HG002/poder.hg38_to_HG002.lifted.gtf'