In [1]:
#!/usr/bin/env python3
import sys
srcdir = "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/script/"
sys.path.insert(0, srcdir)

In [2]:
import numpy as np
import pandas as pd
import vntrutils as vu
import utils
# import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# %load_ext autoreload
# %autoreload 2

In [3]:
from collections import defaultdict
import pickle
# import networkx as nx
# import pygraphviz
# import collections
import gc
import glob
# import warnings
import statsmodels.api as sm
# from statsmodels.stats.multitest import fdrcorrection as fdr
# from scipy import stats
import gzip
# from sklearn.decomposition import PCA
# from scipy.stats import chi2
# import re
# import json

In [4]:
matplotlib.rc('font', size=7)
matplotlib.rc('axes', titlesize=7)
matplotlib.rc('xtick', labelsize=5)
matplotlib.rc('ytick', labelsize=5)
%load_ext autoreload
%autoreload 2

- This notebook demonstrates how to convert kmer genotype matrix to motif genotype matrix
    - In the previous preprocessing step, I computed the correspondence from kmer to motif.
      So once I have a kmer matrix, where each row is a kmer and each col is a sample,
      I can simply do row operations to get the motif dosage.
    - It takes about 10 min to compute each batch of data in `compute_gt_cgt_batch`.
      You could speed it up by moving the code to a Python script and using SLURM's array job.
      Just optional. 
    - Watch out for memory usage and remmeber to allocate enough mem for your compute node,
      otherwise ipython kernel would die
- With motif genotype matrix, we can then perform LD pruning over motifs
    - Add another function to save in tsv.gz format instead of pikcle for portability
    - Use PLINK or write your own code given certain LD threshold
- Data structure in this notebook that can be useful for LD pruning
    - `ccki_tr`: cumulative number of canonical compressed kmer (cck) per TR locus
        - cck is equivalent to motif in our definition
        - vector of size NTR (Number of TR loci)
        - retrieve # of cck in each locus `i` by
            - `ccki_tr[i] -  ccki_tr[i-1]` if i > 0
            - `ccki_tr[i]` if i == 0
        - we will need this to perform LD pruning for each locus, or a block in the matrix

In [5]:
def test():
    with open("/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ki_tr.ccki_tr.pickle", 'rb') as f:
        ki_tr, ccki_tr = pickle.load(f)
    return ki_tr, ccki_tr
ki_tr, ccki_tr = test()

In [6]:
def test():
    fn = "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ks.ccks.tr_cck_ns.ki_map.pickle"
    with open(fn, 'rb') as f:
        ks, ccks, tr_cck_ns, ki_map = pickle.load(f)
    tr_cck_ns = np.array(tr_cck_ns)
    return ks, ccks, tr_cck_ns, ki_map
ks, ccks, tr_cck_ns, ki_map = test()

In [7]:
NK = len(ks)
NCCK = len(ccks)
NB = 20

# IL dosage

In [15]:
def load_single_gt(fn, gt, i1):
    with open(fn) as f:
        i0 = 0
        for line in f:
            gt[i0,i1] = int(line)
            i0 += 1
    
def compute_gt_cgt_batch():
    kmerfs = sorted(glob.glob("/scratch1/tsungyul/n30488.hprc.full/1kg/genotype/*.tr.kmers"))
    ng = len(kmerfs)
    print(ng, NK, NCCK)
    
    BS = ng // NB
    for i in range(NB):
        BS_ = BS if i != NB-1 else ng - BS*i
        print(f"batch {i}: loading gt... ", end="")
        #gt = np.zeros([BS_, NK], dtype=np.int32)
        gt = np.zeros([NK, BS_], dtype=np.int32)
        for j in range(BS_):
            load_single_gt(kmerfs[i*BS+j], gt, j)
        print(f"computing cgt... ", end="")
        #cgt = np.zeros([BS_,NCCK], dtype=np.float32)
        cgt = np.zeros([NCCK, BS_], dtype=np.float32)
        for i0, i1 in ki_map.items():
            cgt[i1] += gt[i0]
        print(pd.DataFrame(cgt))
        cgt /= tr_cck_ns[:,None]
        print("dumping gt... ", end="")
        with open(f"/scratch1/tsungyul/n30488.hprc.full/1kg/gt_pickle/gt.{i}.pickle", 'wb') as f:
            pickle.dump(gt, f, protocol=pickle.HIGHEST_PROTOCOL)
        print("dumping cgt...")
        with open(f"/scratch1/tsungyul/n30488.hprc.full/1kg/gt_pickle/cgt.{i}.pickle", 'wb') as f:
            pickle.dump(cgt, f, protocol=pickle.HIGHEST_PROTOCOL)

!mkdir -p /scratch1/tsungyul/n30488.hprc.full/1kg/gt_pickle
compute_gt_cgt_batch()

computing cgt...      0        1        2        3        4        5        6        7        \
0     8334.0   3681.0   1403.0   4778.0   5492.0      0.0   1680.0      0.0   
1    13370.0   5627.0   2169.0   7074.0   9045.0      1.0   2441.0      0.0   
2     7517.0   3232.0   1225.0   3364.0   5234.0      0.0   1817.0      0.0   
3    10541.0   4556.0   1877.0   4533.0   7136.0      0.0   1563.0      0.0   
4     9098.0   3829.0   1529.0   3504.0   6226.0      0.0   1569.0      0.0   
..       ...      ...      ...      ...      ...      ...      ...      ...   
155   9071.0   3491.0   1478.0   5382.0   4974.0      0.0   2175.0      0.0   
156   8655.0   3471.0   1340.0   5517.0   5229.0      0.0   2023.0      0.0   
157   9162.0   3500.0   1309.0   5505.0   5641.0      0.0   2200.0      0.0   
158   8026.0   3883.0   1300.0   3261.0   6441.0      0.0   2092.0      4.0   
159   7590.0   2896.0   1154.0   4011.0   4564.0      0.0   1498.0      0.0   

     8        9        ...  157256

KeyboardInterrupt: 

In [None]:
def test():
    # correct coverage, sex chrom dosage unadjusted
    genomes = np.loadtxt("../input/genomes.1kg_plus_related.gt_HPRC.txt", dtype=object)
    ng = genomes.size
    g2i = dict([[g,i] for i, g in enumerate(genomes)])
    cov = np.array([float(c) for g, c in np.loadtxt("../input/HPRC.chr1.cov.tsv", dtype=object) if g in genomes])

    print("Loading cgt", end="")
    BS = ng//NB
    cgt = np.zeros([NCCK,ng], dtype=np.float32)
    for i in range(NB):
        print(f".",end="")
        BS_ = BS if i != NB-1 else ng - BS*i
        with open(f"/scratch1/tsungyul/n30488.hprc.full/1kg/gt_pickle/cgt.{i}.pickle", 'rb') as f:
            si = i*BS
            ei = i*BS + BS_
            cgt[:,si:ei] = pickle.load(f)
    print()
    print("Computing acgt")
    acgt = np.zeros_like(cgt, dtype=np.float32)
    acgt = cgt / cov
    print("Dumping acgt")
    with open("/scratch1/tsungyul/n30488.hprc.full/1kg/gt_pickle/acgt.pickle", 'wb') as f:
        pickle.dump(acgt, f, protocol=pickle.HIGHEST_PROTOCOL)
    return acgt

ilcgt = test()

In [None]:
ilcgt =  pickle.load(open("/scratch1/tsungyul/n30488.hprc.full/1kg/gt_pickle/acgt.pickle", 'rb'))

In [None]:
ilcgt.shape