- This notebook demonstrates how to convert a kmer genotype matrix to a motif genotype matrix
    - In the previous preprocessing step, I computed the correspondence from kmer to motif.
      So once I have a kmer matrix, where each row is a kmer and each col is a sample,
      I can simply do row operations to get the motif dosage.
    - It takes about 10 min to compute each batch of data in `compute_gt_cgt_batch`.
      You could speed it up by moving the code to a Python script and using SLURM's array job.
      Just optional. 
    - Watch out for memory usage and remember to allocate enough memory for your compute node,
      otherwise the IPython kernel will die
- With motif genotype matrix, we can then perform LD pruning over motifs
    - Add another function to save in tsv.gz format instead of pickle for portability
    - Use PLINK or write your own code given certain LD threshold
- Data structure in this notebook that can be useful for LD pruning
    - `ccki_tr`: cumulative number of canonical compressed kmer (cck) per TR locus
        - cck is equivalent to motif in our definition
        - vector of size NTR (Number of TR loci)
        - retrieve # of cck in each locus `i` by
            - `ccki_tr[i] -  ccki_tr[i-1]` if i > 0
            - `ccki_tr[i]` if i == 0
        - we will need this to perform LD pruning for each locus, or a block in the matrix

In [None]:
#!/usr/bin/env python3
import sys
# Put the danbing-tk script directory at the front of the module search path.
# Presumably this is where the project-local modules imported by this notebook
# live -- confirm against the cluster layout.
srcdir = "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/script/"
sys.path.insert(0, srcdir)

In [2]:
import numpy as np
import pandas as pd
import vntrutils as vu
import utils
# import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict
import pickle
# import networkx as nx
# import pygraphviz
# import collections
import gc
import glob
# import warnings
import statsmodels.api as sm
# from statsmodels.stats.multitest import fdrcorrection as fdr
# from scipy import stats
import gzip
# from sklearn.decomposition import PCA
# from scipy.stats import chi2
# import re
# import json
import seaborn as sns
# %load_ext autoreload
# %autoreload 2

# Global matplotlib defaults: small fonts for text, axes titles and tick labels.
matplotlib.rc('font', size=7)
matplotlib.rc('axes', titlesize=7)
matplotlib.rc('xtick', labelsize=5)
matplotlib.rc('ytick', labelsize=5)
%load_ext autoreload
%autoreload 2

In [6]:
def get_1(file_path):
    """Load a pickled two-item payload and return it as a tuple.

    Parameters
    ----------
    file_path : str
        Path to a pickle file whose top-level object unpacks into exactly
        two items (named ``ki_tr`` and ``ccki_tr`` by the caller).

    Returns
    -------
    tuple
        ``(ki_tr, ccki_tr)`` exactly as stored in the pickle.

    Raises
    ------
    ValueError
        If the pickled object does not unpack into exactly two items.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    # Two-name unpacking keeps the "exactly two items" check.
    kmer_index, cck_index = payload
    return kmer_index, cck_index

def get_2(file_path):
    """Load a pickled four-item payload, converting the third item to an array.

    Parameters
    ----------
    file_path : str
        Path to a pickle file whose top-level object unpacks into exactly
        four items (named ``ks``, ``ccks``, ``tr_cck_ns`` and ``ki_map`` by
        the caller).

    Returns
    -------
    tuple
        ``(ks, ccks, tr_cck_ns, ki_map)`` where ``tr_cck_ns`` has been passed
        through ``np.array``; the other three items are returned unchanged.

    Raises
    ------
    ValueError
        If the pickled object does not unpack into exactly four items.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    kmers, motifs, per_tr_counts, kmer_map = payload
    return kmers, motifs, np.array(per_tr_counts), kmer_map

def gather(gt_HPRC, HPRC_chr1_cov, NCCK, NB, out_dir):
    """Assemble the per-batch count matrices and divide them by coverage.

    Reads ``NB`` batch pickles ``{out_dir}/cgt.{i}.pickle`` (i = 0..NB-1),
    places them side by side into one ``(NCCK, ng)`` float32 matrix, divides
    every column by that genome's coverage, writes the result to
    ``{out_dir}/acgt.pickle`` and returns it.

    Parameters
    ----------
    gt_HPRC : str
        Text file with the genome names, read with ``np.loadtxt``. The columns
        of the batch matrices are assumed to follow this order -- verify
        against the code that produced the batches.
    HPRC_chr1_cov : str
        Two-column text file ``genome  coverage`` read with ``np.loadtxt``.
        It may list extra genomes and may be in any order.
    NCCK : int
        Number of rows of the assembled matrix.
    NB : int
        Number of batch pickles. Every batch holds ``ng // NB`` columns,
        except the last one which holds all remaining columns.
    out_dir : str
        Directory containing the batch pickles; ``acgt.pickle`` is written here.

    Returns
    -------
    numpy.ndarray
        The coverage-adjusted matrix of shape ``(NCCK, ng)``. Its dtype is
        float64 because the float32 counts are divided by float64 coverages.

    Raises
    ------
    ValueError
        If a genome listed in ``gt_HPRC`` has no entry in ``HPRC_chr1_cov``.
    """
    # correct coverage, sex chrom dosage unadjusted
    # ndmin keeps the array shapes stable when a file has a single line.
    genomes = np.loadtxt(gt_HPRC, dtype=object, ndmin=1)
    ng = genomes.size

    # Look coverage up by genome name so that cov[j] always belongs to
    # genomes[j], whatever the row order of the coverage file is.
    cov_by_genome = {g: float(c) for g, c in np.loadtxt(HPRC_chr1_cov, dtype=object, ndmin=2)}
    missing = [g for g in genomes if g not in cov_by_genome]
    if missing:
        raise ValueError(f"no coverage found in {HPRC_chr1_cov} for genomes: {missing}")
    cov = np.array([cov_by_genome[g] for g in genomes])

    print("Loading cgt", end="")
    sys.stdout.flush()
    BS = ng//NB
    cgt = np.zeros([NCCK,ng], dtype=np.float32)
    for i in range(NB):
        si = i*BS
        # The last batch absorbs the remainder of the integer division.
        ei = si + BS if i != NB-1 else ng
        # NOTE: pickle can execute arbitrary code on load; only use trusted files.
        with open(f"{out_dir}/cgt.{i}.pickle", 'rb') as f:
            cgt[:,si:ei] = pickle.load(f)

    print("Computing acgt")
    sys.stdout.flush()
    # No pre-allocation: the division creates the result array itself, so a
    # zeros_like() buffer would only waste a matrix worth of memory.
    acgt = cgt / cov

    print("Dumping acgt")
    sys.stdout.flush()
    # f-string is required here, otherwise the file goes to a literal "{out_dir}" folder.
    with open(f"{out_dir}/acgt.pickle", 'wb') as f:
        pickle.dump(acgt, f, protocol=pickle.HIGHEST_PROTOCOL)
    return acgt

In [7]:

# Input pickles consumed by get_1 / get_2 below.
get_1_file="/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ki_tr.ccki_tr.pickle"
get_2_file="/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ks.ccks.tr_cck_ns.ki_map.pickle"
# Genome list and per-genome coverage table passed to gather().
gt_HPRC="/project/mchaisso_100/cmb-17/vntr_genotyping/aydin/LD_prune/input/genomes.1kg_plus_related.gt_HPRC.txt"
HPRC_chr1_cov="/project/mchaisso_100/cmb-17/vntr_genotyping/aydin/LD_prune/input/HPRC.chr1.cov.tsv"
# Directory passed to gather() as out_dir: it reads cgt.{i}.pickle from here and writes acgt.pickle.
out="/scratch1/tsungyul/aydin/k2m_output"

ki_tr, ccki_tr = get_1(get_1_file)
ks, ccks, tr_cck_ns, ki_map = get_2(get_2_file)

# Sizes of ks and ccks (presumably the kmer and motif counts -- confirm);
# NCCK is the row count handed to gather().
NK = len(ks)
NCCK = len(ccks)
# Number of cgt.{i}.pickle batch files that gather() expects in `out`.
NB = 40

# IL dosage

In [None]:
# Assemble the NB batch pickles into one matrix and divide it by coverage;
# gather() also pickles the result before returning it.
ilcgt = gather(gt_HPRC, HPRC_chr1_cov, NCCK, NB, out)

In [None]:
# Reload the coverage-adjusted matrix from the output directory `out`.
# The path must be an f-string, otherwise a literal "{out}" folder is looked up,
# and the `with` block makes sure the file handle is closed after loading.
with open(f"{out}/acgt.pickle", 'rb') as f:
    ilcgt = pickle.load(f)

In [None]:
# Sanity check: gather() builds this matrix as (NCCK, number of genomes).
ilcgt.shape