- This notebook demonstrates how to convert a kmer genotype matrix into a motif genotype matrix
    - In the previous preprocessing step, I computed the correspondence from kmer to motif.
      So once I have a kmer matrix, where each row is a kmer and each col is a sample,
      I can simply do row operations to get the motif dosage.
    - It takes about 10 min to compute each batch of data in `compute_gt_cgt_batch`.
      You could speed it up by moving the code to a Python script and using SLURM's array job.
      Just optional. 
    - Watch out for memory usage and remember to allocate enough memory for your compute node,
      otherwise the IPython kernel will die
- With motif genotype matrix, we can then perform LD pruning over motifs
    - Add another function to save in tsv.gz format instead of pickle for portability
    - Use PLINK or write your own code given certain LD threshold
- Data structure in this notebook that can be useful for LD pruning
    - `ccki_tr`: cumulative number of canonical compressed kmer (cck) per TR locus
        - cck is equivalent to motif in our definition
        - vector of size NTR (Number of TR loci)
        - retrieve # of cck in each locus `i` by
            - `ccki_tr[i] -  ccki_tr[i-1]` if i > 0
            - `ccki_tr[i]` if i == 0
        - we will need this to perform LD pruning for each locus, or a block in the matrix

In [None]:
#!/usr/bin/env python3
import sys
srcdir = "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/script/"
sys.path.insert(0, srcdir)

In [2]:
import numpy as np
import pandas as pd
import vntrutils as vu
import utils
import matplotlib
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import pickle
import itertools
import gc
import glob
import os
import statsmodels.api as sm
import gzip
from sklearn.metrics import r2_score
import seaborn as sns

matplotlib.rc('font', size=7)
matplotlib.rc('axes', titlesize=7)
matplotlib.rc('xtick', labelsize=5)
matplotlib.rc('ytick', labelsize=5)
%load_ext autoreload
%autoreload 2

In [6]:
def get_1(file_path):
    """Load the pickled two-element payload stored at ``file_path``.

    The pickle is expected to unpack into exactly two objects, returned as
    ``(ki_tr, ccki_tr)``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    ki_tr, ccki_tr = payload
    return ki_tr, ccki_tr

def get_2(file_path):
    """Load the pickled four-element payload stored at ``file_path``.

    Returns ``(ks, ccks, tr_cck_ns, ki_map)``; the third element is converted
    to a NumPy array, the others are returned exactly as unpickled.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    ks, ccks, counts, ki_map = payload
    return ks, ccks, np.array(counts), ki_map

def gather_motifs(gt_HPRC, NCCK, NB, out_dir):
    """Assemble the full motif matrix from per-batch pickles.

    Parameters
    ----------
    gt_HPRC : path to a text file read with ``np.loadtxt``; its number of
        entries fixes the number of columns of the result.
    NCCK : number of rows of the result.
    NB : number of batch files ``{out_dir}/cgt.<i>.pickle`` for ``i`` in ``0..NB-1``.
    out_dir : directory holding the batch pickles.

    Returns
    -------
    ``float32`` array of shape ``(NCCK, n_genomes)``. Every batch fills
    ``n_genomes // NB`` consecutive columns except the last one, which
    extends to the final column.
    """
    sample_ids = np.loadtxt(gt_HPRC, dtype=object)
    ng = sample_ids.size
    BS = ng // NB
    print(BS, ng, NB)

    cgt = np.zeros([NCCK, ng], dtype=np.float32)
    for batch in range(NB):
        print(f"Loading batch {batch+1}")
        col_start = batch * BS
        # The final batch absorbs the remainder columns.
        col_stop = ng if batch == NB - 1 else col_start + BS
        with open(f"{out_dir}/cgt.{batch}.pickle", 'rb') as handle:
            cgt[:, col_start:col_stop] = pickle.load(handle)
    return cgt

def adjust_coverage(cgt, gt_HPRC, HPRC_chr1_cov, out_dir):
    """Divide every sample column of ``cgt`` by that sample's coverage.

    Parameters
    ----------
    cgt : 2-D float array with one column per genome listed in ``gt_HPRC``
        (same order). It is modified IN PLACE and also returned, so the
        caller's array and the return value are the same object.
    gt_HPRC : path to a text file with one genome name per line.
    HPRC_chr1_cov : path to a whitespace-delimited two-column file
        ``<genome> <coverage>``; it may list genomes that are not in
        ``gt_HPRC``.
    out_dir : directory where ``acgt.pickle`` is (over)written.

    Returns
    -------
    The coverage-adjusted matrix (the same object as ``cgt``).

    Raises
    ------
    ValueError
        If a genome in ``gt_HPRC`` has no entry in the coverage file.
    """
    print("Loading coverage...")
    # atleast_1d / ndmin=2 keep single-genome and single-row files iterable.
    genomes = np.atleast_1d(np.loadtxt(gt_HPRC, dtype=object))
    cov_table = np.loadtxt(HPRC_chr1_cov, dtype=object, ndmin=2)
    cov_by_genome = {g: float(c) for g, c in cov_table}

    missing = [g for g in genomes if g not in cov_by_genome]
    if missing:
        raise ValueError(
            f"{len(missing)} genome(s) have no coverage entry in {HPRC_chr1_cov}: {missing[:5]}"
        )
    # Look coverage up in *genome-list* order so that cov[j] belongs to column j
    # of cgt, regardless of how the coverage file happens to be sorted.
    cov = np.array([cov_by_genome[g] for g in genomes])

    print("Computing acgt...")
    cgt /= cov  # in place: avoids a second full-size copy of the matrix
    print("Dumping acgt")
    with open(f"{out_dir}/acgt.pickle", 'wb') as f:
        pickle.dump(cgt, f, protocol=pickle.HIGHEST_PROTOCOL)
    return cgt

def _standardize_rows(block):
    """Center each row and scale it to unit L2 norm (computed in float64).

    After this transform the dot product of two rows equals their Pearson
    correlation. Zero-variance rows become all zeros, so their correlation
    with any other row evaluates to 0 rather than NaN.
    """
    rows = np.asarray(block, dtype=np.float64)
    rows = rows - rows.mean(axis=1, keepdims=True)
    norms = np.sqrt((rows * rows).sum(axis=1, keepdims=True))
    norms[norms == 0] = np.inf  # x / inf == 0, no divide-by-zero warning
    return rows / norms


def compute_ld_r2(acgt, ccki_tr, ccks, r2_cutoff, n_loci=10, report_every=2):
    """Greedy within-locus LD pruning of motifs.

    For every processed locus, motifs are visited in row order. Each motif
    that is still unpruned acts as a reference; every later motif of the
    SAME locus whose squared Pearson correlation with the reference exceeds
    ``r2_cutoff`` is marked as pruned. Pruned motifs are never used as
    references, and comparisons never cross a locus boundary.

    Parameters
    ----------
    acgt : 2-D array, one row per motif and one column per sample.
    ccki_tr : cumulative motif count per locus; locus ``i`` owns rows
        ``ccki_tr[i-1]:ccki_tr[i]`` (``0:ccki_tr[0]`` for ``i == 0``).
    ccks : sequence whose length is the total number of motifs.
    r2_cutoff : motifs with r^2 strictly above this value are pruned.
    n_loci : number of leading loci to process. Defaults to 10, the limit
        that was previously hard-coded; pass ``None`` to process every locus.
    report_every : print a progress line every this many loci
        (0 or ``None`` disables progress output).

    Returns
    -------
    Boolean array of length ``len(ccks)``; ``True`` marks a pruned motif.
    Motifs with zero variance across samples have an undefined correlation
    and are therefore never pruned.
    """
    NL = len(ccki_tr)
    NM = len(ccks)
    n_todo = NL if n_loci is None else min(n_loci, NL)
    # keep track of which variants have been pruned
    pruned = np.zeros(NM, dtype=bool)
    print(NM)
    print("Pruning...")
    for i in range(n_todo):
        locus_s = ccki_tr[i-1] if i != 0 else 0
        locus_e = ccki_tr[i]  # exclusive end: first row of the next locus
        unit = _standardize_rows(acgt[locus_s:locus_e])
        in_locus = pruned[locus_s:locus_e]  # view into `pruned`
        for ref in range(len(unit) - 1):
            if in_locus[ref]:
                # already redundant with an earlier motif; not a reference
                continue
            # Squared Pearson r between the reference and all later motifs.
            r2 = (unit[ref + 1:] @ unit[ref]) ** 2
            in_locus[ref + 1:] |= r2 > r2_cutoff
        if report_every and (i + 1) % report_every == 0:
            print(f"{i+1} loci pruned")
    return pruned

In [7]:
# Input pickles, the genome list, the per-sample coverage table and the output directory.
# NOTE(review): these are absolute, cluster-specific paths; adjust them before
# running the notebook elsewhere.
get_1_file="/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ki_tr.ccki_tr.pickle"
get_2_file="/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ks.ccks.tr_cck_ns.ki_map.pickle"
gt_HPRC="/project/mchaisso_100/cmb-17/vntr_genotyping/aydin/LD_prune/input/genomes.txt"
HPRC_chr1_cov="/project/mchaisso_100/cmb-17/vntr_genotyping/aydin/LD_prune/input/1kg_all.cov.tsv"
out="/scratch1/tsungyul/aydin/k2m_output"

# ccki_tr: cumulative number of motifs (cck) per TR locus.
ki_tr, ccki_tr = get_1(get_1_file)
ks, ccks, tr_cck_ns, ki_map = get_2(get_2_file)

# NK: length of `ks` (presumably the number of kmers -- confirm).
NK = len(ks)
# NCCK: length of `ccks`; used as the row count of the motif matrix in gather_motifs.
NCCK = len(ccks)
# NB: number of cgt.<i>.pickle batch files that gather_motifs reads from `out`.
NB = 40

# IL dosage

In [None]:
# Build the coverage-adjusted motif dosage matrix `acgt`.
#
# The commented-out block below is a disabled cache check that would reuse
# {out}/acgt.pickle when present. As written, the matrix is always rebuilt
# from the batch pickles and acgt.pickle is overwritten by adjust_coverage.
# acgt = None
# if os.path.exists(f"{out}/acgt.pickle"):
#     print("acgt file found")
#     with open(f"{out}/acgt.pickle", 'rb') as f:
#         acgt =  pickle.load(f)
# else:
#     print("acgt file not found")
#     cgt = gather_motifs(gt_HPRC, NCCK, NB, out)
#     acgt =  adjust_coverage(cgt, gt_HPRC, HPRC_chr1_cov, out)
cgt = gather_motifs(gt_HPRC, NCCK, NB, out)
# NOTE: adjust_coverage divides its input in place and returns that same array,
# so after this line `cgt` and `acgt` are one object holding adjusted values.
acgt =  adjust_coverage(cgt, gt_HPRC, HPRC_chr1_cov, out)

In [None]:
# Motifs whose r2 with a reference motif exceeds this threshold get pruned.
r2_cutoff = 0.8
# Boolean mask of length len(ccks); True marks a pruned motif.
# This call only processes the first 10 loci.
cck_pruned = compute_ld_r2(acgt, ccki_tr, ccks, r2_cutoff)

In [None]:
# Tally kept (False) vs. pruned (True) motifs over the first 10 loci;
# ccki_tr[9] is the cumulative motif count through locus index 9.
np.unique(cck_pruned[:ccki_tr[9]], return_counts = True)