In [None]:
#!/usr/bin/env python3
import sys
# Directory added to the module search path; the project-local imports in the
# next cell (`vntrutils`, `utils`) presumably resolve from here — TODO confirm.
# NOTE(review): absolute, user-specific cluster path — the notebook will not run
# on another machine without editing this line.
srcdir = "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/script/"
# Prepend (index 0) so modules in srcdir win over same-named installed modules.
sys.path.insert(0, srcdir)

In [None]:
import numpy as np
import pandas as pd
import vntrutils as vu
import utils
import matplotlib
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import pickle
import itertools
import gc
import glob
import os
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multitest import multipletests
import gzip
from sklearn.metrics import r2_score
import seaborn as sns
import time

# Notebook-wide figure typography: size 7 for the base font and axes titles,
# size 5 for the tick labels on both axes.
matplotlib.rcParams.update({
    'font.size': 7,
    'axes.titlesize': 7,
    'xtick.labelsize': 5,
    'ytick.labelsize': 5,
})
%load_ext autoreload
%autoreload 2

In [6]:
def get_1(file_path):
    """Read a pickle holding exactly two objects and return them as a tuple.

    Parameters
    ----------
    file_path : path of the pickle file to read (opened in binary mode).

    Returns
    -------
    tuple
        The two unpickled objects, in stored order (the notebook binds them
        to `ki_tr` and `ccki_tr`).

    Raises
    ------
    ValueError
        If the pickled object does not unpack into exactly two items.

    NOTE(review): `pickle.load` can execute arbitrary code; only point this
    at trusted files.
    """
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    first, second = payload
    return first, second

def get_2(file_path):
    """Read a pickle holding exactly four objects and return them as a tuple.

    The third object is converted with `np.array` before being returned; the
    other three are passed through unchanged.

    Parameters
    ----------
    file_path : path of the pickle file to read (opened in binary mode).

    Returns
    -------
    tuple
        `(ks, ccks, tr_cck_ns, ki_map)` in stored order, with `tr_cck_ns`
        as a NumPy array.

    Raises
    ------
    ValueError
        If the pickled object does not unpack into exactly four items.

    NOTE(review): `pickle.load` can execute arbitrary code; only point this
    at trusted files.
    """
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    ks, ccks, raw_tr_cck_ns, ki_map = payload
    return ks, ccks, np.array(raw_tr_cck_ns), ki_map

def combine_pruned_files(out_dir, r2_threshold, num_jobs, num_motifs, total_loci, ccki_tr):
    """Merge the per-job pruning masks into one boolean array and pickle it.

    Loci are split into `num_jobs` consecutive chunks of
    `total_loci // num_jobs` loci; the last chunk runs to locus
    `total_loci - 1`. For each chunk the mask is read from
    `{out_dir}/cck_pruned_{r2_threshold}_{first_locus}_{last_locus}.pickle`
    and written into the slice `[ccki_tr[first_locus], ccki_tr[last_locus])`
    of the result (the slice starts at 0 when `first_locus` is 0).

    Parameters
    ----------
    out_dir : directory containing the per-job pickles; the combined pickle
        is written here too.
    r2_threshold : value embedded in the input and output file names.
    num_jobs : number of per-job files to read.
    num_motifs : length of the combined boolean array.
    total_loci : total number of loci that were split across the jobs.
    ccki_tr : indexable of motif offsets, looked up by locus index.

    Returns
    -------
    numpy.ndarray
        Boolean array of length `num_motifs`; positions not covered by any
        per-job slice stay False.

    Side effects: writes `{out_dir}/cck_pruned_combined_{r2_threshold}.pickle`
    and prints its location.

    NOTE(review): consecutive jobs end at `ccki_tr[last_locus]` and resume at
    `ccki_tr[last_locus + 1]`, so any motifs between those two offsets are
    never filled in. Confirm this matches how the per-job files were produced.
    """
    chunk_size = total_loci // num_jobs
    final_job = num_jobs - 1
    merged = np.zeros(num_motifs, dtype=bool)

    # load each per-job pickle and copy it into its slice of the merged mask
    for job in range(num_jobs):
        first_locus = job * chunk_size
        if job == final_job:
            last_locus = total_loci - 1
        else:
            last_locus = first_locus + chunk_size - 1

        motif_lo = 0 if first_locus == 0 else ccki_tr[first_locus]
        motif_hi = ccki_tr[last_locus]

        partial_path = f"{out_dir}/cck_pruned_{r2_threshold}_{first_locus}_{last_locus}.pickle"
        with open(partial_path, 'rb') as handle:
            partial_mask = pickle.load(handle)

        merged[motif_lo:motif_hi] = partial_mask

    # persist the merged mask next to the per-job files
    combined_path = f"{out_dir}/cck_pruned_combined_{r2_threshold}.pickle"
    with open(combined_path, 'wb') as handle:
        pickle.dump(merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Combined pruned file saved at {combined_path}")
    return merged

In [None]:
# Input pickles read by get_1 / get_2 below.
# NOTE(review): all paths in this cell are absolute cluster paths; consider
# deriving them from a single configurable base directory.
get_1_file = "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ki_tr.ccki_tr.pickle"
get_2_file = "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/cdbg/ks.ccks.tr_cck_ns.ki_map.pickle"
# NOTE(review): gt_HPRC and HPRC_chr1_cov are not referenced by any cell visible
# in this part of the notebook — confirm they are still needed.
gt_HPRC = "/project/mchaisso_100/cmb-17/vntr_genotyping/aydin/LD_prune/input/genomes.txt"
HPRC_chr1_cov = "/project/mchaisso_100/cmb-17/vntr_genotyping/aydin/LD_prune/input/1kg_all.cov.tsv"
# Tab-separated sample metadata table (read into meta_data_df in the next cell).
meta = "/project/mchaisso_100/cmb-17/vntr_genotyping/1kgr/20130606_g1k_3202_samples_ped_population.simple.tsv"
# Directory holding acgt.pickle and the per-job / combined pruning pickles.
out = "/scratch1/tsungyul/aydin/k2m_output"
ki_tr, ccki_tr = get_1(get_1_file)
ks, ccks, tr_cck_ns, ki_map = get_2(get_2_file)

NK = len(ks)
NCCK = len(ccks)
NB = 40  # NOTE(review): meaning of 40 is not evident from this notebook — document it

# Arguments passed to combine_pruned_files.
num_jobs = 100  # number of per-job pruning pickles expected in `out`
total_loci = len(ki_tr)
num_motifs = len(ccks)  # same value as NCCK
r2_threshold = 0.8  # embedded in the pruning pickle file names

In [None]:
# Load the tab-separated sample metadata table from the path stored in `meta`.
meta_data_df = pd.read_csv(meta, sep='\t')

In [None]:
# Load the cached `acgt` object from {out}/acgt.pickle (a later cell indexes it
# with the boolean pruning mask).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
acgt_pickle_path = f"{out}/acgt.pickle"
if not os.path.exists(acgt_pickle_path):
    # Fail fast: the cells below cannot run without `acgt`. Merely printing a
    # message would let "Run All" continue into a NameError, or silently reuse
    # a stale `acgt` left in the kernel from an earlier session.
    raise FileNotFoundError(
        f"acgt file NOT found at {acgt_pickle_path}. run job on CARC"
    )

print("acgt file found")
with open(acgt_pickle_path, 'rb') as f:
    acgt = pickle.load(f)

In [None]:
# Load the combined pruning mask for `r2_threshold`, building it from the
# per-job pickles the first time and reusing the cached pickle afterwards.
combined_pruned_path = f"{out}/cck_pruned_combined_{r2_threshold}.pickle"

# Label for the log messages, e.g. 0.8 -> "0_8". The previous
# `".".split(r2_threshold)[1]` passed a float as the separator (TypeError), and
# its nested double quotes inside the f-string are a SyntaxError before
# Python 3.12.
threshold_tag = str(r2_threshold).replace(".", "_")

if not os.path.exists(combined_pruned_path):
    print(f"creating pruned_{threshold_tag}")
    # Pass r2_threshold (not a hard-coded 0.8) so the file that gets written is
    # the same one the cache check above looks for.
    pruned_0_8 = combine_pruned_files(out, r2_threshold, num_jobs, num_motifs, total_loci, ccki_tr)
else:
    print(f"retrieving pruned_{threshold_tag}")
    with open(combined_pruned_path, 'rb') as f:
        pruned_0_8 = pickle.load(f)

In [None]:
# Keep only the entries of `acgt` whose flag in `pruned_0_8` is False, then copy
# and transpose so that (per the notebook's convention) each row is a sample and
# each column is a kept feature.
# Assumes `acgt` is a NumPy array with one row per motif — TODO confirm.
kept_motifs = np.array(acgt[~pruned_0_8]).T

# Wrap the matrix in a DataFrame with columns named feature_0, feature_1, ...
kept_motifs_df = pd.DataFrame(
    kept_motifs,
    columns=[f'feature_{col}' for col in range(kept_motifs.shape[1])],
)

In [None]:
# Give the feature rows the metadata index so the column-wise concat below lines
# up row by row.
# NOTE(review): this is a positional relabel only. It assumes row i of
# kept_motifs_df and row i of meta_data_df are the same sample (and raises if the
# lengths differ); it does not verify sample identity — confirm the order upstream.
kept_motifs_df.index = meta_data_df.index

# combine metadata and kept features
combined_data_df = pd.concat([meta_data_df, kept_motifs_df], axis=1)

# Covariates shared by both models: "age + PC1 + ... + PC10".
# TODO: check if col names correct (population, superpopulation, age, PC1..PC10
# must all be columns of meta_data_df).
covariate_terms = "age + " + " + ".join(f"PC{i}" for i in range(1, 11))

# Fit one OLS model per feature for each grouping and collect the p-value of the
# grouping term as a whole.
# A categorical term expands into one coefficient per non-reference level
# (e.g. 'C(population)[T.xyz]'), so `.pvalues` has no 'C(population)' entry and
# indexing it that way raises KeyError. The term-level F-test p-value is taken
# from the type-II ANOVA table instead, whose rows are labelled by term.
population_pvals = []
superpopulation_pvals = []
for motif in kept_motifs_df.columns:
    model_pop = ols(f"{motif} ~ C(population) + {covariate_terms}",
                    data=combined_data_df).fit()
    population_pvals.append(
        sm.stats.anova_lm(model_pop, typ=2).loc['C(population)', 'PR(>F)']
    )

    model_superpop = ols(f"{motif} ~ C(superpopulation) + {covariate_terms}",
                         data=combined_data_df).fit()
    superpopulation_pvals.append(
        sm.stats.anova_lm(model_superpop, typ=2).loc['C(superpopulation)', 'PR(>F)']
    )

# apply Benjamini-Hochberg correction for multiple testing
population_pvals = np.array(population_pvals)
superpopulation_pvals = np.array(superpopulation_pvals)

# multipletests returns (reject, corrected p-values, ...); keep the reject mask.
population_significant = multipletests(population_pvals, method='fdr_bh')[0]
superpopulation_significant = multipletests(superpopulation_pvals, method='fdr_bh')[0]