**Set environment**

In [1]:
import numpy  as np
import pandas as pd
import itertools as it
import os, re
import csv

In [2]:
# Execute the project configuration script; IPython's %run loads the names
# the script defines into this notebook's namespace.
# NOTE(review): presumably this is where show_env() and the FD_* path variables
# used later (e.g., FD_DAT, FD_RES) come from — the script is not shown here; confirm.
%run ../run_config_project.py
# Print the configured base and project directories (listed in the cell output).
show_env()

BASE DIRECTORY (FD_BASE): /hpc/group/igvf/kk319
REPO DIRECTORY (FD_REPO): /hpc/group/igvf/kk319/repo
WORK DIRECTORY (FD_WORK): /hpc/group/igvf/kk319/work
DATA DIRECTORY (FD_DATA): /hpc/group/igvf/kk319/data


You are working with      IGVF BlueSTARR
PATH OF PROJECT (FD_PRJ): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR
PROJECT RESULTS (FD_RES): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results
PROJECT SCRIPTS (FD_EXE): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/scripts
PROJECT DATA    (FD_DAT): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data
PROJECT NOTE    (FD_NBK): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/notebooks
PROJECT DOCS    (FD_DOC): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/docs
PROJECT LOG     (FD_LOG): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/log
PROJECT REF     (FD_REF): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/references



## Helper functions

In [23]:
def gen_parse_variant_line(txt_line: str):
    """Parse one line of a Kircher chunk file and yield each variant on it.

    The line is tab-separated. The first field is the region
    (e.g., "chr11:5227021-5227208"); every following field describes one
    position in the form "<pos>:ref=<REF>:<ALT>[,<ALT>...]", where REF is one
    or more of A/C/G/T/N and each ALT is a single A/C/G/T/N base.

    Parameters
    ----------
    txt_line : str
        One line of the file. A trailing line ending (LF or CRLF) is ignored.

    Yields
    ------
    tuple of (str, int, str, str)
        (region, position, ref base, alt base); one tuple per ALT base listed
        for a position, in the order they appear on the line.

    Notes
    -----
    - Fields that do not match the expected pattern are skipped silently.
    - ALT bases are yielded exactly as listed; nothing is filtered, so an ALT
      equal to the REF base is yielded too if the file lists it.
    """
    ### parse each line
    ### (strip CR as well as LF; otherwise a CRLF line ending stays attached
    ###  to the last field, which then fails the pattern and is dropped)
    lst_txt_parsed = txt_line.rstrip("\r\n").split("\t")
    parse_variant  = re.compile(r"^(\d+):ref=([ACGTN]+):([ACGTN](?:,[ACGTN])*)$")
    
    ### get region
    txt_region = lst_txt_parsed[0]  # e.g., "chr11:5227021-5227208"

    ### parse variants at each location
    for txt in lst_txt_parsed[1:]:
        ### parse variant; skip fields that are not in the expected format
        txt_variant = parse_variant.match(txt)
        if not txt_variant:
            continue
            
        ### get position and ref/alt base
        num_pos          = int(txt_variant.group(1))
        txt_base_ref     = txt_variant.group(2)
        vec_txt_base_alt = txt_variant.group(3).split(",")  # listed ALT bases (may repeat REF)
        
        ### one record per ALT base
        for txt_base_alt in vec_txt_base_alt:
            yield (txt_region, num_pos, txt_base_ref, txt_base_alt)

def gen_parse_variant_file(txt_path: str):
    """Read a Kircher chunk file line by line and yield every parsed variant.

    Each line is handed to gen_parse_variant_line; the tuples it produces are
    passed through unchanged, in file order.
    """
    with open(txt_path) as fin:
        for txt_row in fin:
            for tup_variant in gen_parse_variant_line(txt_row):
                yield tup_variant

In [24]:
def gen_chunked(iterable, num_size):
    """Group items of an iterable/generator into consecutive lists.

    Every yielded list holds num_size items, except possibly the last one,
    which holds whatever remains. Nothing is yielded for an empty input.
    """
    iterator = iter(iterable)

    def take_block():
        ### pull the next (up to) num_size items from the shared iterator
        return list(it.islice(iterator, num_size))

    ### two-argument iter(): call take_block repeatedly until it returns []
    yield from iter(take_block, [])

## Split Kircher variants into fixed-size chunks

In [27]:
### set file directory (input: Kircher chunk file under the project data folder)
txt_fdiry = os.path.join(FD_DAT, "kircher2019")
txt_fname = "Kircher_region_chunk.txt"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### set generator
### (the input path is passed in here, so reusing txt_fdiry/txt_fname/txt_fpath
###  for the output files below does not change which file is read)
gen = gen_parse_variant_file(txt_fpath)

### split to chunks and save results
num_chunk_size = 1000

### set output directory once (it does not change between chunks) and make
### sure it exists; otherwise open(..., "w") fails with FileNotFoundError
txt_fdiry = os.path.join(FD_RES, "predict_variant_kircher2019")
os.makedirs(txt_fdiry, exist_ok=True)

for idx, chunk in enumerate(gen_chunked(gen, num_chunk_size), start=1):
    
    ### set file path of this chunk (1-based index, zero-padded to two digits)
    txt_fname = f"variant.chunk.{idx:02}.tsv"
    txt_fpath = os.path.join(txt_fdiry, txt_fname)
    print(txt_fname)
    
    ### export each chunk as a headerless TSV, one variant tuple per row
    with open(txt_fpath, "w", newline="") as fout:
        writer = csv.writer(fout, delimiter="\t")
        writer.writerows(chunk)

variant.chunk.01.tsv
variant.chunk.02.tsv
variant.chunk.03.tsv
variant.chunk.04.tsv
variant.chunk.05.tsv
variant.chunk.06.tsv
variant.chunk.07.tsv
variant.chunk.08.tsv
variant.chunk.09.tsv
variant.chunk.10.tsv
variant.chunk.11.tsv
variant.chunk.12.tsv
variant.chunk.13.tsv
variant.chunk.14.tsv
variant.chunk.15.tsv
variant.chunk.16.tsv
variant.chunk.17.tsv
variant.chunk.18.tsv
variant.chunk.19.tsv
variant.chunk.20.tsv
variant.chunk.21.tsv
variant.chunk.22.tsv
variant.chunk.23.tsv
variant.chunk.24.tsv
variant.chunk.25.tsv
variant.chunk.26.tsv
variant.chunk.27.tsv
variant.chunk.28.tsv
variant.chunk.29.tsv
variant.chunk.30.tsv
variant.chunk.31.tsv
variant.chunk.32.tsv
variant.chunk.33.tsv
variant.chunk.34.tsv
variant.chunk.35.tsv
variant.chunk.36.tsv
variant.chunk.37.tsv
variant.chunk.38.tsv
variant.chunk.39.tsv
variant.chunk.40.tsv
