**Set environment**

In [1]:
import numpy  as np
import pandas as pd
import itertools as it
import os, re
import csv

In [2]:
%run ../run_config_project.py
show_env()

BASE DIRECTORY (FD_BASE): /hpc/group/igvf/kk319
REPO DIRECTORY (FD_REPO): /hpc/group/igvf/kk319/repo
WORK DIRECTORY (FD_WORK): /hpc/group/igvf/kk319/work
DATA DIRECTORY (FD_DATA): /hpc/group/igvf/kk319/data


You are working with      IGVF BlueSTARR
PATH OF PROJECT (FD_PRJ): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR
PROJECT RESULTS (FD_RES): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results
PROJECT SCRIPTS (FD_EXE): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/scripts
PROJECT DATA    (FD_DAT): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data
PROJECT NOTE    (FD_NBK): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/notebooks
PROJECT DOCS    (FD_DOC): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/docs
PROJECT LOG     (FD_LOG): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/log
PROJECT REF     (FD_REF): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/references



## Import one line

In [3]:
### folder with the kircher2019 input data
### (alternative location, disabled: os.path.join(FD_PRJ, "shared", "igvf_bluestarr_revathy"))
txt_fdiry = os.path.join(FD_DAT, "kircher2019")

### os.listdir returns entries in arbitrary order; sort for a reproducible listing
sorted(os.listdir(txt_fdiry))

['Kircher_region_chunk.txt']

In [4]:
### locate the input chunk file
txt_fname = "Kircher_region_chunk.txt"
txt_fdiry = os.path.join(FD_DAT, "kircher2019")
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### pull only the first line; the rest of the file is not read
with open(txt_fpath, "r") as fin:
    line = next(fin)

In [5]:
### show the raw first line to inspect its tab-separated token format
line

'chr11:5227021-5227208\t5227021:ref=G:G,C,A,T\t5227022:ref=G:G,C,A,T\t5227023:ref=T:G,C,A,T\t5227024:ref=G:G,C,A,T\t5227025:ref=T:G,C,A,T\t5227026:ref=C:G,C,A,T\t5227027:ref=T:G,C,A,T\t5227028:ref=G:G,C,A,T\t5227029:ref=T:G,C,A,T\t5227030:ref=T:G,C,A,T\t5227031:ref=T:G,C,A,T\t5227032:ref=G:G,C,A,T\t5227033:ref=A:G,C,A,T\t5227034:ref=G:G,C,A,T\t5227035:ref=G:G,C,A,T\t5227036:ref=T:G,C,A,T\t5227037:ref=T:G,C,A,T\t5227038:ref=G:G,C,A,T\t5227039:ref=C:G,C,A,T\t5227040:ref=T:G,C,A,T\t5227041:ref=A:G,C,A,T\t5227042:ref=G:G,C,A,T\t5227043:ref=T:G,C,A,T\t5227044:ref=G:G,C,A,T\t5227045:ref=A:G,C,A,T\t5227046:ref=A:G,C,A,T\t5227047:ref=C:G,C,A,T\t5227048:ref=A:G,C,A,T\t5227049:ref=C:G,C,A,T\t5227050:ref=A:G,C,A,T\t5227051:ref=G:G,C,A,T\t5227052:ref=T:G,C,A,T\t5227053:ref=T:G,C,A,T\t5227054:ref=G:G,C,A,T\t5227055:ref=T:G,C,A,T\t5227056:ref=G:G,C,A,T\t5227057:ref=T:G,C,A,T\t5227058:ref=C:G,C,A,T\t5227059:ref=A:G,C,A,T\t5227060:ref=G:G,C,A,T\t5227061:ref=A:G,C,A,T\t5227062:ref=A:G,C,A,T\t5227063:re

## Helper function

In [6]:
### token format: "<pos>:ref=<REF>:<ALT>[,<ALT>...]", e.g., "5227021:ref=G:G,C,A,T"
### compiled once here instead of on every call (the function runs once per file line)
RGX_KIRCHER_VARIANT = re.compile(r"^(\d+):ref=([ACGTN]+):([ACGTN](?:,[ACGTN])*)$")

def gen_parse_variant_line(txt_line: str):
    """Parse one line of a Kircher chunk file and yield one record per listed base.

    A line is tab-separated: the first field is the region
    (e.g., "chr11:5227021-5227208") and every following field is a
    position token such as "5227021:ref=G:G,C,A,T".

    Parameters
    ----------
    txt_line : str
        One line of the chunk file; a trailing "\\n" or "\\r\\n" is tolerated.

    Yields
    ------
    tuple
        (region: str, position: int, ref base(s): str, alt base: str),
        one tuple per base listed after the last ":" of each token.

    Notes
    -----
    Tokens that do not match the expected format are skipped silently.
    """
    ### strip both LF and CRLF line endings; a leftover "\r" would make the
    ### last token fail the anchored pattern and be dropped without notice
    lst_txt_parsed = txt_line.rstrip("\r\n").split("\t")

    ### get region
    txt_region = lst_txt_parsed[0]  # e.g., "chr11:5227021-5227208"

    ### parse variants at each location
    for txt in lst_txt_parsed[1:]:
        ### parse variant; ignore tokens that are not in the expected format
        txt_variant = RGX_KIRCHER_VARIANT.match(txt)
        if not txt_variant:
            continue

        ### get position and ref/alt base
        num_pos          = int(txt_variant.group(1))
        txt_base_ref     = txt_variant.group(2)
        ### the listed bases may contain the REF base itself (e.g., ref=G:G,C,A,T),
        ### in which case a row with alt == ref is emitted as well
        vec_txt_base_alt = txt_variant.group(3).split(",")

        for txt_base_alt in vec_txt_base_alt:
            yield (txt_region, num_pos, txt_base_ref, txt_base_alt)

def gen_parse_variant_file(txt_path: str):
    """Stream a Kircher chunk file and yield every parsed variant record.

    The file is read one line at a time; each line is handed to
    `gen_parse_variant_line` and its records are passed on in order.
    """
    with open(txt_path) as fin:
        for txt_line in fin:
            for tup_variant in gen_parse_variant_line(txt_line):
                yield tup_variant

## Test single line

In [7]:
### preview the first 10 records parsed from the single line read above
gen = it.islice(gen_parse_variant_line(line), 10)
for row in gen:
    print(row)

('chr11:5227021-5227208', 5227021, 'G', 'G')
('chr11:5227021-5227208', 5227021, 'G', 'C')
('chr11:5227021-5227208', 5227021, 'G', 'A')
('chr11:5227021-5227208', 5227021, 'G', 'T')
('chr11:5227021-5227208', 5227022, 'G', 'G')
('chr11:5227021-5227208', 5227022, 'G', 'C')
('chr11:5227021-5227208', 5227022, 'G', 'A')
('chr11:5227021-5227208', 5227022, 'G', 'T')
('chr11:5227021-5227208', 5227023, 'T', 'G')
('chr11:5227021-5227208', 5227023, 'T', 'C')


## Test whole file

In [8]:
### locate the input chunk file
txt_fname = "Kircher_region_chunk.txt"
txt_fdiry = os.path.join(FD_DAT, "kircher2019")
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### stream the file lazily and show only the first 10 parsed records
gen = it.islice(gen_parse_variant_file(txt_fpath), 10)
for row in gen:
    print(row)

('chr11:5227021-5227208', 5227021, 'G', 'G')
('chr11:5227021-5227208', 5227021, 'G', 'C')
('chr11:5227021-5227208', 5227021, 'G', 'A')
('chr11:5227021-5227208', 5227021, 'G', 'T')
('chr11:5227021-5227208', 5227022, 'G', 'G')
('chr11:5227021-5227208', 5227022, 'G', 'C')
('chr11:5227021-5227208', 5227022, 'G', 'A')
('chr11:5227021-5227208', 5227022, 'G', 'T')
('chr11:5227021-5227208', 5227023, 'T', 'G')
('chr11:5227021-5227208', 5227023, 'T', 'C')


**Total number of variant rows (one row per listed base, REF allele included)**

In [11]:
### locate the input chunk file
txt_fname = "Kircher_region_chunk.txt"
txt_fdiry = os.path.join(FD_DAT, "kircher2019")
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### exhaust the generator, tallying one per yielded record
gen = gen_parse_variant_file(txt_fpath)
num = sum(map(lambda tup_row: 1, gen))
print(num)

39376


**Total number of positions**

In [14]:
### assumes every position lists exactly 4 bases (G,C,A,T), as in the first line shown above
### so rows / 4 = number of positions -- TODO confirm this holds for the whole file
num / 4

9844.0

## Export test chunk

In [9]:
### get a chunk with 100 variants
txt_fdiry = os.path.join(FD_DAT, "kircher2019")
txt_fname = "Kircher_region_chunk.txt"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

gen = gen_parse_variant_file(txt_fpath)
gen = it.islice(gen, 100)

### export
txt_fdiry = os.path.join(FD_RES, "predict_variant_kircher2019")
txt_fname = "variant.test.chunk_100.tsv"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

with open(txt_fpath, "w", newline="") as fout:
    writer = csv.writer(fout, delimiter="\t")
    
    for row in gen:
        writer.writerow(row)

In [10]:
### get a chunk with 1000 variants
txt_fdiry = os.path.join(FD_DAT, "kircher2019")
txt_fname = "Kircher_region_chunk.txt"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

gen = gen_parse_variant_file(txt_fpath)
gen = it.islice(gen, 1000)

### export
txt_fdiry = os.path.join(FD_RES, "predict_variant_kircher2019")
txt_fname = "variant.test.chunk_1000.tsv"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

with open(txt_fpath, "w", newline="") as fout:
    writer = csv.writer(fout, delimiter="\t")
    
    for row in gen:
        writer.writerow(row)