**Set environment**

In [1]:
import numpy  as np
import pandas as pd
import os

from Bio import SeqIO

In [2]:
# Execute the shared project configuration script inside this notebook's
# namespace. Presumably this is where FD_RES and show_env come from, since
# neither is defined in the notebook itself; confirm in run_config_project.py.
%run ../run_config_project.py
# Print the base / project directory settings.
show_env()

BASE DIRECTORY (FD_BASE): /hpc/group/igvf/kk319
REPO DIRECTORY (FD_REPO): /hpc/group/igvf/kk319/repo
WORK DIRECTORY (FD_WORK): /hpc/group/igvf/kk319/work
DATA DIRECTORY (FD_DATA): /hpc/group/igvf/kk319/data


You are working with      IGVF BlueSTARR
PATH OF PROJECT (FD_PRJ): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR
PROJECT RESULTS (FD_RES): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results
PROJECT SCRIPTS (FD_EXE): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/scripts
PROJECT DATA    (FD_DAT): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data
PROJECT NOTE    (FD_NBK): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/notebooks
PROJECT DOCS    (FD_DOC): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/docs
PROJECT LOG     (FD_LOG): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/log
PROJECT REF     (FD_REF): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/references



## Import data

**Check file existence**

In [3]:
# Export the Python variable FD_RES as an environment variable so that
# shell cells (%%bash) can read it as ${FD_RES}.
%env FD_RES={FD_RES}

env: FD_RES=/hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results


In [4]:
%%bash
# List the analysis folder to confirm the input files are present.
# The expansion is quoted so a path containing spaces or glob characters
# is passed to ls as a single argument.
ls "${FD_RES}/analysis_variant_motif_richard"

test_seq.fa
test_seq.fa.fai
test_variants.fa
test_variants.fa.fai
variant_closed_gof_bluestarr_flank35_obs.fa
variant_closed_gof_bluestarr_flank35_ref.fa
variant_closed_gof_bluestarr_flank35_unobs.fa
variant_closed_gof_bluestarr.tsv.gz
variant_closed_gof_bluestarr_withseq_flank35.tsv.gz


**Import data**

In [5]:
### folder holding the flank35 FASTA files
txt_fdiry = os.path.join(FD_RES, "analysis_variant_motif_richard")

### full paths of the ref / obs / unobs FASTA files (same order throughout)
txt_fpath_ref, txt_fpath_obs, txt_fpath_unobs = (
    os.path.join(txt_fdiry, txt_fname_fasta)
    for txt_fname_fasta in (
        "variant_closed_gof_bluestarr_flank35_ref.fa",
        "variant_closed_gof_bluestarr_flank35_obs.fa",
        "variant_closed_gof_bluestarr_flank35_unobs.fa",
    )
)

### parse each FASTA file fully into a list of records
lst_seq_ref, lst_seq_obs, lst_seq_unobs = (
    list(SeqIO.parse(txt_fpath_fasta, "fasta"))
    for txt_fpath_fasta in (txt_fpath_ref, txt_fpath_obs, txt_fpath_unobs)
)

**Check data**

In [13]:
# show the type of the objects produced by SeqIO.parse
print(type(lst_seq_ref[0]))

<class 'Bio.SeqRecord.SeqRecord'>


In [6]:
# show the first record parsed from the *_ref.fa file
print(lst_seq_ref[0])

ID: chr4:74487586:A:G:C
Name: chr4:74487586:A:G:C
Description: chr4:74487586:A:G:C
Number of features: 0
Seq('ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAGAAAATACT...ACT')


In [7]:
# show the first record parsed from the *_obs.fa file
print(lst_seq_obs[0])

ID: chr4:74487586:A:G:C
Name: chr4:74487586:A:G:C
Description: chr4:74487586:A:G:C
Number of features: 0
Seq('ACATGTGTGTATGCTTGTCTATATACACTGTATGAGGCAAGAGGAGAAAATACT...ACT')


In [8]:
# show the first record parsed from the *_unobs.fa file
print(lst_seq_unobs[0])

ID: chr4:74487586:A:G:C
Name: chr4:74487586:A:G:C
Description: chr4:74487586:A:G:C
Number of features: 0
Seq('ACATGTGTGTATGCTTGTCTATATACACTGTATGACGCAAGAGGAGAAAATACT...ACT')


In [9]:
### sanity check: ref / obs / unobs must contain the same number of records
assert len({len(lst_seq_ref), len(lst_seq_obs), len(lst_seq_unobs)}) == 1

### report the shared record count
num_total = len(lst_seq_ref)
print("Total variants:", num_total)

Total variants: 1884977


## Extract dev (test) batches of the first 1k/10k sequences from each FASTA file

In [10]:
### dev batch settings: shared file name prefix and number of records per batch
txt_prefix         = "variant_closed_gof_bluestarr_flank35"
lst_num_batch_size = [1_000, 10_000]

### make sure the output folder for the dev batches exists
txt_fdiry_batch = os.path.join(FD_RES, "analysis_variant_motif_richard", "batches_dev")
os.makedirs(txt_fdiry_batch, exist_ok=True)

In [12]:
### create dev batches
### For every batch size, write the first N records of the ref / obs / unobs
### record lists to three FASTA files in txt_fdiry_batch.
for num_batch_size in lst_num_batch_size:

    # slice the first N records; a slice silently returns fewer than N items
    # when the input list is shorter, so the real count is reported below
    lst_batch_ref   = lst_seq_ref[:num_batch_size]
    lst_batch_obs   = lst_seq_obs[:num_batch_size]
    lst_batch_unobs = lst_seq_unobs[:num_batch_size]

    # NOTE(review): the three lists are presumably parallel (same variant at
    # the same index) -- confirm. Count positions whose record IDs disagree
    # and warn rather than fail, so the files are still written.
    num_mismatch = sum(
        not (rec_ref.id == rec_obs.id == rec_unobs.id)
        for rec_ref, rec_obs, rec_unobs in zip(lst_batch_ref, lst_batch_obs, lst_batch_unobs)
    )
    if num_mismatch > 0:
        print(f"WARNING: {num_mismatch} of the first {num_batch_size} records have different IDs across ref/obs/unobs")

    # set file names
    txt_fpath_out_ref   = os.path.join(txt_fdiry_batch, f"{txt_prefix}_dev{num_batch_size}_ref.fa")
    txt_fpath_out_obs   = os.path.join(txt_fdiry_batch, f"{txt_prefix}_dev{num_batch_size}_obs.fa")
    txt_fpath_out_unobs = os.path.join(txt_fdiry_batch, f"{txt_prefix}_dev{num_batch_size}_unobs.fa")

    # Write; SeqIO.write returns the number of records actually written
    num_written_ref   = SeqIO.write(lst_batch_ref,   txt_fpath_out_ref,   "fasta")
    num_written_obs   = SeqIO.write(lst_batch_obs,   txt_fpath_out_obs,   "fasta")
    num_written_unobs = SeqIO.write(lst_batch_unobs, txt_fpath_out_unobs, "fasta")

    # flag batches that came out smaller than requested or unequal in size
    if not (num_written_ref == num_written_obs == num_written_unobs == num_batch_size):
        print(
            f"WARNING: requested {num_batch_size} records but wrote "
            f"ref={num_written_ref}, obs={num_written_obs}, unobs={num_written_unobs}"
        )

    # report what was actually written, not what was requested
    print(f"Wrote dev batch of {num_written_ref} sequences to {txt_fdiry_batch}")

Wrote dev batch of 1000 sequences to /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results/analysis_variant_motif_richard/batches_dev
Wrote dev batch of 10000 sequences to /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results/analysis_variant_motif_richard/batches_dev


In [5]:
### locate the variant table that carries the flank-35 sequences
txt_fname = "variant_closed_gof_bluestarr_withseq_flank35.tsv.gz"
txt_fdiry = os.path.join(FD_RES, "analysis_variant_motif_richard")
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### read the tab-separated table; both names refer to the same DataFrame
dat_variant_withseq = dat = pd.read_csv(txt_fpath, sep="\t")

### show dimensions and the first rows
print(dat_variant_withseq.shape)
dat_variant_withseq.head()

(1884977, 17)


Unnamed: 0,Chrom,ChromStart,ChromEnd,Region,Variant_ID,Pos0,Ref,Obs,Unobs,Delta,Flank,Interval,Seq_Length,Seq_Ref,Seq_Obs,Seq_Unobs,Check_RefMatch
0,chr4,74487576,74488141,chr4:74487576-74488141,chr4:74487586:A:G:C,74487586,A,G,C,0.964984,35,chr4:74487551-74487622,71,ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAG...,ACATGTGTGTATGCTTGTCTATATACACTGTATGAGGCAAGAGGAG...,ACATGTGTGTATGCTTGTCTATATACACTGTATGACGCAAGAGGAG...,True
1,chr9,135685425,135685942,chr9:135685425-135685942,chr9:135685432:G:G:C,135685432,G,G,C,0.945423,35,chr9:135685397-135685468,71,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGACGCAACACCAA...,True
2,chr1,173251481,173251541,chr1:173251481-173251541,chr1:173251490:G:G:C,173251490,G,G,C,0.910127,35,chr1:173251455-173251526,71,ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...,ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...,ACATAGTACTCAATACCTATTTGGTGAATAGATGACGCAATCTCAC...,True
3,chr1,147523084,147523503,chr1:147523084-147523503,chr1:147523391:G:G:C,147523391,G,G,C,0.9088,35,chr1:147523356-147523427,71,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGACGCAATGCAAT...,True
4,chr18,54580694,54581420,chr18:54580694-54581420,chr18:54580749:A:G:T,54580749,A,G,T,0.862296,35,chr18:54580714-54580785,71,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAAGCAATGGCCA...,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAGGCAATGGCCA...,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGATGCAATGGCCA...,True
