**Set environment**

In [1]:
import numpy  as np
import pandas as pd
import itertools as it
from functools import partial
import os, sys, re
import csv

In [2]:
### execute the shared project configuration script in this kernel
### (presumably this is where the FD_* path variables and show_env() come from -- verify in run_config_project.py)
%run ../run_config_project.py
### print the configured directories
show_env()

BASE DIRECTORY (FD_BASE): /hpc/group/igvf/kk319
REPO DIRECTORY (FD_REPO): /hpc/group/igvf/kk319/repo
WORK DIRECTORY (FD_WORK): /hpc/group/igvf/kk319/work
DATA DIRECTORY (FD_DATA): /hpc/group/igvf/kk319/data


You are working with      IGVF BlueSTARR
PATH OF PROJECT (FD_PRJ): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR
PROJECT RESULTS (FD_RES): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results
PROJECT SCRIPTS (FD_EXE): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/scripts
PROJECT DATA    (FD_DAT): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data
PROJECT NOTE    (FD_NBK): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/notebooks
PROJECT DOCS    (FD_DOC): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/docs
PROJECT LOG     (FD_LOG): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/log
PROJECT REF     (FD_REF): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/references



In [3]:
import pysam

### reference genome (hg38) FASTA; the opened handle is reused by every
### sequence lookup further down in this notebook
### (pyfaidx.Fasta was considered as an alternative reader)
FP_GEN = "/hpc/group/igvf/kk319/data/genome/hg38/hg38.fa"
fasta  = pysam.FastaFile(FP_GEN)

## Import data

**Check that the input files exist**

In [4]:
%%bash -s "{FD_DAT}"
# print the first positional argument handed to the script via -s
echo $1

/hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data


In [5]:
### list the files available in the "closed" variant folder
!ls $FD_DAT/variant_bluestarr_richard/closed

annotated_snv_sites_filtered.tsv.gz  regions.fasta.gz
annotated_snv_sites.tsv.gz	     regions_GoF_only.fasta.gz


**Import table**

In [6]:
### locate the filtered SNV table inside the project data folder
txt_fname = "annotated_snv_sites_filtered.tsv.gz"
txt_fdiry = os.path.join(FD_DAT, "variant_bluestarr_richard", "closed")
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### load the tab-separated table
dat = pd.read_table(txt_fpath)

### keep the raw import under its own name, then preview
dat_variant_import = dat
print(dat.shape)
dat.head()

(1884977, 6)


Unnamed: 0,region,pos0,delta,ref,obs,unobs
0,chr4:74487576-74488141,74487586,0.964984,A,G,C
1,chr9:135685425-135685942,135685432,0.945423,G,G,C
2,chr1:173251481-173251541,173251490,0.910127,G,G,C
3,chr1:147523084-147523503,147523391,0.9088,G,G,C
4,chr18:54580694-54581420,54580749,0.862296,A,G,T


## Arrange table

In [7]:
### init
dat = dat_variant_import.copy()

### split region ("chrom:start-end") into chrom/start/end
### note: the pattern is anchored on the trailing ":start-end", so contig
###       names that themselves contain ":" or "-" are kept intact
dat_chrom_info = dat["region"].str.extract(r"^(.+):(\d+)-(\d+)$")

### fail early (and readably) if any region does not follow the format
vec_region_bad = dat_chrom_info[0].isna()
if vec_region_bad.any():
    raise ValueError(
        f"{int(vec_region_bad.sum())} region(s) do not look like 'chrom:start-end', "
        f"e.g. {dat.loc[vec_region_bad, 'region'].iloc[0]!r}"
    )

dat["Chrom"]      = dat_chrom_info[0]
dat["ChromStart"] = dat_chrom_info[1].astype(int)
dat["ChromEnd"]   = dat_chrom_info[2].astype(int)

### rename original columns
dat = dat.rename(columns={
    "region": "Region",
    "pos0":   "Pos0",
    "delta":  "Delta",
    "ref":    "Ref",
    "obs":    "Obs",
    "unobs":  "Unobs"
})

### add Variant_ID in the form chrom:pos0:ref:obs:unobs
### note: the position inside the ID is the Pos0 value as given in the table
dat["Variant_ID"] = (
    dat["Chrom"].astype(str) + ":" +
    dat["Pos0"].astype(str)  + ":" +
    dat["Ref"].astype(str)   + ":" +
    dat["Obs"].astype(str)   + ":" +
    dat["Unobs"].astype(str)
)

### Reorder columns
dat = dat[[
    "Chrom", "ChromStart", "ChromEnd", "Region", 
    "Variant_ID", 
    "Pos0",
    "Ref", "Obs", "Unobs",
    "Delta"
]]

### assign and show
dat_variant_arrange = dat
print(dat.shape)
dat.head()

(1884977, 10)


Unnamed: 0,Chrom,ChromStart,ChromEnd,Region,Variant_ID,Pos0,Ref,Obs,Unobs,Delta
0,chr4,74487576,74488141,chr4:74487576-74488141,chr4:74487586:A:G:C,74487586,A,G,C,0.964984
1,chr9,135685425,135685942,chr9:135685425-135685942,chr9:135685432:G:G:C,135685432,G,G,C,0.945423
2,chr1,173251481,173251541,chr1:173251481-173251541,chr1:173251490:G:G:C,173251490,G,G,C,0.910127
3,chr1,147523084,147523503,chr1:147523084-147523503,chr1:147523391:G:G:C,147523391,G,G,C,0.9088
4,chr18,54580694,54581420,chr18:54580694-54581420,chr18:54580749:A:G:T,54580749,A,G,T,0.862296


**Export table**

In [8]:
### set file directory
txt_fdiry = os.path.join(FD_RES, "analysis_variant_motif_richard")
txt_fname = "variant_closed_gof_bluestarr.tsv.gz"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### make sure the output folder exists
### note: to_csv does not create missing directories, so the export would
###       otherwise fail on a fresh checkout of the project
os.makedirs(txt_fdiry, exist_ok=True)

### write table
dat_variant_arrange.to_csv(txt_fpath, sep="\t", index=False, compression="gzip")

## Sanity check: reference allele

**Check the first row**

In [9]:
### first variant of the imported table, written out by hand for a manual check
txt_region    = "chr4:74487576-74488141"
txt_chrom_pos = 74487586

### separate the chromosome name from the "start-end" part of the region
lst_region_part = txt_region.split(":")
txt_chrom_name, txt_chrom_slice = lst_region_part

print(f"Chrom: {txt_chrom_name}; Start-End: {txt_chrom_slice}; Pos: {txt_chrom_pos}")

Chrom: chr4; Start-End: 74487576-74488141; Pos: 74487586


In [10]:
### single-base lookup; coordinates are 0-based and end-exclusive,
### so [pos, pos + 1) returns exactly the base at `pos`
txt_base = fasta.fetch(
    reference = txt_chrom_name,
    start     = txt_chrom_pos,
    end       = txt_chrom_pos + 1
)
print(txt_base)

A


**Check more positions**

In [11]:
### init: reproducible random subset of the imported table
dat = dat_variant_import.sample(10000, random_state=1)

### columns needed for the comparison
vec_chrom = dat["region"].str.split(":").str[0]
vec_pos0  = dat["pos0"]
vec_ref   = dat["ref"]

### collect every position where the table and the genome disagree
lst_mismatches = []

for txt_chrom_name, num_chrom_pos, txt_allele_ref_table in zip(vec_chrom, vec_pos0, vec_ref):
    ### query one base (0-based, end-exclusive) and upper-case it,
    ### because lower-case letters in the FASTA mark soft-masked sequence
    txt_allele_ref_fasta = fasta.fetch(
        txt_chrom_name, num_chrom_pos, num_chrom_pos + 1
    ).upper()

    ### matching alleles need no bookkeeping
    if txt_allele_ref_fasta == txt_allele_ref_table:
        continue

    lst_mismatches.append(
        (txt_chrom_name, num_chrom_pos, txt_allele_ref_table, txt_allele_ref_fasta)
    )

### report
print(f"Checked {len(dat)} variants")
print(f"Found {len(lst_mismatches)} mismatches")
if lst_mismatches:
    print("Example mismatches:", lst_mismatches[:5])

Checked 10000 variants
Found 0 mismatches


## Helper function

In [12]:
def get_variant_interval_series(row, num_interval_flank=35, fasta_handle=None):
    """
    Build the reference, observed and unobserved sequence windows for one variant.

    Parameters
    ----------
    row : mapping (e.g. one row of the arranged variant table)
        Must provide "Region" ("chrom:start-end"), "Pos0" (variant position,
        0-based), "Ref", "Obs" and "Unobs" (alleles).
    num_interval_flank : int, default=35
        Number of bases to include on each side of the variant.
    fasta_handle : object with a ``fetch(chrom, start, end)`` method, optional
        Sequence source queried with 0-based, end-exclusive coordinates.
        When omitted, the notebook-level ``fasta`` handle is used.

    Returns
    -------
    pandas.Series
        Flank, Interval, Seq_Length, Seq_Ref, Seq_Obs, Seq_Unobs and
        Check_RefMatch. The sequences are upper-cased.

    Raises
    ------
    ValueError
        If the window would start before position 0, if the fetched sequence
        is too short to contain the variant position, or if the reference
        base found in the genome differs from ``row["Ref"]``.
    """
    ### sequence source: explicit handle if given, else the notebook-level one
    obj_fasta = fasta if fasta_handle is None else fasta_handle

    ### the chromosome is everything before the trailing ":start-end"
    txt_chrom_name   = row["Region"].rsplit(":", 1)[0]
    txt_chrom_pos0   = row["Pos0"]
    txt_allele_ref   = row["Ref"]
    txt_allele_obs   = row["Obs"]
    txt_allele_unobs = row["Unobs"]

    ### window centered at the variant (0-based, end-exclusive)
    num_window_start = txt_chrom_pos0 - num_interval_flank
    num_window_end   = txt_chrom_pos0 + num_interval_flank + 1
    txt_interval     = f"{txt_chrom_name}:{num_window_start}-{num_window_end}"

    ### a negative start cannot be a valid genomic coordinate
    if num_window_start < 0:
        raise ValueError(
            f"Window {txt_interval} starts before the beginning of the sequence "
            f"(Pos0={txt_chrom_pos0}, flank={num_interval_flank})"
        )

    ### fetch the window; upper-case to ignore soft-masking
    txt_seq_ref = obj_fasta.fetch(txt_chrom_name, num_window_start, num_window_end).upper()

    ### the variant must sit at index `num_interval_flank` of the window
    if len(txt_seq_ref) <= num_interval_flank:
        raise ValueError(
            f"Window {txt_interval} returned only {len(txt_seq_ref)} bases; "
            f"position {txt_chrom_pos0} is not covered"
        )

    ### sanity check reference allele (sequence is already upper-case)
    txt_base_ref   = txt_seq_ref[num_interval_flank]
    is_ref_matched = (txt_base_ref == txt_allele_ref.upper())
    if not is_ref_matched:
        raise ValueError(
            f"Reference mismatch at {txt_chrom_name}:{txt_chrom_pos0} "
            f"(expected {txt_allele_ref}, got {txt_base_ref})"
        )

    ### build alt windows: same flanks, central base swapped
    txt_seq_left  = txt_seq_ref[:num_interval_flank]
    txt_seq_right = txt_seq_ref[num_interval_flank+1:]
    txt_seq_obs   = txt_seq_left + txt_allele_obs   + txt_seq_right
    txt_seq_unobs = txt_seq_left + txt_allele_unobs + txt_seq_right

    ### flank-dependent columns only
    return pd.Series({
        "Flank":      num_interval_flank,
        "Interval":   txt_interval,
        "Seq_Length": len(txt_seq_ref),
        "Seq_Ref":    txt_seq_ref,
        "Seq_Obs":    txt_seq_obs,
        "Seq_Unobs":  txt_seq_unobs,
        "Check_RefMatch": is_ref_matched
    })


## Get sequence by specifying flanking size as 35bp

In [13]:
### window size around each variant
### the value is the number of bases taken on EACH side of the variant,
### so with 35 every window spans 2 * 35 + 1 = 71 bp
NUM_INTERVAL_FLANK = 35 

**Test run**

In [14]:
%%time

### init
dat = dat_variant_arrange.copy()
dat = dat.sample(10000, random_state=123)

### add sequences
fun = partial(get_variant_interval_series, num_interval_flank=NUM_INTERVAL_FLANK)
lst = ["Flank", "Interval", "Seq_Length", "Seq_Ref", "Seq_Obs", "Seq_Unobs", "Check_RefMatch"]
dat[lst] = dat.apply(fun, axis=1)

### assign and show
dat_variant_withseq_sample = dat
print(dat.shape)
dat.head()

(10000, 17)
CPU times: user 1.54 s, sys: 225 ms, total: 1.77 s
Wall time: 1.77 s


Unnamed: 0,Chrom,ChromStart,ChromEnd,Region,Variant_ID,Pos0,Ref,Obs,Unobs,Delta,Flank,Interval,Seq_Length,Seq_Ref,Seq_Obs,Seq_Unobs,Check_RefMatch
1467403,chr2,181868375,181869091,chr2:181868375-181869091,chr2:181869026:G:A:T,181869026,G,A,T,0.00198,35,chr2:181868991-181869062,71,CTTGATCCTCATAATCCTCACAATAACTCTAGAGAGTGAGAAGTGT...,CTTGATCCTCATAATCCTCACAATAACTCTAGAGAATGAGAAGTGT...,CTTGATCCTCATAATCCTCACAATAACTCTAGAGATTGAGAAGTGT...,True
176754,chr1,159747007,159747231,chr1:159747007-159747231,chr1:159747115:T:T:C,159747115,T,T,C,0.018575,35,chr1:159747080-159747151,71,CGTGCATTTTTGTGACCCCAATCATTTTTGAAAACTATCTCAGAGC...,CGTGCATTTTTGTGACCCCAATCATTTTTGAAAACTATCTCAGAGC...,CGTGCATTTTTGTGACCCCAATCATTTTTGAAAACCATCTCAGAGC...,True
227022,chr11,55372676,55374644,chr11:55372676-55374644,chr11:55374072:T:A:C,55374072,T,A,C,0.016494,35,chr11:55374037-55374108,71,AGCCACCTCCTAAAATATATTTGTATACATACTATTTCTATTTTAT...,AGCCACCTCCTAAAATATATTTGTATACATACTATATCTATTTTAT...,AGCCACCTCCTAAAATATATTTGTATACATACTATCTCTATTTTAT...,True
471983,chr7,73751160,73751171,chr7:73751160-73751171,chr7:73751165:G:G:T,73751165,G,G,T,0.010665,35,chr7:73751130-73751201,71,TAAGTTTCAAGCTTTAAGACCAAAAGGGTTGACCCGCACCATGGCT...,TAAGTTTCAAGCTTTAAGACCAAAAGGGTTGACCCGCACCATGGCT...,TAAGTTTCAAGCTTTAAGACCAAAAGGGTTGACCCTCACCATGGCT...,True
1118365,chr8,72987671,72988177,chr8:72987671-72988177,chr8:72987743:G:G:T,72987743,G,G,T,0.004071,35,chr8:72987708-72987779,71,TTAATTAAAGATTACAAACAAAGCTGAAAACCTAGGAGAAGATAGG...,TTAATTAAAGATTACAAACAAAGCTGAAAACCTAGGAGAAGATAGG...,TTAATTAAAGATTACAAACAAAGCTGAAAACCTAGTAGAAGATAGG...,True


**Get sequence for full table**

In [15]:
%%time

### init
dat = dat_variant_arrange.copy()

### add sequences
fun = partial(get_variant_interval_series, num_interval_flank=NUM_INTERVAL_FLANK)
lst = ["Flank", "Interval", "Seq_Length", "Seq_Ref", "Seq_Obs", "Seq_Unobs", "Check_RefMatch"]
dat[lst] = dat.apply(fun, axis=1)

### assign and show
dat_variant_withseq_flank35 = dat
print(dat.shape)
dat.head()

(1884977, 17)
CPU times: user 3min 51s, sys: 26.3 s, total: 4min 17s
Wall time: 4min 18s


Unnamed: 0,Chrom,ChromStart,ChromEnd,Region,Variant_ID,Pos0,Ref,Obs,Unobs,Delta,Flank,Interval,Seq_Length,Seq_Ref,Seq_Obs,Seq_Unobs,Check_RefMatch
0,chr4,74487576,74488141,chr4:74487576-74488141,chr4:74487586:A:G:C,74487586,A,G,C,0.964984,35,chr4:74487551-74487622,71,ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAG...,ACATGTGTGTATGCTTGTCTATATACACTGTATGAGGCAAGAGGAG...,ACATGTGTGTATGCTTGTCTATATACACTGTATGACGCAAGAGGAG...,True
1,chr9,135685425,135685942,chr9:135685425-135685942,chr9:135685432:G:G:C,135685432,G,G,C,0.945423,35,chr9:135685397-135685468,71,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGACGCAACACCAA...,True
2,chr1,173251481,173251541,chr1:173251481-173251541,chr1:173251490:G:G:C,173251490,G,G,C,0.910127,35,chr1:173251455-173251526,71,ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...,ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...,ACATAGTACTCAATACCTATTTGGTGAATAGATGACGCAATCTCAC...,True
3,chr1,147523084,147523503,chr1:147523084-147523503,chr1:147523391:G:G:C,147523391,G,G,C,0.9088,35,chr1:147523356-147523427,71,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGACGCAATGCAAT...,True
4,chr18,54580694,54581420,chr18:54580694-54581420,chr18:54580749:A:G:T,54580749,A,G,T,0.862296,35,chr18:54580714-54580785,71,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAAGCAATGGCCA...,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAGGCAATGGCCA...,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGATGCAATGGCCA...,True


**Export results**

In [16]:
### set file directory for table with sequences
txt_fdiry = os.path.join(FD_RES, "analysis_variant_motif_richard")
txt_fname = "variant_closed_gof_bluestarr_withseq_flank35.tsv.gz"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### make sure the output folder exists
### note: to_csv does not create missing directories, so the export would
###       otherwise fail on a fresh checkout of the project
os.makedirs(txt_fdiry, exist_ok=True)

### write table with sequences
dat_variant_withseq_flank35.to_csv(txt_fpath, sep="\t", index=False, compression="gzip")

## Test function

In [17]:
def get_variant_interval(
    txt_chrom_name, 
    txt_chrom_pos0, 
    txt_allele_ref, 
    txt_allele_obs, 
    txt_allele_unobs, 
    num_interval_flank = 35,
    fasta_handle = None
):
    """
    Get reference, observed, and unobserved allele windows around a variant.

    Parameters
    ----------
    txt_chrom_name : str
        Chromosome name (e.g., "chr4").
    txt_chrom_pos0 : int
        Variant position (0-based coordinate).
    txt_allele_ref : str
        Reference allele from the table.
    txt_allele_obs : str
        Observed alternate allele.
    txt_allele_unobs : str
        Unobserved alternate allele.
    num_interval_flank : int, default=35
        Number of bases to include on each side of the variant.
    fasta_handle : object with a ``fetch(chrom, start, end)`` method, optional
        Sequence source queried with 0-based, end-exclusive coordinates.
        When omitted, the notebook-level ``fasta`` handle is used.

    Returns
    -------
    tuple of str
        (seq_ref, seq_obs, seq_unobs). The letter case of the fetched
        sequence is kept as returned by the sequence source.

    Raises
    ------
    ValueError
        If the window would start before position 0, if the fetched sequence
        is too short to contain the variant position, or if the central base
        differs (case-insensitively) from ``txt_allele_ref``.
    """
    ### sequence source: explicit handle if given, else the notebook-level one
    obj_fasta = fasta if fasta_handle is None else fasta_handle

    ### window centered at the variant (0-based, end-exclusive)
    num_window_start = txt_chrom_pos0 - num_interval_flank
    num_window_end   = txt_chrom_pos0 + num_interval_flank + 1

    ### a negative start cannot be a valid genomic coordinate
    if num_window_start < 0:
        raise ValueError(
            f"Window {txt_chrom_name}:{num_window_start}-{num_window_end} starts before "
            f"the beginning of the sequence (pos0={txt_chrom_pos0}, flank={num_interval_flank})"
        )

    ### extract reference window from FASTA
    txt_seq_ref = obj_fasta.fetch(txt_chrom_name, num_window_start, num_window_end)

    ### the variant must sit at index `num_interval_flank` of the window
    if len(txt_seq_ref) <= num_interval_flank:
        raise ValueError(
            f"Window {txt_chrom_name}:{num_window_start}-{num_window_end} returned only "
            f"{len(txt_seq_ref)} bases; position {txt_chrom_pos0} is not covered"
        )

    ### sanity check central base (case-insensitive because of soft-masking)
    txt_base_ref = txt_seq_ref[num_interval_flank]
    if txt_base_ref.upper() != txt_allele_ref.upper():
        raise ValueError(
            f"Reference mismatch at {txt_chrom_name}:{txt_chrom_pos0} "
            f"(expected {txt_allele_ref}, got {txt_base_ref})"
        )

    ### build alt windows: same flanks, central base swapped
    txt_seq_left  = txt_seq_ref[:num_interval_flank]
    txt_seq_right = txt_seq_ref[num_interval_flank+1:]
    txt_seq_obs   = txt_seq_left + txt_allele_obs   + txt_seq_right
    txt_seq_unobs = txt_seq_left + txt_allele_unobs + txt_seq_right

    return txt_seq_ref, txt_seq_obs, txt_seq_unobs

In [18]:
### first variant of the raw table (lower-case column names)
row = dat_variant_import.iloc[0]

### unpack the fields needed by the helper
txt_chrom_name = row["region"].split(":")[0]
txt_chrom_pos0 = row["pos0"]
txt_allele_ref, txt_allele_obs, txt_allele_unobs = row["ref"], row["obs"], row["unobs"]

### build the three windows around the variant
txt_seq_ref, txt_seq_obs, txt_seq_unobs = get_variant_interval(
    txt_chrom_name     = txt_chrom_name,
    txt_chrom_pos0     = txt_chrom_pos0,
    txt_allele_ref     = txt_allele_ref,
    txt_allele_obs     = txt_allele_obs,
    txt_allele_unobs   = txt_allele_unobs,
    num_interval_flank = NUM_INTERVAL_FLANK
)

### show the windows one below the other so the central base is easy to compare
print("ref   :", txt_seq_ref)
print("obs   :", txt_seq_obs)
print("unobs :", txt_seq_unobs)

ref   : acatgtgtgtatgcttgtctatataCACTGTATGAAGCAAGAGGAGAAAATACTCATGAATTCTAGGAACT
obs   : acatgtgtgtatgcttgtctatataCACTGTATGAGGCAAGAGGAGAAAATACTCATGAATTCTAGGAACT
unobs : acatgtgtgtatgcttgtctatataCACTGTATGACGCAAGAGGAGAAAATACTCATGAATTCTAGGAACT


In [21]:
%%time

dat = dat_variant_import.copy()
dat = dat.sample(10000, random_state=1)


fun = partial(get_variant_interval_series, num_interval_flank=NUM_INTERVAL_FLANK)
dat[["interval", "seq_ref", "seq_obs", "seq_unobs"]] = dat.apply(
    fun, axis=1
)

dat_variant_arrange = dat
print(dat.shape)
dat.head()

(10000, 9)
CPU times: user 2.14 s, sys: 225 ms, total: 2.36 s
Wall time: 2.38 s


Unnamed: 0,region,pos0,delta,ref,obs,unobs,seq_ref,seq_obs,seq_unobs
771970,chr4:183063682-183063779,183063712,0.006892,T,C,A,AAAGAAATACAGAATTTTTCCCATGGTCACTCTTCTGAGGCACAGT...,AAAGAAATACAGAATTTTTCCCATGGTCACTCTTCCGAGGCACAGT...,AAAGAAATACAGAATTTTTCCCATGGTCACTCTTCAGAGGCACAGT...
256899,chr7:115860432-115860716,115860564,0.015483,T,T,C,GAATAACACAGAGCTTTGAGTTCACCCAAACCCATTGTGATCAAGG...,GAATAACACAGAGCTTTGAGTTCACCCAAACCCATTGTGATCAAGG...,GAATAACACAGAGCTTTGAGTTCACCCAAACCCATCGTGATCAAGG...
1618901,chr9:116092981-116093246,116093025,0.001214,T,T,A,AAGCAGTTCTCTAAGACTCTAGTTAGTAAATTGAATATTCATTTAA...,AAGCAGTTCTCTAAGACTCTAGTTAGTAAATTGAATATTCATTTAA...,AAGCAGTTCTCTAAGACTCTAGTTAGTAAATTGAAAATTCATTTAA...
149278,chr1:152521537-152522216,152521539,0.020028,G,G,T,ttaatttaaagtagacaacagcaagtgcaaaggTTGGAAAGGTGAT...,ttaatttaaagtagacaacagcaagtgcaaaggTTGGAAAGGTGAT...,ttaatttaaagtagacaacagcaagtgcaaaggTTTGAAAGGTGAT...
201620,chr7:34886252-34886354,34886288,0.017478,A,A,T,GCTATTATCTAGTTCCTCATGATTAATGCTGTGTTAGGACTCCAGA...,GCTATTATCTAGTTCCTCATGATTAATGCTGTGTTAGGACTCCAGA...,GCTATTATCTAGTTCCTCATGATTAATGCTGTGTTTGGACTCCAGA...
