**Set environment**

In [1]:
import numpy  as np
import pandas as pd
import os

from Bio.Seq       import Seq
from Bio.SeqRecord import SeqRecord
from Bio           import SeqIO

In [2]:
### run the shared project configuration script; later cells rely on FD_RES
### being defined afterwards (presumably show_env() and the FD_* paths come
### from this script -- confirm in run_config_project.py)
%run ../run_config_project.py
### print the configured base/project directories as a quick sanity check
show_env()

BASE DIRECTORY (FD_BASE): /hpc/group/igvf/kk319
REPO DIRECTORY (FD_REPO): /hpc/group/igvf/kk319/repo
WORK DIRECTORY (FD_WORK): /hpc/group/igvf/kk319/work
DATA DIRECTORY (FD_DATA): /hpc/group/igvf/kk319/data


You are working with      IGVF BlueSTARR
PATH OF PROJECT (FD_PRJ): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR
PROJECT RESULTS (FD_RES): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results
PROJECT SCRIPTS (FD_EXE): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/scripts
PROJECT DATA    (FD_DAT): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data
PROJECT NOTE    (FD_NBK): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/notebooks
PROJECT DOCS    (FD_DOC): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/docs
PROJECT LOG     (FD_LOG): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/log
PROJECT REF     (FD_REF): /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/references



## Import data

**Check file existence**

In [3]:
### copy the Python variable FD_RES into an environment variable so that
### the %%bash cell below can refer to it as ${FD_RES}
%env FD_RES={FD_RES}

env: FD_RES=/hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results


In [4]:
%%bash
# list the analysis folder to confirm which input/output files are present
ls ${FD_RES}/analysis_variant_motif_richard

test_seq.fa
test_seq.fa.fai
test_variants.fa
test_variants.fa.fai
variant_closed_gof_bluestarr.tsv.gz
variant_closed_gof_bluestarr_withseq_flank35.tsv.gz


**Import table**

In [5]:
### locate the input table inside the project results folder
txt_fdiry = os.path.join(FD_RES, "analysis_variant_motif_richard")
txt_fname = "variant_closed_gof_bluestarr_withseq_flank35.tsv.gz"
txt_fpath = os.path.join(txt_fdiry, txt_fname)

### load the tab-separated table (read_table uses "\t" as its default separator)
dat_variant_withseq = pd.read_table(txt_fpath)

### keep the short alias `dat` used by the following cells, then preview
dat = dat_variant_withseq
print(dat.shape)
dat.head()

(1884977, 17)


Unnamed: 0,Chrom,ChromStart,ChromEnd,Region,Variant_ID,Pos0,Ref,Obs,Unobs,Delta,Flank,Interval,Seq_Length,Seq_Ref,Seq_Obs,Seq_Unobs,Check_RefMatch
0,chr4,74487576,74488141,chr4:74487576-74488141,chr4:74487586:A:G:C,74487586,A,G,C,0.964984,35,chr4:74487551-74487622,71,ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAG...,ACATGTGTGTATGCTTGTCTATATACACTGTATGAGGCAAGAGGAG...,ACATGTGTGTATGCTTGTCTATATACACTGTATGACGCAAGAGGAG...,True
1,chr9,135685425,135685942,chr9:135685425-135685942,chr9:135685432:G:G:C,135685432,G,G,C,0.945423,35,chr9:135685397-135685468,71,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...,CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGACGCAACACCAA...,True
2,chr1,173251481,173251541,chr1:173251481-173251541,chr1:173251490:G:G:C,173251490,G,G,C,0.910127,35,chr1:173251455-173251526,71,ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...,ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...,ACATAGTACTCAATACCTATTTGGTGAATAGATGACGCAATCTCAC...,True
3,chr1,147523084,147523503,chr1:147523084-147523503,chr1:147523391:G:G:C,147523391,G,G,C,0.9088,35,chr1:147523356-147523427,71,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...,ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGACGCAATGCAAT...,True
4,chr18,54580694,54581420,chr18:54580694-54581420,chr18:54580749:A:G:T,54580749,A,G,T,0.862296,35,chr18:54580714-54580785,71,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAAGCAATGGCCA...,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAGGCAATGGCCA...,TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGATGCAATGGCCA...,True


## Helper function

**Test**

In [6]:
### preview the variant IDs; these are used as the FASTA record IDs further down
dat["Variant_ID"].head()

0     chr4:74487586:A:G:C
1    chr9:135685432:G:G:C
2    chr1:173251490:G:G:C
3    chr1:147523391:G:G:C
4    chr18:54580749:A:G:T
Name: Variant_ID, dtype: object

In [7]:
### preview the reference sequences; these are written as FASTA sequences further down
dat["Seq_Ref"].head()

0    ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAG...
1    CCGGGGACACATCCCCCAGGAAACCTCTGGGCTGAGGCAACACCAA...
2    ACATAGTACTCAATACCTATTTGGTGAATAGATGAGGCAATCTCAC...
3    ATTTTTAAAAGTTCAGGTAGTTTAAAGATGGCTGAGGCAATGCAAT...
4    TGTTTAGCTAACAGAAATACTTCAGCAGAGCCTGAAGCAATGGCCA...
Name: Seq_Ref, dtype: object

In [8]:
### take the first reference sequence by position (.iloc), so the lookup
### does not depend on a row labelled 0 existing in the index
txt_seq = dat["Seq_Ref"].iloc[0]
print(txt_seq)

### wrap the plain string in a Biopython Seq object; the raw string keeps
### its own name so `seq` always refers to the Seq object
seq = Seq(txt_seq)
print(seq)
print(type(seq))

ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAGAAAATACTCATGAATTCTAGGAACT
ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAGAAAATACTCATGAATTCTAGGAACT
<class 'Bio.Seq.Seq'>


In [9]:
### pick the first ID / sequence pair by position (.iloc), so the lookup
### does not depend on a row labelled 0 existing in the index
idx     = dat["Variant_ID"].iloc[0]
txt_seq = dat["Seq_Ref"].iloc[0]

### build a single FASTA record; the raw string keeps its own name so
### `seq` always refers to the SeqRecord object
seq = SeqRecord(Seq(txt_seq), id=idx, description="")
print(seq)

ID: chr4:74487586:A:G:C
Name: <unknown name>
Number of features: 0
Seq('ACATGTGTGTATGCTTGTCTATATACACTGTATGAAGCAAGAGGAGAAAATACT...ACT')


**FASTA writer using Biopython**

In [10]:
def write_fasta_biopython(txt_fpath, vec_txt_index, vec_txt_sequence):
    """
    Write paired record IDs and sequences to a FASTA file using Biopython.

    Parameters
    ----------
    txt_fpath : str
        Path of the FASTA file to write; handed to SeqIO.write as is.
    vec_txt_index : iterable of str
        Record IDs, one per sequence.
    vec_txt_sequence : iterable of str
        Sequences, in the same order as `vec_txt_index`.

    Raises
    ------
    ValueError
        If both inputs have a length and the lengths differ.
    """
    ### zip() stops at the shorter input, which would silently drop records;
    ### fail before writing anything whenever the lengths can be compared
    if hasattr(vec_txt_index, "__len__") and hasattr(vec_txt_sequence, "__len__"):
        if len(vec_txt_index) != len(vec_txt_sequence):
            raise ValueError(
                "Length mismatch: "
                f"{len(vec_txt_index)} IDs vs {len(vec_txt_sequence)} sequences"
            )

    ### generator instead of a list: each SeqRecord is created only when
    ### SeqIO.write asks for it, so the records are never all held at once
    gen_seq_record = (
        SeqRecord(Seq(seq), id=idx, description="")
        for idx, seq in zip(vec_txt_index, vec_txt_sequence)
    )
    SeqIO.write(gen_seq_record, txt_fpath, "fasta")

## Export sequences in fasta files

**Set file paths**

In [11]:
### output folder is the same analysis folder the input table was read from
txt_fdiry = os.path.join(FD_RES, "analysis_variant_motif_richard")

### one FASTA path per allele (Ref / Obs / Unobs), all inside txt_fdiry
txt_fpath_ref, txt_fpath_obs, txt_fpath_unobs = (
    os.path.join(txt_fdiry, txt_fname_out)
    for txt_fname_out in (
        "variant_closed_gof_bluestarr_flank35_ref.fa",
        "variant_closed_gof_bluestarr_flank35_obs.fa",
        "variant_closed_gof_bluestarr_flank35_unobs.fa",
    )
)

**Export files**

In [12]:
### one (message, output path, sequence column) entry per FASTA to write
lst_export = [
    ("Wrote Ref FASTA:",   txt_fpath_ref,   "Seq_Ref"),
    ("Wrote Obs FASTA:",   txt_fpath_obs,   "Seq_Obs"),
    ("Wrote Unobs FASTA:", txt_fpath_unobs, "Seq_Unobs"),
]

### write each FASTA in turn (Ref, Obs, Unobs) and report where it went
for txt_message, txt_fpath_out, txt_column in lst_export:
    write_fasta_biopython(txt_fpath_out, dat["Variant_ID"], dat[txt_column])
    print(txt_message, txt_fpath_out)

Wrote Ref FASTA: /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results/analysis_variant_motif_richard/variant_closed_gof_bluestarr_flank35_ref.fa
Wrote Obs FASTA: /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results/analysis_variant_motif_richard/variant_closed_gof_bluestarr_flank35_obs.fa
Wrote Unobs FASTA: /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results/analysis_variant_motif_richard/variant_closed_gof_bluestarr_flank35_unobs.fa


**Double check the export**

In [13]:
def qc_fasta_against_table(txt_fpath, vec_txt_index, vec_txt_sequence, n_check=5):
    """
    Read a FASTA file back in and compare it against the original table.

    Prints the number of records read, whether that number equals
    len(vec_txt_index), and an ID / sequence comparison for the first
    `n_check` records. Nothing is returned and no exception is raised on a
    mismatch; the result is the printed report.

    Parameters
    ----------
    txt_fpath : str
        Path of the FASTA file to check.
    vec_txt_index : sequence of str
        Expected record IDs, in file order (must support len() and slicing).
    vec_txt_sequence : sequence of str
        Expected sequences, in file order (must support slicing).
    n_check : int, default 5
        Number of leading records to compare in detail (non-negative).
    """
    ### stream the FASTA: count every record but keep only the first n_check,
    ### so the whole file is never held in memory at once
    n_record = 0
    lst_head = []
    for rec in SeqIO.parse(txt_fpath, "fasta"):
        if n_record < n_check:
            lst_head.append(rec)
        n_record += 1
    print(f"Read {n_record} records from {os.path.basename(txt_fpath)}")

    # QC: same number of sequences
    if n_record != len(vec_txt_index):
        print("Length mismatch: FASTA vs DataFrame")
    else:
        print("Record count matches DataFrame")

    # QC: compare the leading records against the table, pairwise in order
    for rec, idx, seq in zip(lst_head, vec_txt_index[:n_check], vec_txt_sequence[:n_check]):
        print(f"\nID (FASTA): {rec.id}")
        print(f"ID (Table): {idx}")
        print("Match ID?   ", rec.id == idx)

        print(f"Seq length: {len(rec.seq)} vs {len(seq)}")
        # rec.seq is a Seq object; convert to str before comparing with the table value
        print("Match Seq?  ", str(rec.seq) == seq)

In [14]:
### QC the Ref FASTA against the table columns it was written from
qc_fasta_against_table(
    txt_fpath        = txt_fpath_ref,
    vec_txt_index    = dat["Variant_ID"].values,
    vec_txt_sequence = dat["Seq_Ref"].values,
)

Read 1884977 records from variant_closed_gof_bluestarr_flank35_ref.fa
Record count matches DataFrame

ID (FASTA): chr4:74487586:A:G:C
ID (Table): chr4:74487586:A:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr9:135685432:G:G:C
ID (Table): chr9:135685432:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr1:173251490:G:G:C
ID (Table): chr1:173251490:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr1:147523391:G:G:C
ID (Table): chr1:147523391:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr18:54580749:A:G:T
ID (Table): chr18:54580749:A:G:T
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True


In [15]:
### QC the Obs FASTA against the table columns it was written from
qc_fasta_against_table(
    txt_fpath        = txt_fpath_obs,
    vec_txt_index    = dat["Variant_ID"].values,
    vec_txt_sequence = dat["Seq_Obs"].values,
)

Read 1884977 records from variant_closed_gof_bluestarr_flank35_obs.fa
Record count matches DataFrame

ID (FASTA): chr4:74487586:A:G:C
ID (Table): chr4:74487586:A:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr9:135685432:G:G:C
ID (Table): chr9:135685432:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr1:173251490:G:G:C
ID (Table): chr1:173251490:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr1:147523391:G:G:C
ID (Table): chr1:147523391:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr18:54580749:A:G:T
ID (Table): chr18:54580749:A:G:T
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True


In [16]:
### QC the Unobs FASTA against the table columns it was written from
qc_fasta_against_table(
    txt_fpath        = txt_fpath_unobs,
    vec_txt_index    = dat["Variant_ID"].values,
    vec_txt_sequence = dat["Seq_Unobs"].values,
)

Read 1884977 records from variant_closed_gof_bluestarr_flank35_unobs.fa
Record count matches DataFrame

ID (FASTA): chr4:74487586:A:G:C
ID (Table): chr4:74487586:A:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr9:135685432:G:G:C
ID (Table): chr9:135685432:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr1:173251490:G:G:C
ID (Table): chr1:173251490:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr1:147523391:G:G:C
ID (Table): chr1:147523391:G:G:C
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True

ID (FASTA): chr18:54580749:A:G:T
ID (Table): chr18:54580749:A:G:T
Match ID?    True
Seq length: 71 vs 71
Match Seq?   True
