In [2]:
import os

from pathlib import Path
from Bio import SeqIO
from tqdm import tqdm

# Section 0: Setup

In [3]:
# Every location used by the workflow is derived from the dataset directory
# and the FASTA file name; all values are kept as plain strings.
data_dir = "/home/brett/work/OrthogonalTrainValSplits/hashFrag/data"
fasta_path = os.path.join(data_dir, "K562.sample_10000.fa.gz")

# Dataset label: the FASTA file name with ".fa.gz" removed.
label = os.path.basename(fasta_path).replace(".fa.gz", "")
score_path = os.path.join(data_dir, label + ".pairwise_scores.csv.gz")
work_dir = os.path.join(data_dir, label + ".hashFrag.work")

# Create the working directory (and any missing parents), then report it.
os.makedirs(work_dir, exist_ok=True)
print(work_dir)

# Directory that holds the partitioned BLAST output, nested in work_dir.
blast_dir = os.path.join(work_dir, "blast_partitions")
os.makedirs(blast_dir, exist_ok=True)

/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work


## Section 0.1: Computing all pairwise comparisons

See the job script. **TODO:** describe (or link to) the job script that computes all pairwise comparisons.

# Section 1: Identifying candidate similar sequences

In [6]:
%%bash
# Build the BLAST database from the FASTA file and run BLASTn through hashFrag;
# both results are written into WORK_DIR.
set -euo pipefail  # abort on the first error or on use of an unset variable

FASTA_PATH=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.fa.gz
WORK_DIR=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work

WORD_SIZE=7
# NOTE(review): MAX_TARGET_SEQS is assigned but never passed to `hashFrag blastn`
# below, so it currently has no effect. Confirm whether the command should
# receive a corresponding option.
MAX_TARGET_SEQS=10000 # size of dataset
E_VALUE=100
DUST=no

# Quote every expansion so each value reaches hashFrag as exactly one argument,
# even if a path ever contains whitespace or glob characters.
hashFrag blastn -f "$FASTA_PATH" -w "$WORD_SIZE" -e "$E_VALUE" -d "$DUST" -o "$WORK_DIR"



Building a new DB, current time: 12/10/2024 08:00:47
New DB name:   /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/K562.sample_10000.blastdb
New DB title:  K562.sample_10000
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 10000 sequences in 0.260447 seconds.



BLAST DataBase construction finished and written to: /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/K562.sample_10000.blastdb

BLASTn process finished and written to: /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/K562.sample_10000.blastn.out


# Section 2: Computing Smith-Waterman alignment scores for candidate pairs

In [9]:
%%bash
# Split the BLASTn output file into line-based partitions inside BLAST_DIR.
set -euo pipefail  # stop immediately if mkdir, cd or split fails

SPLIT_SIZE=100000  # number of lines per partition file (split -l)
BLAST_PATH=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/K562.sample_10000.blastn.out
BLAST_DIR=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions

# Partition prefix: the BLASTn file name without its ".out" suffix.
LABEL=$( basename -s ".out" "$BLAST_PATH" )

mkdir -p "$BLAST_DIR"
# With `set -e`, a failed cd aborts the cell. Without it, split would write the
# partitions into whatever directory the cell happened to start in.
cd "$BLAST_DIR"

# -a 4 produces four-letter suffixes (aaaa, aaab, ...).
split -l "$SPLIT_SIZE" -a 4 "$BLAST_PATH" "${LABEL}.partition_"

In [8]:
%%bash
# Run compute_blast_candidate_SW_scores.sh once per BLAST partition file.
set -euo pipefail  # a failing partition aborts the cell instead of being silently skipped

# The scoring script is invoked by its bare file name, so the working directory
# matters. The relative path is resolved from the directory the cell starts in.
cd ../src/external

FASTA_PATH=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.fa.gz
PARTITIONED_BLAST_DIR=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions

# nullglob makes an unmatched pattern expand to nothing, so a missing or empty
# partition directory is detected here rather than being passed to the script
# as a literal "*.blastn.partition_*" path.
# NOTE(review): if the scoring script writes its outputs into this directory
# under names that also match this pattern, a re-run would feed those outputs
# back in as inputs. Verify the output naming.
shopt -s nullglob
PARTITIONED_BLAST_PATHS=( "$PARTITIONED_BLAST_DIR"/*.blastn.partition_* )
if [ "${#PARTITIONED_BLAST_PATHS[@]}" -eq 0 ]; then
    echo "No BLAST partitions found in $PARTITIONED_BLAST_DIR" >&2
    exit 1
fi

for PARTITIONED_BLAST_PATH in "${PARTITIONED_BLAST_PATHS[@]}"
do
    echo "$PARTITIONED_BLAST_PATH"
    bash compute_blast_candidate_SW_scores.sh "$FASTA_PATH" "$PARTITIONED_BLAST_PATH"
done

/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.partition_aaaa
/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.partition_aaab
/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.partition_aaac
/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.partition_aaad
/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.partition_aaae
/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.partition_aaaf
/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blast_partitions/K562.sample_10000.blastn.

# Section 3: Filter false positives based on a defined threshold

In [17]:
%%bash
# Run hashFrag's false-positive filter over the partition directory with a
# fixed threshold; the filtered results are written into the work directory.

results_dir=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work
partitions_dir="${results_dir}/blast_partitions"
threshold=60

# The same directory is passed for both the score (-s) and BLAST (-b) inputs.
hashFrag filter_false_positives \
    -s "$partitions_dir" \
    -b "$partitions_dir" \
    -t "$threshold" \
    -o "$results_dir"

Filtered results written to: /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/blastn_results.filtered_candidates.tsv.gz


# Section 4: Determine groups of homology

In [22]:
%%bash
# Identify homologous groups from the filtered candidate hits and write them
# to a CSV file in the work directory.

work_root=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work
filtered_hits="${work_root}/blastn_results.filtered_candidates.tsv.gz"
groups_csv="${work_root}/homologous_groups.csv"

hashFrag identify_homologous_groups \
    -i "$filtered_hits" \
    -o "$groups_csv"

1138 sequences exhibiting homology.
92 homologous groups identified.
Homologous groups written to file: /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/homologous_groups.csv


# Section 5: Create orthogonal data split

In [40]:
%%bash
# Create the data splits from the FASTA file and the homologous-groups CSV;
# the split files are written into the work directory.

data_root=/home/brett/work/OrthogonalTrainValSplits/hashFrag/data
fasta_file="${data_root}/K562.sample_10000.fa.gz"
split_dir="${data_root}/K562.sample_10000.hashFrag.work"
groups_csv="${split_dir}/homologous_groups.csv"

# NOTE(review): -n 10 is presumably the number of splits to generate; confirm
# against the hashFrag CLI help.
hashFrag create_orthogonal_splits \
    -f "$fasta_file" \
    -i "$groups_csv" \
    -n 10 \
    -o "$split_dir"

Writing splits...
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_001.csv.gz
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_002.csv.gz
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_003.csv.gz
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_004.csv.gz
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_005.csv.gz
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_006.csv.gz
  /home/brett/work/OrthogonalTrainValSplits/hashFrag/data/K562.sample_10000.hashFrag.work/hashFrag.train_8000.test_2000.split_007.csv.gz
  /home/brett/work/Orth