# partitioning data

In [None]:
%%bash

# Partition data/K562.tsv with the project script; results go under
# data/partitioned. (-p 10 is presumably the number of partitions -- confirm
# against the script's argument parser.)
python src/partition_dataset.py \
    -i data/K562.tsv \
    -p 10 \
    -o data/partitioned

# calculate alignment scores between a pair of partitions

In [None]:
%%bash

# Directory that receives the pairwise alignment output; it is created before
# the alignment script is asked to write there.
DISTANCES_DIR=data/batch_alignments
mkdir -p "$DISTANCES_DIR"

# Align partition 001 against partition 002. The output path is built from
# DISTANCES_DIR so it cannot drift from the directory created above.
python src/batch_sequence_alignment.py \
    -i data/partitioned/K562.part_001.fa.gz \
    -j data/partitioned/K562.part_002.fa.gz \
    -o "$DISTANCES_DIR/K562.part_001.part_002.txt.gz"

# submit an array job that computes the alignment scores for different partitions of the dataset

##### A skeleton of that script is provided below

In [None]:
%%bash

# Skeleton of an array job: each task ID selects one line of FASTA_FILEPATHS.
# A line has three comma-separated fields -- output file name, first FASTA
# path, second FASTA path -- and the two FASTA files are aligned against each
# other by src/batch_sequence_alignment.py.
FASTA_FILEPATHS=data/partitioned/filepaths.txt
DISTANCES_DIR=data/batch_alignments

# Fail early instead of launching the aligner with empty arguments.
if [[ ! -r "$FASTA_FILEPATHS" ]]; then
    echo "Cannot read ${FASTA_FILEPATHS}" >&2
    exit 1
fi

# Line count of the listing; only used in the diagnostic below.
# NOTE(review): presumably also the upper bound for a real array submission
# -- confirm.
N_PARTITIONS=$( wc -l < "$FASTA_FILEPATHS" )

# Make sure the output directory exists even if this cell is run on its own.
mkdir -p "$DISTANCES_DIR"

# The loop stands in for the scheduler: it assigns the task IDs 1..6 itself.
for SLURM_ARRAY_TASK_ID in {1..6}; do
    PAIRWISE_INFO=$( sed -n "${SLURM_ARRAY_TASK_ID}p" "$FASTA_FILEPATHS" )

    # A task ID past the end of the listing (or a blank line) has nothing to do.
    if [[ -z "$PAIRWISE_INFO" ]]; then
        echo "No entry on line ${SLURM_ARRAY_TASK_ID} of ${FASTA_FILEPATHS} (${N_PARTITIONS} lines); skipping" >&2
        continue
    fi

    # Split the line on commas in one step; any extra fields land in "_".
    IFS=, read -r FILENAME FASTA_PATH_i FASTA_PATH_j _ <<< "$PAIRWISE_INFO"
    OUTPATH=$DISTANCES_DIR/${FILENAME}

    echo "Fasta path i: ${FASTA_PATH_i}"
    echo "Fasta path j: ${FASTA_PATH_j}"
    echo "Output path:  ${OUTPATH}"

    python src/batch_sequence_alignment.py \
        -i "$FASTA_PATH_i" \
        -j "$FASTA_PATH_j" \
        -o "$OUTPATH"
done

# merging the computations to create the full matrix

In [None]:
%%bash

# Merge the per-pair alignment outputs into the full matrix file.
MATRIX_ARGS=(
    -i data/partitioned
    -d data/batch_alignments
    -m smith_waterman
    -o data/smith_waterman.darray.pbz2
)
python src/batch_matrix_construction.py "${MATRIX_ARGS[@]}"

# read the all by all matrix

In [None]:
import bz2
import pickle

In [None]:
# Load the bz2-compressed pickle holding the all-by-all matrix.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this pipeline.
matrix_path = "data/smith_waterman.darray.pbz2"
with bz2.open(matrix_path, "rb") as compressed:
    distarray = pickle.load(compressed)

In [None]:
# Read the sequence-id file into a list, one entry per line (no line endings).
with open("data/smith_waterman.seqids.txt", "r") as seqid_file:
    seqid_text = seqid_file.read()
idmap = seqid_text.splitlines()

# create chromosomal splits

In [None]:
%%bash

# Create chromosomal splits of data/K562.tsv; outputs are written under data/.
# (-tr presumably lists the training chromosomes -- confirm against the script.)
python src/create_chromosomal_splits.py \
    -i data/K562.tsv \
    -tr data/train_chroms_0.txt \
    -o data/

# Calculate similarity statistics between two sets of sequences

In [None]:
%%bash

# Similarity statistics between the train and test index sets, using the
# all-by-all matrix and its sequence-id file.
SW_MATRIX='data/smith_waterman.darray.pbz2'
SEQ_IDS='data/smith_waterman.seqids.txt'

python src/calculate_similarity_stats.py \
    -m "$SW_MATRIX" -s "$SEQ_IDS" \
    -tr 'data/train_indices.txt' -te 'data/test_indices.txt' \
    -o 'sw_stats'

# find all sequence pairs that have similarity above a threshold

In [None]:
%%bash

# List every sequence pair whose similarity exceeds the threshold
# (-t 10 is presumably that threshold -- confirm against the script).
PAIR_ARGS=(
    -m data/smith_waterman.darray.pbz2
    -s data/smith_waterman.seqids.txt
    -t 10
    -o data/
)
python src/find_all_similar_seq_pairs_above_threshold.py "${PAIR_ARGS[@]}"

# find top N matches between training and test sequences

In [None]:
%%bash

# Top-k similar sequences from the all-by-all matrix (k = 10); the result is
# written to data/top_10.txt.
python src/find_top_k_similar_seqs.py \
    -m data/smith_waterman.darray.pbz2 \
    -s data/smith_waterman.seqids.txt \
    -k 10 \
    -o data/top_10.txt

# OverfitNN

In [None]:
%%bash

# Run OverfitNN on the top-10 matches file with the train/test sequence files.
# NOTE(review): the meaning of -mx/-mn/-bw/-p/-n is not visible here; see the
# script's argument parser.
OVERFIT_ARGS=(
    -i  data/top_10.txt
    -mx 30
    -mn 10
    -bw 10
    -p  0.05
    -n  1
    -tr data/train.txt
    -te data/test.txt
)
python src/overfitNN.py "${OVERFIT_ARGS[@]}"

# calculate sequence alignment between two datasets (from scratch)

In [None]:
%%bash

# Alignment between data/train.txt and data/test.txt computed from scratch,
# with -s set to "genomic"; the result is saved to data/sw.npy.
python src/calculate_similarity_between_two_sets.py \
    -set1 'data/train.txt' -set2 'data/test.txt' \
    -t 12 -s 'genomic' \
    -o 'data/sw.npy'

# selecting a threshold

##### calculate the similarity between two sets of genomic sequences and between unrelated sequences, then compare the maximum SW scores the sequences have across the sets to decide on a threshold

In [None]:
%%bash

# Same comparison as the "genomic" run, but with -s set to
# "dinucleotide_shuffle"; the result is saved to data/sw_shuffled.npy.
TRAIN_SET='data/train.txt'
TEST_SET='data/test.txt'

python src/calculate_similarity_between_two_sets.py \
    -set1 "$TRAIN_SET" \
    -set2 "$TEST_SET" \
    -t 12 \
    -s 'dinucleotide_shuffle' \
    -o 'data/sw_shuffled.npy'

# calculate maximum sequence alignment between two sets (from an all by all matrix)

In [None]:
%%bash

# Maximum alignment between the train and test index sets, looked up in the
# precomputed all-by-all matrix; the result is saved to data/max_sw.npy.
python src/calculate_max_alignment_between_two_sets.py \
    -m data/smith_waterman.darray.pbz2 \
    -set1 data/train_indices.txt \
    -set2 data/test_indices.txt \
    -o data/max_sw.npy

# clustermap

In [None]:
%%bash

# Clustermap over the train/test index sets from the all-by-all matrix;
# output goes under data/.
# NOTE(review): the meaning of -mx 20 / -mn 15 is not visible here; see the
# script's argument parser.
CLUSTERMAP_ARGS=(
    -m    data/smith_waterman.darray.pbz2
    -s    data/smith_waterman.seqids.txt
    -set1 data/train_indices.txt
    -set2 data/test_indices.txt
    -mx   20
    -mn   15
    -o    data/
)
python src/clustermap.py "${CLUSTERMAP_ARGS[@]}"