In [12]:
import bioframe
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
# Input FASTQ directories, one per experiment folder (exp4-30 and exp4-39).
# NOTE(review): "R1"/"R2" presumably mean replicate 1 / replicate 2 -- confirm.
R1FastqDir = '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30'
R2FastqDir = '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39'

In [14]:
# Short condition label -> sample name used as the FASTQ / PAF file stem.
# Insertion order is significant: `conditions` is derived from it, and the
# FASTQ path cell slices `conditions` by position ([0:6] and [6:13]).
long_names = {
    't0Mit_R1_T1': 'TI-MC3C-Dpn-t0Mit-4-30',
    't2_R1': 'TI-MC3C-Dpn-t2-4-30',
    't4DMSO_R1': 'TI-MC3C-Dpn-t4DMSO-4-30',
    't4ICRF_R1': 'TI-MC3C-Dpn-t4ICRF-4-30',
    't8DMSO_R1': 'TI-MC3C-Dpn-t8DMSO-4-30',
    't8ICRF_R1': 'TI-MC3C-Dpn-t8ICRF-4-30',
    't0Mit_R1_T2': 'TI-MC3C-Dpn-t0Mit-4-30-T2',
    't0Mit_R2': 'TI-MC3C-Dpn-t0Mit-4-39',
    't2_R2': 'TI-MC3C-Dpn-t2-4-39',
    't4DMSO_R2': 'TI-MC3C-Dpn-t4DMSO-4-39',
    't4ICRF_R2': 'TI-MC3C-Dpn-t4ICRF-4-39',
    't8DMSO_R2': 'TI-MC3C-Dpn-t8DMSO-4-39',
    't8ICRF_R2': 'TI-MC3C-Dpn-t8ICRF-4-39',
}

# Ordered list of the short labels; taken from the dict keys so the two
# collections cannot drift apart.
conditions = list(long_names)

In [15]:
# Build the condition -> FASTQ path lookup. The directory is chosen purely by
# position in `conditions`: entries 0-5 come from R1FastqDir, entries 6-12
# from R2FastqDir.
# NOTE(review): entry 6 ('t0Mit_R1_T2', sample name ending '4-30-T2') therefore
# resolves to R2FastqDir -- confirm that file really lives in the exp4-39 folder.
fastq_files = {}
for fastq_dir, batch in (
    (R1FastqDir, conditions[:6]),
    (R2FastqDir, conditions[6:13]),
):
    for cond in batch:
        fastq_files[cond] = f'{fastq_dir}/{long_names[cond]}.fastq.gz'

In [16]:
# Show the resolved FASTQ path for every condition as a visual sanity check.
fastq_files

{'t0Mit_R1_T1': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30/TI-MC3C-Dpn-t0Mit-4-30.fastq.gz',
 't2_R1': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30/TI-MC3C-Dpn-t2-4-30.fastq.gz',
 't4DMSO_R1': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30/TI-MC3C-Dpn-t4DMSO-4-30.fastq.gz',
 't4ICRF_R1': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30/TI-MC3C-Dpn-t4ICRF-4-30.fastq.gz',
 't8DMSO_R1': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30/TI-MC3C-Dpn-t8DMSO-4-30.fastq.gz',
 't8ICRF_R1': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-30/TI-MC3C-Dpn-t8ICRF-4-30.fastq.gz',
 't0Mit_R1_T2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t0Mit-4-30-T2.fastq.gz',
 't0Mit_R2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t0Mit-4-39.fastq.gz',
 't2_R2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t2-4-39.fastq.gz',
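 't4DMSO_R2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t4DMSO-4-39.fastq.gz',
 't4ICRF_R2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t4ICRF-4-39.fastq.gz',
 't8DMSO_R2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t8DMSO-4-39.fastq.gz',
 't8ICRF_R2': '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/fastq/exp4-39/TI-MC3C-Dpn-t8ICRF-4-39.fastq.gz'}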

In [17]:
# Directory holding the reference sequence; hg19.fa inside it is the file
# handed to minimap2 as the alignment target.
refDir = '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/minimap/reference'
ref_genome_file = refDir + '/hg19.fa'

In [18]:
# Directory that receives the minimap2 PAF output files.
alignmentDir = '/nl/umw_job_dekker/users/eh37w/Topo-Inhib/MC3C/minimap/alignments'

# Condition -> PAF output path, named after the sample's long name.
output_filenames = {
    cond: f'{alignmentDir}/{long_names[cond]}.minimap2.output.paf'
    for cond in conditions
}

In [22]:
# Submit one LSF job per condition: each job runs minimap2 on that condition's
# FASTQ against ref_genome_file and redirects stdout (PAF) to output_filenames[cond].
#
# minimap2 option: --secondary=no switches off output of secondary alignments.
# NOTE(review): supplementary (split-read) alignments are presumably still
# reported with this flag -- confirm against the minimap2 manual.
#
# bsub options: queue "short", 4 h run limit (-W), 3 slots on a single host
# (-n 3, span[hosts=1]), stderr/stdout logs named by job ID (%J), and an
# e-mail report to the address given with -u.
# NOTE(review): rusage[mem=8000] units depend on the cluster config -- confirm.
#
# IPython substitutes $ref_genome_file, $in_fname and $out_fname with the
# Python variables of the same name before the line is passed to the shell.
#
# Status: this has already been run and the jobs worked fine. Re-running the
# cell submits every job again, and the `>` redirect overwrites the PAF files.
# TODO: the extra columns after the standard PAF fields are presumably
# minimap2's optional tag fields (tp:, cm:, s1:, ...) -- confirm in the manual.

for cond in conditions:
    in_fname = fastq_files[cond]
    out_fname = output_filenames[cond]
    !bsub -q short -W 04:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
        -n 3 -R span[hosts=1] -R select[ib] -R rusage[mem=8000] -N -u erica.hildebrand@umassmed.edu \
        "minimap2 --secondary=no $ref_genome_file $in_fname > $out_fname"

Job <3045922> is submitted to queue <short>.
Job <3045923> is submitted to queue <short>.
Job <3045924> is submitted to queue <short>.
Job <3045925> is submitted to queue <short>.
Job <3045926> is submitted to queue <short>.
Job <3045927> is submitted to queue <short>.
Job <3045928> is submitted to queue <short>.
Job <3045929> is submitted to queue <short>.
Job <3045930> is submitted to queue <short>.
Job <3045931> is submitted to queue <short>.
Job <3045932> is submitted to queue <short>.
Job <3045933> is submitted to queue <short>.
Job <3045934> is submitted to queue <short>.


In [19]:
# Condition -> PAF output path for the run that also keeps unmapped reads;
# same directory as the other alignments, distinguished by the
# "KeepUnmapped" tag in the file name.
output_filenames_2 = {
    cond: f'{alignmentDir}/{long_names[cond]}.minimap2.KeepUnmapped.output.paf'
    for cond in conditions
}

In [21]:
# Submit one LSF job per condition, identical to the --secondary=no run except
# that minimap2 is also given --paf-no-hit and the PAF is written to
# output_filenames_2[cond].
#
# minimap2 options: --secondary=no switches off output of secondary alignments;
# --paf-no-hit additionally writes PAF records for reads with no alignment.
# Rationale: keeping the unmapped reads might help identify full walks (untested idea).
#
# bsub options: queue "short", 4 h run limit (-W), 3 slots on a single host
# (-n 3, span[hosts=1]), stderr/stdout logs named by job ID (%J), and an
# e-mail report to the address given with -u.
#
# IPython substitutes $ref_genome_file, $in_fname and $out_fname with the
# Python variables of the same name before the line is passed to the shell.
# Re-running the cell submits every job again and overwrites the PAF files.

for cond in conditions:
    in_fname = fastq_files[cond]
    out_fname = output_filenames_2[cond]
    !bsub -q short -W 04:00 -e /home/eh37w/lsf_jobs/LSB_%J.err -o /home/eh37w/lsf_jobs/LSB_%J.log \
        -n 3 -R span[hosts=1] -R select[ib] -R rusage[mem=8000] -N -u erica.hildebrand@umassmed.edu \
        "minimap2 --secondary=no --paf-no-hit $ref_genome_file $in_fname > $out_fname"

Job <3045909> is submitted to queue <short>.
Job <3045910> is submitted to queue <short>.
Job <3045911> is submitted to queue <short>.
Job <3045912> is submitted to queue <short>.
Job <3045913> is submitted to queue <short>.
Job <3045914> is submitted to queue <short>.
Job <3045915> is submitted to queue <short>.
Job <3045916> is submitted to queue <short>.
Job <3045917> is submitted to queue <short>.
Job <3045918> is submitted to queue <short>.
Job <3045919> is submitted to queue <short>.
Job <3045920> is submitted to queue <short>.
Job <3045921> is submitted to queue <short>.
