In [None]:
#Use minimap_conda_env.yml to create the conda environment for this script.
#This is an example script for mapping MC-3C data using minimap2.
#It is run from within the 'scripts' subdirectory, using the following directory structure:
#Analysis_Dir
#├── data
#    ├── permutations
#├── alignments
#├── figures
#├── scripts
#├── lsf_jobs

In [None]:
import bioframe
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Directory that holds the MC-3C FASTQ files.
# TODO: replace this placeholder with the real path before running the notebook.
# (The assignment previously had no right-hand side, which is a SyntaxError.)
R1FastqDir = 'path_to_mc3c_fastq_files'
# Root of the analysis directory; the data, alignments and reference paths
# in the later cells are all built from it.
outDataDir = '../'

In [None]:
# Short condition label -> long sample name. The long name is the stem of the
# FASTQ input and of the PAF output file for that condition.
long_names = {
    # replicate 1
    't0Mit_R1_T1': 'TI-MC3C-Dpn-t0Mit-4-30',
    't2_R1': 'TI-MC3C-Dpn-t2-4-30',
    't4DMSO_R1': 'TI-MC3C-Dpn-t4DMSO-4-30',
    't4ICRF_R1': 'TI-MC3C-Dpn-t4ICRF-4-30',
    't8DMSO_R1': 'TI-MC3C-Dpn-t8DMSO-4-30',
    't8ICRF_R1': 'TI-MC3C-Dpn-t8ICRF-4-30',
    't0Mit_R1_T2': 'TI-MC3C-Dpn-t0Mit-4-30-T2',
    # replicate 2
    't0Mit_R2': 'TI-MC3C-Dpn-t0Mit-4-39',
    't2_R2': 'TI-MC3C-Dpn-t2-4-39',
    't4DMSO_R2': 'TI-MC3C-Dpn-t4DMSO-4-39',
    't4ICRF_R2': 'TI-MC3C-Dpn-t4ICRF-4-39',
    't8DMSO_R2': 'TI-MC3C-Dpn-t8DMSO-4-39',
    't8ICRF_R2': 'TI-MC3C-Dpn-t8ICRF-4-39',
    # replicate 3
    't0Mit_R3': 'TI-MC3C-Dpn-t0Mit-R3-5-14',
    't2_R3': 'TI-MC3C-Dpn-t2-R3-5-14',
    't4DMSO_R3': 'TI-MC3C-Dpn-t4DMSO-R3-5-14',
    't4ICRF_R3': 'TI-MC3C-Dpn-t4ICRF-R3-5-14',
    't8DMSO_R3': 'TI-MC3C-Dpn-t8DMSO-R3-5-14',
    't8ICRF_R3': 'TI-MC3C-Dpn-t8ICRF-R3-5-14',
}

# Ordered list of the short labels; dicts keep insertion order, so this is
# exactly the key order written above.
conditions = list(long_names)

In [None]:
# Path of the gzipped FASTQ file for every condition, keyed by short label.
fastq_files = {
    label: f'{R1FastqDir}/{long_names[label]}.fastq.gz'
    for label in conditions
}

In [None]:
# Reference genome FASTA, kept in the 'data' directory.
# Source: https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/
refDir = f'{outDataDir}/data'
ref_genome_file = f'{refDir}/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz'

In [None]:
# Path of the minimap2 PAF output for every condition, keyed by short label.
alignmentDir = f'{outDataDir}/alignments'
output_filenames = {
    label: f'{alignmentDir}/{long_names[label]}.hg38.minimap2.output.paf'
    for label in conditions
}

In [None]:
# Submit one LSF job per condition. Each job runs minimap2 on that condition's
# FASTQ file against the reference genome and redirects the output to the
# condition's .paf file.
#
# minimap2 is called with --secondary=no so that only primary alignments are
# reported.
#
# IPython substitutes $ref_genome_file, $in_fname and $out_fname with the values
# of the Python variables before the command is handed to the shell; the
# trailing backslashes continue the single `!bsub` command across lines.
#
# bsub options used below:
#   -q short              submit to the 'short' queue
#   -W 04:00              run-time limit
#   -e / -o               error and output logs in ../lsf_jobs (%J = LSF job ID)
#   -n 3                  request 3 job slots
#   -R span[hosts=1]      keep all slots on one host
#   -R select[ib]         only hosts with the 'ib' resource
#                         (presumably InfiniBand -- confirm with the cluster docs)
#   -R rusage[mem=8000]   memory reservation; the unit depends on the LSF
#                         configuration of the cluster

for cond in conditions:
    in_fname = fastq_files[cond]
    out_fname = output_filenames[cond]
    !bsub -q short -W 04:00 -e ../lsf_jobs/LSB_%J.err -o ../lsf_jobs/LSB_%J.log \
        -n 3 -R span[hosts=1] -R select[ib] -R rusage[mem=8000]\
        "minimap2 --secondary=no $ref_genome_file $in_fname > $out_fname"