In [31]:
from __future__ import print_function
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pylab
import pandas as pd
import numpy as np
import os
import sys
import gzip
import itertools
import operator
import subprocess
import twobitreader
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pysam
import shutil

from LAM_scripts.LAM_helpersDanner.py import *


#  Pipeline1
# This, for the most part, is the UDITAS pipeline modified to allow for REPLACE targeting. 

Overview:
- Trim off the 5 nt on both sides and check for priming (decide how many nt to trim off Read 1 or Read 2).
- Need to check for priming: make it able to check priming on Read 1 or Read 2, allow mismatches, and handle input that is either trimmed or untrimmed. Pull the primer from the data sheet, or pull the primer using coordinates and get the downstream sequence depending on length.
- trims short amplicons for reads that go into the adapter or illumina primers
- local align to the plasmid without the AAV seq at all
- local align to the AAV seq without the HDR arms
- after pulling out the reads that didn't align to AAV seq or plasmid backbone, analyze the breaks

- Generate a table of expected amplicons (use the HDR sample and just replace the sequence between the breaks, if that's possible).
- Align against the reads that didn't map to the AAV or plasmid backbone,
  then look at indels and quantification.

Test pipeline 2:
- Trim up to the cut site. Run end-to-end alignment across the genome. Look for expected off-target translocations.
- Also measure, in general, the frequency at which integrations happen.

Questions:
- which side is the HDR arm on? which of the two sides should i be looking for extra hdr transcripts from
- Can I bring in some of the analysis tools from crispresso to understand the indel profiles better.



In [40]:
# Root directory of the sequencing run being analysed. Every later cell passes
# this value to the helper functions (get_csv_data, create_filename callers, ...).
# NOTE(review): absolute, machine-specific path -- change it before running elsewhere.

directory = '/home/eric/Data/Spaced_Nicking/LAM2_Miniseq_ELANE'
print(directory)

/home/eric/Data/Spaced_Nicking/LAM2_Miniseq_ELANE


In [33]:
##########        Assign the file_genome_2bit location.     ############ 
#
#   The .2bit file is needed for pulling sequence from the reference genome by
#   location (it is handed to twobitreader / create_amplicon in later cells).
#   Alternative kept for reference: take the assembly from the sample sheet.
#assembly = amplicon_info['genome']
assembly = 'hg38'
file_genome_2bit = os.path.join('/home/eric/Data/Ref_Genomes', assembly + '.2bit')
print(file_genome_2bit)

###############   BOWTIE2_INDEXES for genome alignments    ################
#
# Sets the BOWTIE2_INDEXES environment variable for this kernel.
# NOTE(review): the original hint said to check `$GENOMES_2BIT` in bash, but the
# variable set below is BOWTIE2_INDEXES -- presumably `echo $BOWTIE2_INDEXES` was meant.

%env BOWTIE2_INDEXES=/home/eric/Data/Ref_Genomes

/home/eric/Data/Ref_Genomes/hg38.2bit
env: BOWTIE2_INDEXES=/home/eric/Data/Ref_Genomes


In [4]:
# The trimmed FASTQ files are written uncompressed. An earlier attempt to gzip
# them lost the last ~24 reads.
# NOTE(review): that loss was most likely caused by the output handles never
# being closed/flushed before the files were re-read. They are now closed by the
# `with` block below, so compressing the output could be revisited.

def trimming_R1_R2(dir_sample, amplicon_info, R1trim, R2trim, pipeline):
    """Remove a fixed number of leading bases from every R1/R2 read pair.

    Reads the paired FASTQ files of one sample four lines at a time and writes
    two new, uncompressed FASTQ files in which the first ``R1trim`` (Read 1) and
    ``R2trim`` (Read 2) characters of both the sequence line and the quality
    line have been dropped.

    Parameters
    ----------
    dir_sample : directory of the run, forwarded to ``create_filename``.
    amplicon_info : row of the sample sheet; ``index_I1`` and ``index_I2`` are read.
    R1trim, R2trim : int, number of leading bases to remove from Read 1 / Read 2.
    pipeline : 1 trims the raw demultiplexed reads ('R1fastqgz'/'R2fastqgz' ->
        'R1fastq_LAM'/'R2fastq_LAM'); 2 trims the reads that did not map to the
        plasmid ('unmapped_plasmid_R?fastqgz' -> 'breaktrimmed_R?fastq').

    Raises
    ------
    ValueError : if ``pipeline`` is neither 1 nor 2.
    """
    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']

    # Pick the input/output file tags for the requested pipeline stage.
    if pipeline == 1:
        in_tags = ('R1fastqgz', 'R2fastqgz')
        out_tags = ('R1fastq_LAM', 'R2fastq_LAM')
    elif pipeline == 2:
        print('makingpipeline2 files')
        in_tags = ('unmapped_plasmid_R1fastqgz', 'unmapped_plasmid_R2fastqgz')
        out_tags = ('breaktrimmed_R1fastq', 'breaktrimmed_R2fastq')
    else:
        raise ValueError('pipeline must be 1 or 2, got {!r}'.format(pipeline))

    r1_fastq = create_filename(dir_sample, N7, N5, in_tags[0])
    r2_fastq = create_filename(dir_sample, N7, N5, in_tags[1])
    r1_clipped_fastq = create_filename(dir_sample, N7, N5, out_tags[0])
    r2_clipped_fastq = create_filename(dir_sample, N7, N5, out_tags[1])

    # itertools.izip only exists on Python 2; the builtin zip is the lazy
    # equivalent on Python 3.
    pair_lines = getattr(itertools, 'izip', zip)

    # All four handles live in one `with` so the outputs are flushed and closed
    # even if a read fails to parse.
    with open(r1_clipped_fastq, "w") as r1_out, \
            open(r2_clipped_fastq, "w") as r2_out, \
            open_fastq_or_gz(r1_fastq) as r1_file, \
            open_fastq_or_gz(r2_fastq) as r2_file:

        r1_r2 = pair_lines(r1_file, r2_file)

        # The for-loop consumes the header line of each record; the remaining
        # three lines of the record are pulled manually from the same iterator.
        for header_r1, header_r2 in r1_r2:
            seq_r1, seq_r2 = next(r1_r2)
            next(r1_r2)  # '+' separator line, not needed
            qual_r1, qual_r2 = next(r1_r2)

            seq_r1, seq_r2 = seq_r1.rstrip(), seq_r2.rstrip()
            qual_r1, qual_r2 = qual_r1.rstrip(), qual_r2.rstrip()

            # Sequence and quality must be trimmed by the same amount so they
            # stay the same length. (The R2 quality line used to be sliced from
            # the R2 *sequence*, which wrote bases where qualities belong.)
            seq_r1_use, seq_q1_use = seq_r1[R1trim:], qual_r1[R1trim:]
            seq_r2_use, seq_q2_use = seq_r2[R2trim:], qual_r2[R2trim:]

            print("\n".join([header_r1.rstrip(), seq_r1_use, "+", seq_q1_use]), file=r1_out)
            print("\n".join([header_r2.rstrip(), seq_r2_use, "+", seq_q2_use]), file=r2_out)

In [7]:
########## Remove leading spacer/adapter bases from Read 1 and Read 2  #########
# 
#    Extra nt's are added to both Read 1 and Read 2 to help spot (cluster)
#    generation on Illumina (original note: "Misha adds 4 nt's").
#    Read 1 starts with the gene-binding primer.
#    Read 2 starts with the adapter used to ligate on the universal reverse seq.
#
# NOTE(review): the header mentions 5 nt, the note above 4 nt, and the call below
# trims 3 nt from Read 1 (R1trim = 3) -- confirm which value is intended.


                    # decide on the length to trim from Read 2:
                    # 5 spacer nt + the full adapter (19 nt) = 24 nt
adapter_seq = 'GACTATAGGGCACGCGTGG'
adapt_len = len(adapter_seq)
read2trim = 5 + adapt_len
print('read2trim is :', read2trim, ' nucleotides.')


                  # run the trimming on the first 30 rows of the sample sheet
for i in range(30):
        
    amplicon_info = get_csv_data(directory, i)
    
    trimming_R1_R2(directory, amplicon_info, R1trim = 3, R2trim = read2trim, pipeline = 1)
    
    #print('done with sample', i)
    print('done with sample', i , amplicon_info['name'], amplicon_info['index_I1'],amplicon_info['index_I2'])

read2trim is : 24  nucleotides.
done with sample 0 1.1.F N701 N501
done with sample 1 1.2.F N701 N502
done with sample 2 1.3.F N701 N504
done with sample 3 2.1.F N701 N505
done with sample 4 2.2.F N701 N506
done with sample 5 2.3.F N701 N507
done with sample 6 3.1.F N701 N508
done with sample 7 3.2.F N701 N510
done with sample 8 3.3.F N702 N501
done with sample 9 12.1.F N702 N502
done with sample 10 12.2.F N702 N504
done with sample 11 12.3.F N702 N505
done with sample 12 13.1.F N702 N506
done with sample 13 13.2.F N702 N507
done with sample 14 13.3.F N702 N508
done with sample 15 1.1.R N703 N501
done with sample 16 1.2.R N703 N502
done with sample 17 1.3.R N703 N504
done with sample 18 2.1.R N703 N505
done with sample 19 2.2.R N703 N506
done with sample 20 2.3.R N703 N507
done with sample 21 3.1.R N703 N508
done with sample 22 3.2.R N703 N510
done with sample 23 3.3.R N704 N501
done with sample 24 12.1.R N704 N502
done with sample 25 12.2.R N704 N504
done with sample 26 12.3.R N704 N5

In [28]:
# Scratch cell ("ignore this"): looks up the sequence at fixed chr11 coordinates
# in the reference 2bit file and prints it together with its reverse complement.
# Nothing later in the notebook consumes `primer`.

# Idea for later: generate the primer / primer+downstream sequences automatically
# from coordinates instead of hard-coding them.
start = 5226605
end = 5226627

genome = twobitreader.TwoBitFile(file_genome_2bit)
primer = genome['chr11'][int(start):int(end)]
print(primer)
print(reverse_complement(primer))

TGTCACAGTGCAGCTCACTCAG
CTGAGTGAGCTGCACTGTGACA


### Discard Mispriming Reads

LAM uses an anchored primer and then gets rid of the background gDNA. It then uses a nested primer, so there should be a very clean product.

In [8]:
############### GOOD PRIMING Filter ##########
#
#    Assumes the leading NNNNN has already been removed from Read 1 with
#    trimming_R1_R2() (hence trimmed_R1R2 = True below).
#    The gene-specific primer is on Read 1; only Read 1 priming is checked.

# Collect the priming statistics of every sample into one table that is
# written to the 'results' folder.
results_df_all = pd.DataFrame()

results_folder = os.path.join(directory, 'results')
if not os.path.exists(results_folder):
    os.mkdir(results_folder)
results_file = os.path.join(directory, 'results','all_priming.xlsx')
    
## inputs for correct_priming2() function
mismatches =3                       # I am using the Levenshtein distance so if it is out of frame it costs 2
trimmed_R1R2 = True                 # the input file has already been trimmed
removePrimerPlusDownstream = False  # remove the primer/downstream seq if it is good (good for guideseq)
exportMismatch = True               # export the file of mismatched sequences

# Primer sequences (ALL UPPER CASE). They are the same for every sample, so they
# are defined once here instead of on every loop iteration.
# NOTE(review): the coordinate comments below (Ch11:...) do not match these
# sequences -- the chr11:5226605-5226627 lookup in the scratch cell above prints
# a different sequence -- so they look stale; confirm or remove.
ThreePrimeEnd_seq =       'CCTCGGAGCGTTGGATGATAGAGTCGATCCAGTTTACAAA'
#      Ch11:5226605:5226627                   
ThrePrimeEnd_primeronly = 'CCTCGGAGCGTTGGATGATAGAGTCGATCC'

FivePrimeEnd_seq =            'CTTCTGGGCAGGAACCGTGGGATCGCCAGCGT'
#      Ch11:5227054:5227084
FivePrimeEnd_seq_primeronly = 'CTTCTGGGCAGGAACCGTG'

for i in range(30):
        
    amplicon_info = get_csv_data(directory, i)

    direction = amplicon_info['Direction']

    if direction == 3:
        primer_seq_plus_downstream = ThreePrimeEnd_seq
        primer_seq = ThrePrimeEnd_primeronly
    elif direction == 5:
        primer_seq_plus_downstream = FivePrimeEnd_seq
        primer_seq = FivePrimeEnd_seq_primeronly
    else:
        # Without this guard the primers chosen for the previous sample would
        # silently be reused for a sample with an unknown direction.
        raise ValueError("sample {}: 'Direction' must be 3 or 5, got {!r}".format(i, direction))
    
    df_sample_results = correct_priming2(directory, amplicon_info, primer_seq, primer_seq_plus_downstream, 
                                          mismatches, trimmed_R1R2, removePrimerPlusDownstream,
                                          exportMismatch)
    # add the results to the ongoing dataframe
    # NOTE(review): DataFrame.append was removed in pandas 2.0; switch to
    # collecting the per-sample results and calling pd.concat once if upgrading.
    results_df_all = results_df_all.append(df_sample_results, ignore_index=True)
    print('done with sample', i)

#export the final table
results_df_all.to_excel(results_file)    
print(results_df_all)


done with sample 0
done with sample 1
done with sample 2
done with sample 3
done with sample 4
done with sample 5
done with sample 6
done with sample 7
done with sample 8
done with sample 9
done with sample 10
done with sample 11
done with sample 12
done with sample 13
done with sample 14
done with sample 15
done with sample 16
done with sample 17
done with sample 18
done with sample 19
done with sample 20
done with sample 21
done with sample 22
done with sample 23
done with sample 24
done with sample 25
done with sample 26
done with sample 27
done with sample 28
done with sample 29
            sample_name    i7    i5  total_reads  reads_with_good_priming  \
0         ELANE_AAV_Fwd  N701  N501        70127                    34280   
1         ELANE_AAV_Fwd  N701  N502        49097                    18503   
2         ELANE_AAV_Fwd  N701  N504        40795                    10553   
3      ELANE_Cas9WT_Fwd  N701  N505        33080                     9220   
4      ELANE_Cas9WT_Fwd  

In [10]:
def trim_short_fastq(dir_sample, amplicon_info, direction5primer, direction3primer, adapter):
    """Run cutadapt on the correctly-primed reads of one sample.

    For short amplicons the reads run through the insert into the opposite end,
    so the reverse complement of the LAM adapter is removed from the 3' end of
    Read 1 and the reverse complement of the gene-specific primer is removed
    from the 3' end of Read 2. Pairs shorter than 50 nt are discarded (-m 50).

    Parameters
    ----------
    dir_sample : directory of the run, forwarded to ``create_filename``.
    amplicon_info : row of the sample sheet; ``Direction``, ``index_I1`` and
        ``index_I2`` are read.
    direction5primer : gene-specific primer used when ``Direction`` is 5.
    direction3primer : gene-specific primer used when ``Direction`` is 3.
    adapter : LAM adapter sequence.

    Raises
    ------
    ValueError : if ``Direction`` is neither 5 nor 3.
    """
    direction = amplicon_info['Direction']

    # Read 1 carries the gene-specific primer; which one depends on the side.
    if direction == 5:
        gene_specific_primer = direction5primer
    elif direction == 3:
        gene_specific_primer = direction3primer
    else:
        raise ValueError("amplicon_info['Direction'] must be 5 or 3, got {!r}".format(direction))
    rev_gene_specific_primer = reverse_complement(gene_specific_primer)

    # LAM adapter as it appears when Read 1 reads through into it
    adapter_rev = reverse_complement(adapter)

    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']

    file_R1 = create_filename(dir_sample, N7, N5, 'R1fastq_CorrPrime')
    file_R2 = create_filename(dir_sample, N7, N5, 'R2fastq_CorrPrime')

    file_cutadapt_R1 = create_filename(dir_sample, N7, N5, 'R1trimmed')
    file_cutadapt_R2 = create_filename(dir_sample, N7, N5, 'R2trimmed')
    file_cutadapt_report = create_filename(dir_sample, N7, N5, 'trimmed_report')

    # Make sure the output folder exists (including any missing parents).
    out_dir = os.path.dirname(file_cutadapt_R1)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # remove adapters with cutadapt
    # The original UDiTaS parameter was -e 0.33, but that trimmed too much
    # unrelated sequence, so a stricter error rate is used here.
    #   -a  3' adapter for Read 1
    #   -A  3' adapter for Read 2
    #   -m  minimum read length to keep
    #   -e  maximum allowed error rate
    cutadapt_command = ['cutadapt',
                        '-m', '50',
                        '-e', '0.1',
                        '-a', adapter_rev,
                        '-A', rev_gene_specific_primer,
                        '-o', file_cutadapt_R1, '-p', file_cutadapt_R2,
                        file_R1, file_R2]

    # cutadapt prints its report on stdout; capture it in the report file and
    # make sure the handle is closed even if launching cutadapt fails.
    with open(file_cutadapt_report, 'wb') as handle_cutadapt_report:
        return_code = subprocess.call(cutadapt_command, stdout=handle_cutadapt_report)

    # subprocess.call does not raise on failure, so surface it to the user.
    if return_code != 0:
        print('WARNING: cutadapt exited with status {} for sample {} {}'.format(return_code, N7, N5))

In [11]:
### TRIMMING ####
# Trim the 3' ends of short reads: when an amplicon is too short, the read runs
# into the sequence from the other side (adapter / gene-specific primer).
#
# NOTE(review): these primer and adapter strings duplicate the values hard-coded
# in the priming-filter and R1/R2 trimming cells above; keep them in sync.

direction5primer = 'CTTCTGGGCAGGAACCGTG'
direction3primer = 'CCTCGGAGCGTTGGATGATAGAGTCGATCC' 
adapter ='GACTATAGGGCACGCGTGG'

# run cutadapt for the first 30 rows of the sample sheet
for i in range(30):
        
    amplicon_info = get_csv_data(directory, i)
    
    trim_short_fastq(directory, amplicon_info, direction5primer, direction3primer, adapter)
    
    print('done with sample', i)


done with sample 0
done with sample 1
done with sample 2
done with sample 3
done with sample 4
done with sample 5
done with sample 6
done with sample 7
done with sample 8
done with sample 9
done with sample 10
done with sample 11
done with sample 12
done with sample 13
done with sample 14
done with sample 15
done with sample 16
done with sample 17
done with sample 18
done with sample 19
done with sample 20
done with sample 21
done with sample 22
done with sample 23
done with sample 24
done with sample 25
done with sample 26
done with sample 27
done with sample 28
done with sample 29




## Need to make a new bowtie2 index that includes the targeting vector, for alignment against the whole genome, so that everything is in one index together.

### Other option would be to align it to the amplicon, extract unaligned files and then align to the genome but seems cleaner this way. 
#### Ideally every sequence is unique between the targeting vector and genome.


1. Build your fastas of interest and label .fa files.
    1. You need a fasta of hg38 (or your reference genome). You can pull this from a downloaded bowtie2 index and then use the following command to turn the index into a fasta file: bowtie2-inspect hg38 > hg38.fa   
    2. Put all the fasta files in the same folder. Should also use the transfected plasmid
2. index the files with bowtie
    1. use the command bowtie2-build -f pE049,pe038_mc.fa,hg38.fa -p hg38_plus_targetvectorandplasmid
    2. this has the -p to make it take less ram in my case.
    3. In this case it adds the hg38 and the minicircle targeting file together
    4. I have a Intel® Core™ i7-5500U CPU @ 2.40GHz × 4 with 15.1 GiB ram and it required about 13.8 gigs of ram and 2 hours to do a hg38+small fasta index
    5. Be sure to pay attention to the name of the new index ("hg38_plus_targetvector" in the example above). Add it to the sample_info.csv sheet, under the column 'genome_plus_targeting'.
    6. You can check that it indexed correctly: bowtie2-inspect -s hg38_plus_targetvector
    

    



### Making the reference alignment sequences

- plasmid sequence in .csv:  entire thing without HDR sequence (but with the ITRS) to map backbone integration
- AAV: reference has entire AAV including Homology arms
- HDR seq: The HDR reference sequence will be copied directly out of the .csv file and not altered.

### Logic for following AAV integration
1. Map locally to the plasmid without homology. This checks for plasmid integration or ITR integration.
2. Extract out mappings. Things that don't map will be aligned to amplicons. Things that do map will be aligned to the AAV file.
3. Align files that mapped to the plasmid to the complete AAV vector (they should contain ITR seq)
4. Align files that did not map to the plasmid to the amplicons


In [13]:
# Single sample for testing: build the reference files for the first row of the
# sample sheet only, and show which reaction type the helpers detect for it.

amplicon_info = get_csv_data(directory, 0)
print('N7 :', amplicon_info['index_I1'], "   N5 : ", amplicon_info['index_I2'])
# The second field is the sample description (it used to be labelled "sample name" too).
print("sample name: ", amplicon_info['name'], "   description: ", amplicon_info['description'])
print('reaction type', get_reaction_type(amplicon_info))

create_plasmid_reference(directory, amplicon_info)
# AAV reference deliberately skipped in this test cell; the full loop below builds it.
#create_AAV_reference(directory, amplicon_info)
create_amplicon(directory, amplicon_info, file_genome_2bit)

N7 : N701    N5 :  N501
sample name:  1.1.F    sample name:  ELANE_AAV_Fwd
rection type double_cut_same_chromosome_and_HDR


In [29]:
######## GENERATING REFERENCE SEQUENCES ###########
#
# MAKE SURE PLASMID DOESN'T HAVE THE HDR PORTION IN IT
# MAKE SURE AAV is the entire AAV sequence
# HDR should be the entire sequence for reference knock-in


for i in range(30):
    
    amplicon_info = get_csv_data(directory, i)
    print('N7 :', amplicon_info['index_I1'], "   N5 : ", amplicon_info['index_I2'])
    # The second field is the sample description (it used to be labelled "sample name" too).
    print("sample name: ", amplicon_info['name'], "   description: ", amplicon_info['description'])
    # Return value intentionally unused here; the single-sample cell prints it.
    get_reaction_type(amplicon_info)

    # Build the three references each sample is aligned against.
    create_plasmid_reference(directory, amplicon_info)
    create_AAV_reference(directory, amplicon_info)
    create_amplicon(directory, amplicon_info, file_genome_2bit)


N7 : N701    N5 :  N501
sample name:  1.1.F    sample name:  ELANE_AAV_Fwd
N7 : N701    N5 :  N502
sample name:  1.2.F    sample name:  ELANE_AAV_Fwd
N7 : N701    N5 :  N504
sample name:  1.3.F    sample name:  ELANE_AAV_Fwd
N7 : N701    N5 :  N505
sample name:  2.1.F    sample name:  ELANE_Cas9WT_Fwd
N7 : N701    N5 :  N506
sample name:  2.2.F    sample name:  ELANE_Cas9WT_Fwd
N7 : N701    N5 :  N507
sample name:  2.3.F    sample name:  ELANE_Cas9WT_Fwd
N7 : N701    N5 :  N508
sample name:  3.1.F    sample name:  ELANE_D10A_Fwd
N7 : N701    N5 :  N510
sample name:  3.2.F    sample name:  ELANE_D10A_Fwd
N7 : N702    N5 :  N501
sample name:  3.3.F    sample name:  ELANE_D10A_Fwd
N7 : N702    N5 :  N502
sample name:  12.1.F    sample name:  ELANE_Cas9WT_d3_Fwd
N7 : N702    N5 :  N504
sample name:  12.2.F    sample name:  ELANE_Cas9WT_d3_Fwd
N7 : N702    N5 :  N505
sample name:  12.3.F    sample name:  ELANE_Cas9WT_d3_Fwd
N7 : N702    N5 :  N506
sample name:  13.1.F    sample name:  ELANE

# Aligning reads to plasmid/AAV (local alignment)

In [4]:
######## ALIGNING LOCAL TO PLASMID FILES ###########
#
# MAKE SURE PLASMID DOESN'T HAVE THE AAV PORTION IN IT
# MAKE SURE AAV DOESN'T HAVE HDR ARMS IN IT
# NOTE(review): the markdown above and the reference-generation cell say the AAV
# reference is the *entire* AAV including the homology arms -- confirm which
# statement is current.


for i in range(30):
    
    amplicon_info = get_csv_data(directory, i)
    print('N7 :', amplicon_info['index_I1'], "   N5 : ", amplicon_info['index_I2'])
    # The second field is the sample description (it used to be labelled "sample name" too).
    print("sample name: ", amplicon_info['name'], "   description: ", amplicon_info['description'])
    
    # 1) local alignment to the plasmid, 2) split reads by whether they mapped,
    # 3) local alignment to the AAV reference.
    align_plasmid_local(directory, amplicon_info, ncpu=12)
    extract_reads_plasmid(directory, amplicon_info)
    align_AAV_local(directory, amplicon_info, ncpu=12)


N7 : N701    N5 :  N501
sample name:  1.1.F    sample name:  ELANE_AAV_Fwd
made filenames
N7 : N701    N5 :  N502
sample name:  1.2.F    sample name:  ELANE_AAV_Fwd
made filenames
N7 : N701    N5 :  N504
sample name:  1.3.F    sample name:  ELANE_AAV_Fwd
made filenames
N7 : N701    N5 :  N505
sample name:  2.1.F    sample name:  ELANE_Cas9WT_Fwd
made filenames
N7 : N701    N5 :  N506
sample name:  2.2.F    sample name:  ELANE_Cas9WT_Fwd
made filenames
N7 : N701    N5 :  N507
sample name:  2.3.F    sample name:  ELANE_Cas9WT_Fwd
made filenames
N7 : N701    N5 :  N508
sample name:  3.1.F    sample name:  ELANE_D10A_Fwd
made filenames
N7 : N701    N5 :  N510
sample name:  3.2.F    sample name:  ELANE_D10A_Fwd
made filenames
N7 : N702    N5 :  N501
sample name:  3.3.F    sample name:  ELANE_D10A_Fwd
made filenames
N7 : N702    N5 :  N502
sample name:  12.1.F    sample name:  ELANE_Cas9WT_d3_Fwd
made filenames
N7 : N702    N5 :  N504
sample name:  12.2.F    sample name:  ELANE_Cas9WT_d3_Fwd

In [28]:
######## QUANTIFYING PLASMID/AAV INTEGRATION ###########
#
# Counts local alignments per sample and writes one summary spreadsheet.

# Inputs handed to analyze_local_alignments()
min_MAPQ = 1
analysis = 'plasmid'

# Summary spreadsheet lives in <directory>/results
results_folder = os.path.join(directory, 'results')
results_file = os.path.join(results_folder, 'plasmid_AAV_integration.xlsx')
if not os.path.exists(results_folder):
    os.mkdir(results_folder)

# One row per sample is accumulated in this table
results_df_all = pd.DataFrame()

for i in range(30):
    amplicon_info = get_csv_data(directory, i)
    df_sample_results = analyze_local_alignments(
        directory, amplicon_info, min_MAPQ, analysis)
    results_df_all = results_df_all.append(df_sample_results, ignore_index=True)
    print('done with sample', i)

# Export the final table, then show it
results_df_all.to_excel(results_file)
print(results_df_all)

done with sample 0
done with sample 1
done with sample 2
done with sample 3
done with sample 4
done with sample 5
done with sample 6
done with sample 7
done with sample 8
done with sample 9
done with sample 10
done with sample 11
done with sample 12
done with sample 13
done with sample 14
done with sample 15
done with sample 16
done with sample 17
done with sample 18
done with sample 19
done with sample 20
done with sample 21
done with sample 22
done with sample 23
done with sample 24
done with sample 25
done with sample 26
done with sample 27
done with sample 28
done with sample 29
   sample_name    i7    i5  number_plasmid_alignments
0        1.1.F  N701  N501                          0
1        1.2.F  N701  N502                          0
2        1.3.F  N701  N504                          0
3        2.1.F  N701  N505                          8
4        2.2.F  N701  N506                         21
5        2.3.F  N701  N507                         28
6        3.1.F  N701  N508      

## Align filtered reads to amplicons (end-to-end)

In [52]:
############################
#
# Aligns reads globally to the amplicons ("end-to-end", the default mode of
# bowtie2), then converts, sorts and indexes the alignment with samtools.
#
# ##########################
def align_ampliconLAM(dir_sample, amplicon_info, check_plasmid_insertions, paired, ncpu=4):
    """Align one sample's reads end-to-end against its amplicon index.

    Parameters
    ----------
    dir_sample : directory being analysed, forwarded to ``create_filename``.
    amplicon_info : row of sample_info.csv for the sample being processed;
        ``index_I1``, ``index_I2`` and ``plasmid_sequence`` are read.
    check_plasmid_insertions : when truthy (== 1) and the sample has a plasmid
        sequence, the reads that did not map to the plasmid are used as input;
        otherwise the cutadapt-trimmed reads are used.
    paired : True aligns Read 1 and Read 2 as pairs, anything else aligns
        Read 1 only.
    ncpu : number of bowtie2 threads.

    Side effects: writes the SAM file, the bowtie2 report, and a sorted, indexed
    BAM file; the unsorted intermediate BAM is deleted. The working directory is
    changed to the amplicon folder while the tools run and is always restored.
    """
    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']

    # A missing plasmid sequence is not a string (presumably NaN from the sheet).
    # `unicode` only exists on Python 2, so build the accepted types defensively.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)
    has_plasmid = isinstance(amplicon_info['plasmid_sequence'], string_types)

    if check_plasmid_insertions == 1 and has_plasmid:
        file_R1 = create_filename(dir_sample, N7, N5, 'unmapped_plasmid_R1fastqgz')
        file_R2 = create_filename(dir_sample, N7, N5, 'unmapped_plasmid_R2fastqgz')
    else:
        file_R1 = create_filename(dir_sample, N7, N5, 'R1trimmed')
        file_R2 = create_filename(dir_sample, N7, N5, 'R2trimmed')

    file_sam_amplicons = create_filename(dir_sample, N7, N5, 'sam_amplicons')
    file_sam_report_amplicons = create_filename(dir_sample, N7, N5, 'sam_report_amplicons')
    file_bam_amplicons = create_filename(dir_sample, N7, N5, 'bam_amplicons')
    file_sorted_bam_amplicons = create_filename(dir_sample, N7, N5, 'sorted_bam_amplicons')

    # Same three folders the original created one by one.
    for needed_file in (file_R1, file_sam_amplicons, file_bam_amplicons):
        needed_dir = os.path.dirname(needed_file)
        if not os.path.exists(needed_dir):
            os.mkdir(needed_dir)

    def run_tool(command, **streams):
        # subprocess.call never raises on a non-zero exit status, so report it.
        status = subprocess.call(command, **streams)
        if status != 0:
            print('WARNING: {} exited with status {}'.format(command[0], status))
        return status

    if paired == True:
        bowtie2_command = ['bowtie2', '-p', str(ncpu), '--very-sensitive',
                           '-X', '5000', '-k', '3', '-x', 'amplicons',
                           '-1', file_R1, '-2', file_R2,
                           '-S', file_sam_amplicons]
    else:
        bowtie2_command = ['bowtie2', '--very-sensitive', '-p', str(ncpu),
                           '-X', '5000', '-k', '3', '-x', 'amplicons',
                           '-U', file_R1, '-S', file_sam_amplicons]

    # bowtie2 is given the bare index name 'amplicons', so it has to run from
    # inside the folder that holds the index.
    initial_dir = os.getcwd()
    folder_amplicons = create_filename(dir_sample, N7, N5, 'amplicons')
    os.chdir(folder_amplicons)
    try:
        # bowtie2 writes its alignment summary to stderr
        with open(file_sam_report_amplicons, 'wb') as handle_sam_report_amplicons:
            run_tool(bowtie2_command, stderr=handle_sam_report_amplicons)

        # convert sam to bam; the handle is closed before samtools sort reads it
        with open(file_bam_amplicons, 'wb') as handle_file_bam_amplicons:
            run_tool(['samtools', 'view', '-Sb', file_sam_amplicons],
                     stdout=handle_file_bam_amplicons)

        # sort bam files
        run_tool(['samtools', 'sort', file_bam_amplicons, '-o', file_sorted_bam_amplicons])

        # Clean up (the SAM file is kept on purpose)
        #os.remove(file_sam_amplicons)
        os.remove(file_bam_amplicons)

        # Create bam index files
        run_tool(['samtools', 'index', file_sorted_bam_amplicons])
    finally:
        # Restore the working directory even if one of the steps raised.
        os.chdir(initial_dir)
 
    

In [53]:
######## Aligning reads to amplicons ###########
#
# 
#  Goes through the samples and aligns each one to its amplicons.
#  check_plasmid_insertions: align the reads from the fastq that had
#                            plasmid-mapping reads removed
#  paired: use paired ends for alignment, or just Read 1 (gene-specific primer)
print(directory)

for i in range(30):  # first 30 rows of the sample sheet
    
    
    amplicon_info = get_csv_data(directory, i)
    align_ampliconLAM(directory, amplicon_info, check_plasmid_insertions = True, paired = True, ncpu = 14)
    
    
    

/home/eric/Data/Spaced_Nicking/LAM2_Miniseq_ELANE


In [9]:
################################################################################
# Extract all reads that did not map to the amplicons.
# Third argument: 'single' or 'paired'
################################################################################
#  THIS IS USED FOR PIPELINE 2 WHERE WE ALIGN THEM TO THE REST OF THE GENOME
#
# NOTE(review): the recorded output of this cell is
#   NameError: name 'extract_unmapped_reads_amplicons_LAM' is not defined
# so the helper is not provided by the star-import at the top of the notebook.
# Define/import it (or remove this cell) before a Restart & Run All.

for i in range(30):
    
    amplicon_info = get_csv_data(directory, i)    
    extract_unmapped_reads_amplicons_LAM(directory, amplicon_info, 'paired')

NameError: name 'extract_unmapped_reads_amplicons_LAM' is not defined

# Analyze the aligned amplicon reads

In [54]:
################### ANALYZING THE ALIGNMENTS ###############
#
#          INDELS AND QUANTIFICATION

# Inputs handed to analyze_alignments_LAM()
min_AS = -180
min_MAPQ = 5
amplicon_window_around_cut = 1000
window_size = 15

# Summary spreadsheet lives in <directory>/results
results_folder = os.path.join(directory, 'results')
results_file = os.path.join(results_folder, 'all_amplicon_counts.xlsx')
if not os.path.exists(results_folder):
    os.mkdir(results_folder)

# One row per sample is accumulated in this table
results_df_all = pd.DataFrame()

for i in range(30):
    amplicon_info = get_csv_data(directory, i)

    # Kept under this name: a later scratch cell prints result_amplicon_df.
    result_amplicon_df = analyze_alignments_LAM(
        directory, amplicon_info, window_size,
        amplicon_window_around_cut, min_MAPQ, min_AS)

    results_df_all = results_df_all.append(result_amplicon_df, ignore_index=True)
    print('done with sample', i,
          amplicon_info['description'],
          amplicon_info['index_I1'],
          amplicon_info['index_I2'])

# Export the final table, then show it
results_df_all.to_excel(results_file)
print(results_df_all)
    
    


Allreads is :  62208
HDRcount is :  1978
done with sample 0 ELANE_AAV_Fwd N701 N501
Allreads is :  32798
HDRcount is :  814
done with sample 1 ELANE_AAV_Fwd N701 N502
Allreads is :  17949
HDRcount is :  329
done with sample 2 ELANE_AAV_Fwd N701 N504
Allreads is :  14974
HDRcount is :  8405
done with sample 3 ELANE_Cas9WT_Fwd N701 N505
Allreads is :  23336
HDRcount is :  13179
done with sample 4 ELANE_Cas9WT_Fwd N701 N506
Allreads is :  45874
HDRcount is :  26885
done with sample 5 ELANE_Cas9WT_Fwd N701 N507
Allreads is :  74285
HDRcount is :  27734
done with sample 6 ELANE_D10A_Fwd N701 N508
Allreads is :  108044
HDRcount is :  36474
done with sample 7 ELANE_D10A_Fwd N701 N510
Allreads is :  89362
HDRcount is :  34451
done with sample 8 ELANE_D10A_Fwd N702 N501
Allreads is :  5269
HDRcount is :  317
done with sample 9 ELANE_Cas9WT_d3_Fwd N702 N502
Allreads is :  23201
HDRcount is :  21935
done with sample 10 ELANE_Cas9WT_d3_Fwd N702 N504
Allreads is :  11389
HDRcount is :  10877
done w

In [18]:
# Scratch cell: resets the summary table to an empty DataFrame.
# NOTE(review): running this discards the results accumulated by the analysis
# cell above; it looks like leftover debugging and is a candidate for removal.
results_df_all = pd.DataFrame()
print(results_df_all)

Empty DataFrame
Columns: []
Index: []


In [20]:
# Scratch cell: load and display the sample-sheet row with index 1 for inspection.
# This overwrites the `amplicon_info` left behind by the previous loop.
amplicon_info = get_csv_data(directory, 1) 
print(amplicon_info)


NGS_req-ID                                                             NaN
name                                                                 1.2.F
Sample                                                                 NaN
description                                                  ELANE_AAV_Fwd
Direction                                                                5
Control sample (Y/N)                                                     Y
Notes                                                                  NaN
Dilution                                                               NaN
Cell name_type                                                 human CD34+
UMI_Len                                                                NaN
IndexI7Primer                                                          NaN
I7_Index_ID                                           P7_N701_SBS12nextera
index_I1                                                              N701
barcode_I1               

In [26]:
# Scratch cell: same analysis parameters as the full analysis cell above.
window_size = 15
amplicon_window_around_cut = 1000
min_MAPQ = 5
min_AS = -180
# NOTE(review): the call is commented out, so the print below shows whatever
# `result_amplicon_df` is still in kernel memory from an earlier execution
# (hidden state) -- on a fresh kernel it depends on the analysis cell having run.
#result_amplicon_df = analyze_alignments_LAM(directory, amplicon_info, window_size, amplicon_window_around_cut, min_MAPQ, min_AS)
print(result_amplicon_df)

  sample_name    i7    i5 sample_descript  Total_mapped_reads  \
0       1.2.F  N701  N502   ELANE_AAV_Fwd               32799   

   wt_cut1_total_reads  wt_cut1_total_indels  wt_cut1_total_deletions  \
0                16080                     7                        7   

   wt_cut1_total_insertions  wt_cut2_total_reads  ...  1a_1a_cut1_total_reads  \
0                         0                 1457  ...                       0   

   1a_1a_cut1_total_indels  1a_1a_cut1_total_deletions  \
0                        0                           0   

   1a_1a_cut1_total_insertions  2b_2b_cut1_total_reads  \
0                            0                       0   

   2b_2b_cut1_total_indels  2b_2b_cut1_total_deletions  \
0                        0                           0   

   2b_2b_cut1_total_insertions  HDR_total_reads  median_fragment_size  
0                            0              814                 260.0  

[1 rows x 35 columns]
