In [1]:
from __future__ import print_function
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pylab
import pandas as pd
import numpy as np
import os
import sys
import gzip
import itertools
import operator
import subprocess
import twobitreader
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pysam
import shutil
from fastDamerauLevenshtein import damerauLevenshtein


from LAM_scripts.LAM_helpersDanner.py import *


#  Pipeline1
# This, for the most part, is the UDITAS pipeline modified to allow for REPLACE targeting. 

Overview:
- Trim off the 5 nt on both sides and check for priming (how many nt to trim off Read 1 or Read 2)
- Need to check for priming: make it able to check priming on Read 1 or Read 2, allow mismatches, and handle input that is trimmed or untrimmed. Pull the primer from the data sheet, or pull the primer using coordinates and get the downstream sequence depending on length.
- trims short amplicons for reads that go into the adapter or illumina primers
- local align to the plasmid without the AAV seq at all
- local align to the AAV seq without the HDR arms
- after pulling out the reads that didn't align to AAV seq or plasmid backbone, analyze the breaks

- generate table of expected amplicons (use the hdr sample and just replace seq between breaks if thats possible)
- align against the reads that didn't map to AAV or plasmid backbone,
  look at indels and quantification

Test pipeline 2:
- Trim up to the cut site. Run end-to-end across the genome. Look for expected off target translocations
- also generally measure the frequency at which integrations happen

Questions:
- which side is the HDR arm on? which of the two sides should i be looking for extra hdr transcripts from
- Can I bring in some of the analysis tools from crispresso to understand the indel profiles better.



In [2]:
# Directory
#
# Root folder of the sequencing run analysed by this notebook.
# Set the LAM_DATA_DIR environment variable to point the notebook at another
# run without editing this cell; when it is unset the original location is used.
directory = os.environ.get('LAM_DATA_DIR', '/home/eric/Data/Spaced_Nicking/LAM3_IL7R_PRF1')
print(directory)

/home/eric/Data/Spaced_Nicking/LAM3_IL7R_PRF1


In [3]:
##########        Assign the file_genome_2bit location.     ############
#
#   The .2bit file is needed for pulling sequence from the reference genome by location.
#   (The assembly could instead be read per sample: amplicon_info['genome'].)
assembly = 'hg38'

# Single source of truth for the reference-genome folder: the .2bit path and the
# bowtie2 index location below are both derived from it so they cannot drift apart.
genome_dir = '/home/eric/Data/Ref_Genomes'
file_genome_2bit = os.path.join(genome_dir, assembly + '.2bit')
print(file_genome_2bit)

###############   BOWTIE2_INDEXES for genome alignments    ################
#
# check in bash: echo $BOWTIE2_INDEXES
#
# Set through os.environ (rather than the IPython-only %env magic) so the cell
# also runs as plain Python; child processes started later inherit the value.
os.environ['BOWTIE2_INDEXES'] = genome_dir
print('env: BOWTIE2_INDEXES=' + genome_dir)

/home/eric/Data/Ref_Genomes/hg38.2bit
env: BOWTIE2_INDEXES=/home/eric/Data/Ref_Genomes


In [10]:
########## Strip the leading spacer / adapter bases from both reads  #########
#
#    Misha adds 5 nt to both Read 1 and Read 2 to help spot generation on Illumina.
#    Read 1 starts with the gene-binding primer.
#    Read 2 starts with the adapter used to ligate on the universal reverse sequence.
#
# NOTE(review): R1trim is 3 although the note above mentions 5 nt - confirm.
# NOTE(review): this cell loops over samples 0-13 while the later cells use
#               15-32 and the recorded output shows sample 4.1.F - confirm the range.

# Read 2 loses the 5 nt spacer plus the whole adapter.
adapter_seq = 'GACTATAGGGCACGCGTGG'
adapt_len = len(adapter_seq)
read2trim = adapt_len + 5
print('read2trim is :', read2trim, ' nucleotides.')

# Trim every sample in turn.
for i in range(0, 14):
    amplicon_info = get_csv_data(directory, i)
    trimming_R1_R2(directory, amplicon_info, R1trim=3, R2trim=read2trim, pipeline=1)
    print('done with sample', *[amplicon_info[key] for key in ('name', 'index_I1', 'index_I2')])

read2trim is : 24  nucleotides.
done with sample 4.1.F N709 N501


### Discard Mispriming Reads

LAM uses an anchored primer and then gets rid of the background gDNA. It then uses a nested primer, so there should be a very clean product.

In [2]:
# Worked example of the Damerau-Levenshtein distance used by the priming filter:
# with similarity=False the call reports an edit distance between the two strings.
from fastDamerauLevenshtein import damerauLevenshtein

example_expected_seq = 'AAGTGCCCCCTGTCTCTGCAGCTCCATGGCAGCCCGT'
example_observed_seq = 'AAGTGCCCCCTGTCTCTGCAGCTCCATGGCTGCTAGA'
damerauLevenshtein(example_expected_seq, example_observed_seq, similarity=False)

4.0

In [7]:
############### GOOD PRIMING Filter ##########
#
#   Here we assume the NNNNN is no longer on Read 1 (the reads went through the
#   R1/R2 trimming step). The LAM gene-specific primer is on READ 1 in this case;
#   the check only understands priming on Read 1.
#   There can be indels in the frame of the read.

# The per-sample priming tables are collected and written to the 'results' folder.
results_folder = os.path.join(directory, 'results')
if not os.path.exists(results_folder):
    os.mkdir(results_folder)
results_file = os.path.join(results_folder, 'all_priming.xlsx')

## inputs for correct_priming2() function
mismatches = 3                      # Levenshtein distance is used, so an out-of-frame read already costs 2
trimmed_R1R2 = True                 # the input files have already been trimmed
removePrimerPlusDownstream = False  # remove the primer/downstream seq if it is good (good for guideseq)
exportMismatch = True               # export the file of mismatched sequences

# Primer sequences (EVERYTHING IS CAPITAL). They do not depend on the sample,
# so they are defined once instead of on every loop iteration.
#      Ch11:5226605:5226627
ThreePrimeEnd_seq =       'TTCCAGGGCTCCTAGACCACCCAGAGTTTCC'
ThrePrimeEnd_primeronly = 'TTCCAGGGCTCCTAGACCAC'
#      Ch11:5227054:5227084
FivePrimeEnd_seq =            'AAGTGCCCCCTGTCTCTGCAGCTCCATGGC'
FivePrimeEnd_seq_primeronly = 'AAGTGCCCCCTGTCTCTGCAGC'

# Direction value from the sample sheet -> (primer + downstream sequence, primer only)
priming_by_direction = {
    3: (ThreePrimeEnd_seq, ThrePrimeEnd_primeronly),
    5: (FivePrimeEnd_seq, FivePrimeEnd_seq_primeronly),
}

priming_tables = []
for i in range(15, 33):

    amplicon_info = get_csv_data(directory, i)

    direction = amplicon_info['Direction']
    if direction not in priming_by_direction:
        # Without this guard an unexpected value silently re-used the primer
        # selected for the previous sample.
        raise ValueError("sample {}: 'Direction' must be 3 or 5, got {!r}".format(i, direction))
    primer_seq_plus_downstream, primer_seq = priming_by_direction[direction]

    df_sample_results = correct_priming2(directory, amplicon_info, primer_seq, primer_seq_plus_downstream,
                                         mismatches, trimmed_R1R2, removePrimerPlusDownstream, exportMismatch)

    # keep the per-sample table; all tables are joined once after the loop
    # (assumes correct_priming2 returns a DataFrame - TODO confirm)
    priming_tables.append(df_sample_results)
    print('done with sample', i)

# DataFrame.append was removed in pandas 2.0; a single concat gives the same table.
results_df_all = pd.concat(priming_tables, ignore_index=True)

#export the final table
results_df_all.to_excel(results_file)
print(results_df_all)


done with sample 15
done with sample 16
done with sample 17
done with sample 18
done with sample 19
done with sample 20
done with sample 21
done with sample 22
done with sample 23
done with sample 24
done with sample 25
done with sample 26
done with sample 27
done with sample 28
done with sample 29
done with sample 30
done with sample 31
done with sample 32
        sample_name    i7    i5  total_reads  reads_with_good_priming  \
0      PRF1_AAV_Fwd  N709  N501        76276                    73893   
1      PRF1_AAV_Fwd  N709  N502        72150                    70646   
2      PRF1_AAV_Fwd  N709  N504        27071                    26404   
3   PRF1_Cas9WT_Fwd  N709  N505        86034                    83621   
4   PRF1_Cas9WT_Fwd  N709  N506        28086                    27620   
5   PRF1_Cas9WT_Fwd  N709  N507        71226                    69331   
6     PRF1_D10A_Fwd  N709  N508       175474                   170590   
7     PRF1_D10A_Fwd  N709  N510       159302            

In [8]:
def trim_short_fastq(dir_sample, amplicon_info, direction5primer, direction3primer, adapter):
    """Clip read-through sequence from short amplicons with cutadapt.

    Runs cutadapt on the 'R1fastq_CorrPrime' / 'R2fastq_CorrPrime' files of one
    sample and writes the 'R1trimmed' / 'R2trimmed' files plus a text report
    ('trimmed_report'). Read pairs shorter than 120 nt after trimming are
    dropped (cutadapt ``-m 120``).

    Parameters
    ----------
    dir_sample : str
        Run directory handed to ``create_filename``.
    amplicon_info : mapping
        Sample record; the keys 'Direction', 'index_I1' and 'index_I2' are read.
    direction5primer : str
        Gene-specific primer used when ``amplicon_info['Direction'] == 5``.
    direction3primer : str
        Gene-specific primer used when ``amplicon_info['Direction'] == 3``.
    adapter : str
        LAM adapter sequence.

    Raises
    ------
    ValueError
        If ``amplicon_info['Direction']`` is neither 5 nor 3.
    """
    direction = amplicon_info['Direction']

    # Read 1 starts with the gene-specific primer; pick the one for this side.
    if direction == 5:
        gene_specific_primer = direction5primer
    elif direction == 3:
        gene_specific_primer = direction3primer
    else:
        # Previously this fell through and failed later with an unbound-variable error.
        raise ValueError("amplicon_info['Direction'] must be 5 or 3, got {!r}".format(direction))
    rev_gene_specific_primer = reverse_complement(gene_specific_primer)

    # LAM adapter
    adapter_rev = reverse_complement(adapter)

    # Sample barcodes identify the input/output files.
    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']

    file_R1 = create_filename(dir_sample, N7, N5, 'R1fastq_CorrPrime')
    file_R2 = create_filename(dir_sample, N7, N5, 'R2fastq_CorrPrime')

    file_cutadapt_R1 = create_filename(dir_sample, N7, N5, 'R1trimmed')
    file_cutadapt_R2 = create_filename(dir_sample, N7, N5, 'R2trimmed')
    file_cutadapt_report = create_filename(dir_sample, N7, N5, 'trimmed_report')

    output_folder = os.path.dirname(file_cutadapt_R1)
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    # remove adapters with cutadapt
    # (the original uditas parameter was -e 0.33, which cut off too much random sequence)
    # -a is the 3' adapter for Read1
    # -A is the 3' adapter for Read2
    # -m is the minimum length
    cutadapt_command = ['cutadapt',
                        '-m', '120',
                        '-e', '0.1',
                        '-a', adapter_rev,
                        '-A', rev_gene_specific_primer,
                        '-o', file_cutadapt_R1, '-p', file_cutadapt_R2,
                        file_R1, file_R2]

    # cutadapt prints its report on stdout; the context manager closes the
    # report file even if launching cutadapt fails.
    with open(file_cutadapt_report, 'wb') as handle_cutadapt_report:
        subprocess.call(cutadapt_command, stdout=handle_cutadapt_report)
    

In [9]:
### TRIMMING ####
# The ends of short reads need trimming: amplicons that were too short carry
# sequence from the other side on them.

direction5primer = 'AAGTGCCCCCTGTCTCTGCAGC'   # gene-specific primer used for Direction 5
direction3primer = 'TTCCAGGGCTCCTAGACCAC'     # gene-specific primer used for Direction 3
adapter = 'GACTATAGGGCACGCGTGG'               # LAM adapter

for i in range(15, 33):
    amplicon_info = get_csv_data(directory, i)
    trim_short_fastq(dir_sample=directory,
                     amplicon_info=amplicon_info,
                     direction5primer=direction5primer,
                     direction3primer=direction3primer,
                     adapter=adapter)
    print('done with sample', i)


done with sample 15
done with sample 16
done with sample 17
done with sample 18
done with sample 19
done with sample 20
done with sample 21
done with sample 22
done with sample 23
done with sample 24
done with sample 25
done with sample 26
done with sample 27
done with sample 28
done with sample 29
done with sample 30
done with sample 31
done with sample 32




## Need to make a new bowtie2 index file that includes the targeting vector for alignment against the whole genome, so it is all in one index together. 

### Other option would be to align it to the amplicon, extract unaligned files and then align to the genome but seems cleaner this way. 
#### Ideally every sequence is unique between the targeting vector and genome.


1. Build your fastas of interest and label .fa files.
    1. You need a fasta of hg38 (or your reference genome). You can pull this from downloaded bowtie-indexed samples and then use the following command to turn the index into a fasta file: bowtie2-inspect hg38 > hg38.fa   
    2. Put all the fasta files in the same folder. Should also use the transfected plasmid
2. index the files with bowtie
    1. use the command bowtie2-build -f pE049,pe038_mc.fa,hg38.fa -p hg38_plus_targetvectorandplasmid
    2. this has the -p to make it take less ram in my case.
    3. In this case it adds the hg38 and the minicircle targeting file together
    4. I have a Intel® Core™ i7-5500U CPU @ 2.40GHz × 4 with 15.1 GiB ram and it required about 13.8 gigs of ram and 2 hours to do a hg38+small fasta index
    5. Be sure to pay attention to the name of the new index ("hg38_plus_targetvectorandplasmid" in the example above). Add it to the sample_info.csv sheet, under the column 'genome_plus_targeting'.
    6. You can check that it indexed correctly: bowtie2-inspect -s hg38_plus_targetvectorandplasmid
    

    



### Making the reference alignment sequences

- plasmid sequence in .csv:  entire thing without HDR sequence (but with the ITRS) to map backbone integration
- AAV: reference has entire AAV including Homology arms
- HDR seq: The HDR reference sequence will be directly copied out of the .csv file and not altered.

### Logic for following AAV integration
1. Map locally to the plasmid without homology. This checks for plasmid integration or ITR integration.
2. Extract out mappings. Things that don't map will be aligned to amplicons. Things that do map will be aligned to the AAV file.
3. Align files that mapped to the plasmid to the complete AAV vector (they should contain ITR seq)
4. Align files that did not map to the plasmid to the amplicons


In [72]:
# Single-sample run of the reference builders, for testing only (ignore this).

amplicon_info = get_csv_data(directory, 0)
print('N7 :', amplicon_info['index_I1'],
      "   N5 : ", amplicon_info['index_I2'])
print("sample name: ", amplicon_info['name'],
      "   sample name: ", amplicon_info['description'])
print('rection type', get_reaction_type(amplicon_info))

# Build the plasmid and AAV references first, then the expected amplicons.
for build_reference in (create_plasmid_reference, create_AAV_reference):
    build_reference(directory, amplicon_info)
create_amplicon(directory, amplicon_info, file_genome_2bit)

N7 : N701    N5 :  N501
sample name:  1.1.F.1    sample name:  Ctrl (#1) 5 end LAM-HTGTS
rection type double_cut_same_chromosome_and_HDR


In [11]:
######## GENERATING REFERENCE SEQUENCES ###########
#
# Checklist for the sample sheet before running:
#   - the PLASMID sequence must NOT contain the HDR portion
#   - the AAV sequence must be the entire AAV section
#   - the HDR sequence should be the entire reference knock-in sequence

for i in range(15, 33):
    amplicon_info = get_csv_data(directory, i)
    print('N7 :', amplicon_info['index_I1'],
          "   N5 : ", amplicon_info['index_I2'])
    print("sample name: ", amplicon_info['name'],
          "   sample name: ", amplicon_info['description'])

    # NOTE(review): the return value is discarded here - confirm the call is still needed.
    get_reaction_type(amplicon_info)

    # Build the plasmid and AAV references first, then the expected amplicons.
    for build_reference in (create_plasmid_reference, create_AAV_reference):
        build_reference(directory, amplicon_info)
    create_amplicon(directory, amplicon_info, file_genome_2bit)



N7 : N709    N5 :  N501
sample name:  4.1.F    sample name:  PRF1_AAV_Fwd
N7 : N709    N5 :  N502
sample name:  4.2.F    sample name:  PRF1_AAV_Fwd
N7 : N709    N5 :  N504
sample name:  4.3.F    sample name:  PRF1_AAV_Fwd
N7 : N709    N5 :  N505
sample name:  5.1.F    sample name:  PRF1_Cas9WT_Fwd
N7 : N709    N5 :  N506
sample name:  5.2.F    sample name:  PRF1_Cas9WT_Fwd
N7 : N709    N5 :  N507
sample name:  5.3.F    sample name:  PRF1_Cas9WT_Fwd
N7 : N709    N5 :  N508
sample name:  6.1.F    sample name:  PRF1_D10A_Fwd
N7 : N709    N5 :  N510
sample name:  6.2.F    sample name:  PRF1_D10A_Fwd
N7 : N710    N5 :  N501
sample name:  6.3.F    sample name:  PRF1_D10A_Fwd
N7 : N711    N5 :  N501
sample name:  4.1.R    sample name:  PRF1_AAV_Rev
N7 : N711    N5 :  N502
sample name:  4.2.R    sample name:  PRF1_AAV_Rev
N7 : N711    N5 :  N504
sample name:  4.3.R    sample name:  PRF1_AAV_Rev
N7 : N711    N5 :  N505
sample name:  5.1.R    sample name:  PRF1_Cas9WT_Rev
N7 : N711    N5 :  N506

# Aligning reads to plasmid/AAV (local alignment)

In [12]:
######## ALIGNING LOCAL TO PLASMID FILES ###########
#
# Checklist for the references before running:
#   - the PLASMID must NOT have the AAV portion in it
#   - the AAV must NOT have the HDR arms in it
# NOTE(review): the reference-generation notes elsewhere in this notebook word
#               these requirements differently - confirm which applies.

local_alignment_threads = 12

for i in range(15, 33):
    amplicon_info = get_csv_data(directory, i)
    print('N7 :', amplicon_info['index_I1'],
          "   N5 : ", amplicon_info['index_I2'])
    print("sample name: ", amplicon_info['name'],
          "   sample name: ", amplicon_info['description'])

    # plasmid alignment -> pull out the reads -> AAV alignment
    align_plasmid_local(directory, amplicon_info, ncpu=local_alignment_threads)
    extract_reads_plasmid(directory, amplicon_info)
    align_AAV_local(directory, amplicon_info, ncpu=local_alignment_threads)


N7 : N709    N5 :  N501
sample name:  4.1.F    sample name:  PRF1_AAV_Fwd
made filenames
N7 : N709    N5 :  N502
sample name:  4.2.F    sample name:  PRF1_AAV_Fwd
made filenames
N7 : N709    N5 :  N504
sample name:  4.3.F    sample name:  PRF1_AAV_Fwd
made filenames
N7 : N709    N5 :  N505
sample name:  5.1.F    sample name:  PRF1_Cas9WT_Fwd
made filenames
N7 : N709    N5 :  N506
sample name:  5.2.F    sample name:  PRF1_Cas9WT_Fwd
made filenames
N7 : N709    N5 :  N507
sample name:  5.3.F    sample name:  PRF1_Cas9WT_Fwd
made filenames
N7 : N709    N5 :  N508
sample name:  6.1.F    sample name:  PRF1_D10A_Fwd
made filenames
N7 : N709    N5 :  N510
sample name:  6.2.F    sample name:  PRF1_D10A_Fwd
made filenames
N7 : N710    N5 :  N501
sample name:  6.3.F    sample name:  PRF1_D10A_Fwd
made filenames
N7 : N711    N5 :  N501
sample name:  4.1.R    sample name:  PRF1_AAV_Rev
made filenames
N7 : N711    N5 :  N502
sample name:  4.2.R    sample name:  PRF1_AAV_Rev
made filenames
N7 : N711

In [16]:
######## QUANTIFYING PLASMID/AAV INTEGRATION ###########
#
# Runs analyze_local_alignments on every sample and writes the combined
# summary table to results/plasmid_AAV_integration.xlsx.

# SUMMARY FILE
results_folder = os.path.join(directory, 'results')
if not os.path.exists(results_folder):
    os.mkdir(results_folder)
results_file = os.path.join(results_folder, 'plasmid_AAV_integration.xlsx')

# FUNCTION INPUTS
analysis = 'plasmid'
min_MAPQ = 1

integration_tables = []
for i in range(15, 33):

    amplicon_info = get_csv_data(directory, i)

    df_sample_results = analyze_local_alignments(directory, amplicon_info, min_MAPQ, analysis)

    # keep the per-sample table; all tables are joined once after the loop
    # (assumes analyze_local_alignments returns a DataFrame - TODO confirm)
    integration_tables.append(df_sample_results)
    print('done with sample', amplicon_info['description'], amplicon_info['index_I1'], amplicon_info['index_I2'])

# DataFrame.append was removed in pandas 2.0; a single concat gives the same table.
results_df_all = pd.concat(integration_tables, ignore_index=True)

#export the final table
results_df_all.to_excel(results_file)

print(results_df_all)

done with sample PRF1_AAV_Fwd N709 N501
done with sample PRF1_AAV_Fwd N709 N502
done with sample PRF1_AAV_Fwd N709 N504
done with sample PRF1_Cas9WT_Fwd N709 N505
done with sample PRF1_Cas9WT_Fwd N709 N506
done with sample PRF1_Cas9WT_Fwd N709 N507
done with sample PRF1_D10A_Fwd N709 N508
done with sample PRF1_D10A_Fwd N709 N510
done with sample PRF1_D10A_Fwd N710 N501
done with sample PRF1_AAV_Rev N711 N501
done with sample PRF1_AAV_Rev N711 N502
done with sample PRF1_AAV_Rev N711 N504
done with sample PRF1_Cas9WT_Rev N711 N505
done with sample PRF1_Cas9WT_Rev N711 N506
done with sample PRF1_Cas9WT_Rev N711 N507
done with sample PRF1_D10A_Rev N711 N508
done with sample PRF1_D10A_Rev N711 N510
done with sample PRF1_D10A_Rev N710 N508
   sample_name    i7    i5  number_plasmid_alignments
0        4.1.F  N709  N501                          0
1        4.2.F  N709  N502                          0
2        4.3.F  N709  N504                          0
3        5.1.F  N709  N505              

In [24]:
def align_ampliconLAM(dir_sample, amplicon_info, check_plasmid_insertions, paired, ncpu=4):
    """Align one sample's reads to its 'amplicons' bowtie2 index.

    bowtie2 is run without ``--local``, i.e. in its default end-to-end mode.
    The SAM output is kept, converted to BAM, sorted and indexed with samtools;
    the unsorted BAM is deleted afterwards. bowtie2's stderr goes to the
    'sam_report_amplicons' file.

    Parameters
    ----------
    dir_sample : str
        Run directory handed to ``create_filename``.
    amplicon_info : mapping
        Slice of sample_info.csv for the sample being processed; the keys
        'index_I1', 'index_I2' and 'plasmid_sequence' are read.
    check_plasmid_insertions : bool or int
        When equal to 1/True and the sample has a plasmid sequence, the reads
        are taken from the 'unmapped_plasmid_R1fastqgz'/'..._R2fastqgz' files;
        otherwise from 'R1trimmed'/'R2trimmed'.
    paired : bool
        True aligns Read 1 and Read 2 as pairs; anything else aligns Read 1 only.
    ncpu : int, optional
        Number of bowtie2 threads (``-p``).

    Notes
    -----
    Exit codes of the external tools are not checked.
    """
    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']

    # `unicode` only exists on Python 2. Referring to it directly raised a
    # NameError on Python 3 whenever the plasmid cell was not a plain str
    # (e.g. an empty cell), so build the tuple of text types defensively.
    try:
        text_types = (str, unicode)  # noqa: F821 - Python 2 only
    except NameError:
        text_types = (str,)
    has_plasmid = isinstance(amplicon_info['plasmid_sequence'], text_types)

    if check_plasmid_insertions == 1 and has_plasmid:
        file_R1 = create_filename(dir_sample, N7, N5, 'unmapped_plasmid_R1fastqgz')
        file_R2 = create_filename(dir_sample, N7, N5, 'unmapped_plasmid_R2fastqgz')
    else:
        file_R1 = create_filename(dir_sample, N7, N5, 'R1trimmed')
        file_R2 = create_filename(dir_sample, N7, N5, 'R2trimmed')

    file_sam_amplicons = create_filename(dir_sample, N7, N5, 'sam_amplicons')
    file_sam_report_amplicons = create_filename(dir_sample, N7, N5, 'sam_report_amplicons')
    file_bam_amplicons = create_filename(dir_sample, N7, N5, 'bam_amplicons')
    file_sorted_bam_amplicons = create_filename(dir_sample, N7, N5, 'sorted_bam_amplicons')

    # Make sure the folders of the input and output files exist.
    for path in (file_R1, file_sam_amplicons, file_bam_amplicons):
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.mkdir(folder)

    # Options shared by the paired and single-read runs.
    bowtie2_command = ['bowtie2', '-p', str(ncpu), '--very-sensitive',
                       '-X', '5000', '-k', '3', '-x', 'amplicons']
    if paired == True:  # noqa: E712 - kept as in the original: only True/1 selects paired mode
        bowtie2_command += ['-1', file_R1, '-2', file_R2]
    else:
        bowtie2_command += ['-U', file_R1]
    bowtie2_command += ['-S', file_sam_amplicons]

    # bowtie2 is run from inside the amplicons folder because the index is
    # referenced by its bare name ('amplicons').
    initial_dir = os.getcwd()
    folder_amplicons = create_filename(dir_sample, N7, N5, 'amplicons')
    os.chdir(folder_amplicons)
    try:
        with open(file_sam_report_amplicons, 'wb') as handle_sam_report_amplicons:
            subprocess.call(bowtie2_command, stderr=handle_sam_report_amplicons)

        # convert sam to bam; the handle is closed before the file is sorted and removed
        sam_to_bam_amplicons_command = ['samtools', 'view', '-Sb', file_sam_amplicons]
        with open(file_bam_amplicons, 'wb') as handle_file_bam_amplicons:
            subprocess.call(sam_to_bam_amplicons_command, stdout=handle_file_bam_amplicons)

        # sort bam files
        sort_bam_amplicons_command = ['samtools', 'sort', file_bam_amplicons, '-o', file_sorted_bam_amplicons]
        subprocess.call(sort_bam_amplicons_command)

        # Clean up: only the unsorted BAM is removed, the SAM file is kept.
        os.remove(file_bam_amplicons)

        # Create bam index files
        create_bam_amplicons_index_command = ['samtools', 'index', file_sorted_bam_amplicons]
        subprocess.call(create_bam_amplicons_index_command)
    finally:
        # Always return to the caller's working directory, even after an error.
        os.chdir(initial_dir)
 
    

## Align filtered reads to amplicons (end-to-end)

In [8]:
######## Aligning reads to amplicons ###########
#
# Goes through every sample and aligns its reads to the amplicons.
#   check_plasmid_insertions : take the reads from the fastq files that had the
#                              plasmid-integration reads removed
#   paired                   : align with paired ends, or only Read 1
#                              (the gene-specific primer read)

for i in range(15, 33):
    amplicon_info = get_csv_data(directory, i)
    align_ampliconLAM(dir_sample=directory,
                      amplicon_info=amplicon_info,
                      check_plasmid_insertions=True,
                      paired=True,
                      ncpu=14)
    
    
    

In [21]:
################################################################################
# Extract, for every sample, the reads that did not map to the amplicons.
# The last argument selects 'single' or 'paired' reads.
################################################################################
#  THIS IS USED FOR PIPELINE 2, WHERE THESE READS ARE ALIGNED TO THE REST OF THE GENOME
#
# NOTE(review): extract_unmapped_reads_amplicons_LAM is defined in a cell further
#               down this notebook - that cell has to be run before this one.

for i in range(15, 33):
    amplicon_info = get_csv_data(directory, i)
    extract_unmapped_reads_amplicons_LAM(dir_sample=directory,
                                         amplicon_info=amplicon_info,
                                         reads='paired')

# Analyze the aligned amplicon reads

In [23]:
################### ANALYZING THE ALIGNMENTS ###############
#
#          INDELS AND QUANTIFICATION
#
# Runs analyze_alignments_LAM on every sample and writes the combined table
# to results/all_amplicon_counts.xlsx.

# SUMMARY FILE
results_folder = os.path.join(directory, 'results')
if not os.path.exists(results_folder):
    os.mkdir(results_folder)
results_file = os.path.join(results_folder, 'all_amplicon_counts.xlsx')

# function inputs
window_size = 15
amplicon_window_around_cut = 1000
min_MAPQ = 5
min_AS = -180

amplicon_tables = []
for i in range(15, 33):

    amplicon_info = get_csv_data(directory, i)

    result_amplicon_df = analyze_alignments_LAM(directory, amplicon_info, window_size,
                                                amplicon_window_around_cut, min_MAPQ, min_AS)

    # keep the per-sample table; all tables are joined once after the loop
    # (assumes analyze_alignments_LAM returns a DataFrame - TODO confirm)
    amplicon_tables.append(result_amplicon_df)
    print('done with sample', i, amplicon_info['description'], amplicon_info['index_I1'], amplicon_info['index_I2'])

# DataFrame.append was removed in pandas 2.0; a single concat gives the same table.
results_df_all = pd.concat(amplicon_tables, ignore_index=True)

results_df_all.to_excel(results_file)

print(results_df_all)


Allreads is :  81118
HDRcount is :  107
done with sample 15 PRF1_AAV_Fwd N709 N501
Allreads is :  88978
HDRcount is :  21
done with sample 16 PRF1_AAV_Fwd N709 N502
Allreads is :  24351
HDRcount is :  2
done with sample 17 PRF1_AAV_Fwd N709 N504
Allreads is :  61089
HDRcount is :  18200
done with sample 18 PRF1_Cas9WT_Fwd N709 N505
Allreads is :  25100
HDRcount is :  7843
done with sample 19 PRF1_Cas9WT_Fwd N709 N506
Allreads is :  61315
HDRcount is :  27293
done with sample 20 PRF1_Cas9WT_Fwd N709 N507
Allreads is :  192399
HDRcount is :  111275
done with sample 21 PRF1_D10A_Fwd N709 N508
Allreads is :  175210
HDRcount is :  102913
done with sample 22 PRF1_D10A_Fwd N709 N510
Allreads is :  98606
HDRcount is :  50975
done with sample 23 PRF1_D10A_Fwd N710 N501
Allreads is :  209460
HDRcount is :  127
done with sample 24 PRF1_AAV_Rev N711 N501
Allreads is :  77189
HDRcount is :  58
done with sample 25 PRF1_AAV_Rev N711 N502
Allreads is :  131
HDRcount is :  0
done with sample 26 PRF1_AA

# PIPELINE 2. TRIM TO THE CUTSITE AND THEN ALIGN AGAINST THE GENOME

Do end-to-end alignments of all the reads map to the region of interest? I want to see what that region looks like blown up    

i will only use primer 1 to keep things simpler for now

## TRIMMING OFF TO CUTSITE

In [19]:
def extract_unmapped_reads_amplicons_LAM(dir_sample, amplicon_info, reads):
    """Write the reads that did not map to the amplicons as gzipped fastq.

    Unmapped records (SAM flag 0x4) are pulled from the sample's sorted amplicon
    BAM with samtools, sorted by read name, converted to fastq with
    ``bedtools bamtofastq`` and gzipped next to the plain fastq files, which are
    then deleted. bedtools' stderr goes to the 'unmapped_amplicons_report' file.

    Parameters
    ----------
    dir_sample : str
        Run directory handed to ``create_filename``.
    amplicon_info : mapping
        Sample record; the keys 'index_I1' and 'index_I2' are read.
    reads : str
        'paired' writes Read 1 and Read 2 files, 'single' writes Read 1 only.

    Raises
    ------
    ValueError
        If ``reads`` is neither 'paired' nor 'single' (previously such a value
        was silently ignored and no fastq was produced).

    Notes
    -----
    Exit codes of the external tools are not checked.
    """
    if reads not in ('paired', 'single'):
        raise ValueError("reads must be 'paired' or 'single', got {!r}".format(reads))

    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']

    file_sorted_bam_amplicons = create_filename(dir_sample, N7, N5, 'sorted_bam_amplicons')
    file_unmapped_bam_amplicons = create_filename(dir_sample, N7, N5, 'unmapped_bam_amplicons')
    file_qsorted_unmapped_bam_amplicons = create_filename(dir_sample, N7, N5, 'qsorted_unmapped_bam_amplicons')

    file_R1_unmapped = create_filename(dir_sample, N7, N5, 'unmapped_amplicons_R1fastq')
    file_R2_unmapped = create_filename(dir_sample, N7, N5, 'unmapped_amplicons_R2fastq')
    file_unmapped_report = create_filename(dir_sample, N7, N5, 'unmapped_amplicons_report')

    output_folder = os.path.dirname(file_R1_unmapped)
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    # keep only the records flagged as unmapped (0x4)
    extract_unmapped_bam_command = ['samtools', 'view', '-b', '-f', '0x4', file_sorted_bam_amplicons, '-o',
                                    file_unmapped_bam_amplicons]
    subprocess.call(extract_unmapped_bam_command)

    # sort by read name (-n) before converting to fastq
    qsort_unmapped_bam_command = ['samtools', 'sort', '-n', file_unmapped_bam_amplicons, '-o',
                                  file_qsorted_unmapped_bam_amplicons]
    subprocess.call(qsort_unmapped_bam_command)

    # Read 1 is always written; Read 2 only for paired data.
    bamtofastq_command = ['bedtools', 'bamtofastq', '-i', file_qsorted_unmapped_bam_amplicons,
                          '-fq', file_R1_unmapped]
    fastq_files = [file_R1_unmapped]
    if reads == 'paired':
        bamtofastq_command += ['-fq2', file_R2_unmapped]
        fastq_files.append(file_R2_unmapped)

    with open(file_unmapped_report, 'wb') as handle_unmapped_report:
        subprocess.call(bamtofastq_command, stderr=handle_unmapped_report)

    # Compress each fastq and drop the uncompressed copy. The source is opened in
    # binary mode: copying text lines into the binary gzip stream fails on Python 3.
    for fo in fastq_files:
        with open(fo, 'rb') as f_in, gzip.open(fo + '.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(fo)
        
        