An input for an acceptor site should be a 140-mer string with the AG at positions 69 and 70 (1-based).

An input for a donor site should be a 140-mer string with the GT at positions 71 and 72 (1-based).

# Imports

In [None]:
import pandas as pd
import numpy as np
import pybedtools
from Bio import SeqIO
import hgvs
from hgvs.easy import *
import sys
sys.path.insert(1, '../')
from functions import reverse_sequence

# Define Variables

In [12]:
# Gene symbol of the data set (this value is not read by the cells visible below)
gene = 'ABCA4'
# Label of the variant set (the value assigned here is not read by the cells visible below)
variant = 'DI'
# Length in nucleotides of the sequence window produced for every variant
length = 140

# Excel workbook containing the variant table; the sheet that is read is named by `dataset`
excel_file = '../variant_scores.xlsx'
# Genome build label; it is concatenated into the HGVS string that is parsed in step 1
genome = 'GRCh37'
# Chromosome number; used for the 'chr<N>' column of the BED file and the HGVS prefix
chromosome = 1
# When True, the sequences are passed through reverse_sequence() in step 4
reverse = True
# reference fasta can be downloaded from http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.1.fa.gz
reference_fasta = '../Homo_sapiens.GRCh37.dna.chromosome.1.fa'
# Sheet name inside the workbook and file-name prefix of every generated .bed / .fa.out file
dataset = 'ABCA4_DI'

# Read in the data 

In [13]:
# Read the four needed columns (A, B, D and E) of the sheet in a single pass,
# so the workbook is opened and parsed only once instead of once per column
df = pd.read_excel(excel_file, sheet_name=dataset, index_col=None, usecols='A,B,D,E')

# Genomic variant descriptions (column B); these are parsed as HGVS in step 1
variant_list = df['genomic variant'].tolist()

# cDNA variant descriptions (column A); used as record names in the output fasta files
cDNA = df['cDNA variant'].tolist()

# Whether each variant affects a donor or an acceptor site (column D)
ss = df['affects'].tolist()

# Distance of each variant to the splice site (column E)
positions = df['position ss'].tolist()

# 1) Store the variant information

In [14]:
# Every entry of `info` is a list with a fixed layout:
#   [start position, end position, ref, alt, affected site, distance to splice site]
# The later cells access these fields by index, so the order must not change.
info = []

# The three input lists are walked in parallel; make sure none of them is shorter
assert len(variant_list) == len(ss) == len(positions), 'input columns differ in length'

hp = hgvs.parser.Parser()
# The loop name is `hgvs_string` (not `variant`) so that the `variant` setting from the
# configuration cell is not overwritten when this cell runs
for hgvs_string, site, distance in zip(variant_list, ss, positions):
    v = hp.parse_hgvs_variant('Chr' + str(chromosome) + genome + ':' + hgvs_string)

    # Store the variant information so that it can be accessed separately
    info.append([
        v.posedit.pos.start.base,
        v.posedit.pos.end.base,
        v.posedit.edit.ref,
        v.posedit.edit.alt,
        # whether it affects a donor or an acceptor site
        site,
        # the distance to the splice site
        distance,
    ])
    

# 2) Create the BED file

In [15]:
# Half of the window length; all window boundaries below are expressed relative to it
half = length // 2

# Every line of the BED file is the sequence range that gets extracted into the fasta file later on
with open(dataset + '.bed', 'w') as bed_file:
    for idx, entry in enumerate(info):
        var_start, var_end, shift = entry[0], entry[1], entry[5]
        site = ss[idx]

        # Acceptor input: 140-mer string with the AG at positions 69 and 70
        # Donor input (every other value): 140-mer string with the GT at positions 71 and 72
        anchor = var_start - shift + 1 if site == 'acceptor' else var_start + shift

        # Window used when the variant is not a deletion
        begin = anchor - (half + 1)
        end = anchor + (half - 1)

        # A deletion needs a longer stretch so that 140 nt remain after the bases are removed
        if 'del' in cDNA[idx]:
            extra = var_end - var_start
            if site == 'donor':
                # donor site: the additional bases go to the end of the range
                begin = anchor - (half + 2)
                end = anchor + (half - 1) + extra
            else:
                # acceptor site: the additional bases go to the beginning of the range
                begin = anchor - (half + 1) - extra
                end = anchor + half

        # chrom, start, end, two empty fields, strand
        bed_file.write(f'chr{chromosome}\t{begin}\t{end}\t\t\t-\n')

# 3) Get the sequence for each variant and store it in a fasta file

In [17]:
# Extract the sequence of every BED interval from the reference fasta and write it to
# '<dataset>.fa.out' (pybedtools needs the BEDTools programs on the path for this step)
bed_path = dataset + '.bed'
fasta_out_path = dataset + '.fa.out'
a = pybedtools.BedTool(bed_path).sequence(fi=reference_fasta, fo=fasta_out_path)

NotImplementedError: "fastaFromBed" does not appear to be installed or on the path, so this method is disabled.  Please install a more recent version of BEDTools and re-import to use this method.

# 4) Change the mutated base in the fasta sequence

In [10]:
# Load all extracted sequences up front; the `with` block makes sure the input file is closed
with open(dataset + '.fa.out') as fasta_handle:
    fasta_sequences = list(SeqIO.parse(fasta_handle, 'fasta'))

# Records are matched to the variant table purely by position, so the counts have to agree
assert len(fasta_sequences) == len(info), 'number of fasta records differs from number of variants'

# open the new fasta files to save the mutated sequences (separate file for donor and acceptor variants)
with open(dataset + '_donor.fa.out', 'w') as donor_out, open(dataset + '_acceptor.fa.out', 'w') as acceptor_out:
    for i, fasta in enumerate(fasta_sequences):
        sequence = str(fasta.seq)
        start, end, ref, alt = info[i][:4]
        distance = info[i][5]
        # as in the rest of the notebook, everything that is not an acceptor is handled as a donor
        is_acceptor = ss[i] == 'acceptor'

        # wild-type sequence: reverse it if necessary and trim it to the window length
        if reverse:
            reversed_wt = reverse_sequence(sequence)
            if is_acceptor:
                wt_sequence = reversed_wt[:length]
            else:
                # donor case: this branch used to test for 'acceptor' a second time, which
                # overrode the slice above and never assigned a wild type for donor variants
                wt_sequence = reversed_wt[-length:]
        else:
            wt_sequence = sequence

        # get the location of the variant inside the extracted sequence
        if is_acceptor:
            loc = length // 2 + distance - 1
        else:
            loc = length // 2 - distance

        # change the base(s) at the variant position
        if start == end and ref != '':
            # variants where one base is changed
            assert sequence[loc] == ref, cDNA[i]
            bases = list(sequence)
            bases[loc] = alt
            s = ''.join(bases)
            # test if the base at the variant position in the sequence is now the mutated base
            assert s[loc] == alt, cDNA[i]
        else:
            # deletions of one or more bases (for a single base, size is 1)
            size = end - start + 1
            s = sequence[:loc + 1] + sequence[(loc + size + 1):]

        # if necessary reverse the mutated sequence
        if reverse:
            s = reverse_sequence(s)

        # Check if the bases at the acceptor/donor position are correct
        if is_acceptor:
            assert s[68:70] == 'AG', cDNA[i]
        else:
            assert s[70:72] in ['GT', 'GC'], cDNA[i]

        # write the wild-type and the mutated sequence to the matching file
        out = acceptor_out if is_acceptor else donor_out
        out.write('>' + cDNA[i] + '\n' + wt_sequence + '\n')
        out.write('>' + cDNA[i] + '_var\n' + s + '\n')

FileNotFoundError: [Errno 2] No such file or directory: 'ABCA4_DI.fa.out'