The required input is a 170-mer base sequence in which the mutated nucleotide is replaced by "M". (The sequence should comprise both splice sites, as in the original training data.)

# Import

In [None]:
import pandas as pd
import numpy as np
import pybedtools
from Bio import SeqIO
import hgvs
from hgvs.easy import *

# Functions

In [None]:
def reverse_sequence(s):
    """Return the reverse complement of a DNA sequence.

    The upper-case bases A/T and G/C are exchanged; every other character
    (for example the ``M`` variant placeholder, ``N`` or lower-case letters)
    is copied unchanged. The complemented sequence is then reversed.

    Args:
        s: the sequence to convert, as a string.

    Returns:
        The reverse complement of ``s`` as a string.
    """
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    # Build the complement in one pass with join instead of repeated string
    # concatenation, then reverse it.
    return ''.join(complement.get(base, base) for base in s)[::-1]

# Variables

In [None]:
# Gene and variant type to process. Supported combinations:
#   gene 'ABCA4'  with variant 'NCSS' or 'DI'
#   gene 'MYBPC3' (always uses the 'NCSS' dataset and sheet)
gene = 'MYBPC3'
variant = 'NCSS'

# length of the generated fasta sequence
length = 170
# genome version
genome = 'GRCh37'
excel_file = '../variants_scores.xlsx'

if gene == 'ABCA4':
    chromosome = 1
    # the extracted sequences are converted to their reverse complement
    reverse = True
    # reference fasta can be downloaded from http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.1.fa.gz
    reference_fasta = '../Homo_sapiens.GRCh37.dna.chromosome.1.fa'
    if variant == 'NCSS':
        dataset = 'ABCA4_NCSS'
        sheet_name = 'NCSS'
    elif variant == 'DI':
        dataset = 'ABCA4_DI'
        sheet_name = 'DI'
    else:
        # fail here instead of with a NameError on `dataset` further down
        raise ValueError(
            f"Unsupported variant type {variant!r} for gene 'ABCA4' "
            "(expected 'NCSS' or 'DI')"
        )

elif gene == 'MYBPC3':
    chromosome = 11
    # the extracted sequences are converted to their reverse complement
    reverse = True
    # reference fasta can be downloaded from https://www.ncbi.nlm.nih.gov/nuccore/NC_000011.9?report=fasta
    reference_fasta = '../chr11.fa'
    dataset = 'MYBPC3_NCSS'
    sheet_name = 'NCSS'

else:
    # fail here instead of with a NameError on `chromosome` further down
    raise ValueError(
        f"Unsupported gene {gene!r} (expected 'ABCA4' or 'MYBPC3')"
    )

# Get the variant information

In [None]:
# Read all required columns of the excel sheet in a single pass (the workbook
# is parsed once instead of once per column):
#   column A      -> 'cDNA variant'
#   column B      -> 'genomic variant'
#   column D or E -> 'affects' (donor or acceptor site); column D is used for
#                    the 'NCSS' variant type, column E otherwise
affects_column = 'D' if variant == 'NCSS' else 'E'
df = pd.read_excel(
    excel_file,
    sheet_name=sheet_name,
    index_col=None,
    usecols='A,B,' + affects_column,
)

# Store the genomic variants in a list
variant_list = df['genomic variant'].tolist()

# Store the cDNA variant information
cDNA = df['cDNA variant'].tolist()

# Also store if it is a donor or acceptor site
ss = df['affects'].tolist()

# Store the variant information for later access

In [None]:
# Collect, for every variant that has an alternative allele, the list
# [start position, end position, reference allele, alternative allele, affected site]
info = []

hp = hgvs.parser.Parser()
# The loop variable is deliberately NOT called `variant`: that name holds the
# variant-type configuration ('NCSS' / 'DI') and must not be overwritten,
# otherwise re-running the cell that selects the 'affects' column would take
# the wrong branch.
for genomic_variant, site in zip(variant_list, ss):
    v = hp.parse_hgvs_variant('Chr' + str(chromosome) + genome + ':' + genomic_variant)

    # Store the variant information so that it can be accessed separately
    var_info = [
        v.posedit.pos.start.base,
        v.posedit.pos.end.base,
        v.posedit.edit.ref,
        v.posedit.edit.alt,
        # whether the variant affects a donor or an acceptor site
        site,
    ]

    # deletions (no alternative allele) are skipped because the input format
    # of the tool does not support them
    if v.posedit.edit.alt is not None:
        info.append(var_info)

# Store the genomic locations in a BED file to retrieve the sequence later on

In [None]:
# Write one BED line per stored variant. Each line describes a window of
# `length` bases; these windows define the sequences that are written to the
# fasta file later on.
with open(dataset + '.bed', 'w') as file:
    for var_info in info:
        var_loc = var_info[0]
        site = var_info[4]

        # We want the acceptor to be located in the beginning of the sequence
        # and the donor in the end of the sequence, so the window is anchored
        # 20 bases before an acceptor variant or 20 bases after a donor variant.
        if site == 'acceptor':
            start = var_loc - 20
            end = start + length
        else:
            end = var_loc + 20
            start = end - length

        # columns: chrom, start, end, (empty name), (empty score), strand
        file.write(f'chr{chromosome}\t{start}\t{end}\t\t\t-\n')


# Get the fasta sequence for each variant

In [None]:
# Load the intervals from '<dataset>.bed' and extract the reference sequence of
# each interval from `reference_fasta`; the result is written in fasta format
# to '<dataset>.fa.out'.
# NOTE(review): no strand option is passed to `sequence`, so the sequences are
# presumably returned in forward-strand orientation regardless of the strand
# column of the BED file — confirm against the pybedtools documentation.
a = pybedtools.BedTool((dataset + '.bed'))
a = a.sequence(fi = reference_fasta, fo = (dataset + '.fa.out'))

# Change the mutated base to M and save the fasta sequences in a file

In [None]:
# Read the extracted sequences, replace the base at the variant position by
# 'M' and write the result to '<dataset>_var.fa.out'. Both files are opened in
# a `with` statement so that the handles are closed even if a check fails.
with open(dataset + '.fa.out') as fasta_file, open(dataset + '_var.fa.out', 'w') as file:
    fasta_sequences = SeqIO.parse(fasta_file, 'fasta')

    # the i-th fasta record belongs to the i-th entry of `info`
    for i, fasta in enumerate(fasta_sequences):
        sequence = str(fasta.seq)
        var_loc, ref, alt, site = info[i][0], info[i][2], info[i][3], info[i][4]

        # get the location of the variant inside the extracted sequence:
        # index 19 for an acceptor, index length - 21 for a donor
        # (149 for the default length of 170)
        if site == 'acceptor':
            loc = 19
        else:
            loc = length - 21

        # check if the base at the variant position is actually the base it is
        # supposed to be; raise explicitly because `assert` statements are
        # removed when Python runs with -O
        if sequence[loc] != ref:
            raise ValueError(
                f'Reference mismatch for variant at position {var_loc}: '
                f'expected {ref!r} but found {sequence[loc]!r} in the reference sequence'
            )

        # change the base at the variant position to M
        s = sequence[:loc] + 'M' + sequence[loc + 1:]

        # if necessary convert the sequence to the complementary strand
        if reverse:
            s = reverse_sequence(s)

        # write the result to a file; the header has the form '><position><ref>><alt>'
        file.write('>' + str(var_loc) + str(ref) + '>' + str(alt) + '\n' + s + '\n')