In [1]:
import os, glob
import pandas as pd
from io import StringIO
import pysam
import pybedtools 
pybedtools.helpers.set_tempdir('/data/private/pdutta/PSB_Data')

In [2]:
chr_wise_files = glob.glob('/data/private/pdutta/PSB_Data/Acceptor/Chrwise/*.tsv')
reference_path  = "/data/projects/Resources/HumanReferenceGenome/"
output_path = "/data/private/pdutta/PSB_Data/Acceptor/DNABERT_data/" 

In [3]:
def seq2kmer(seq):
    """
    Convert original sequence to kmers
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    k=6
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [7]:
for intersect_file in chr_wise_files:
    # Read the csv file into a DataFrame
    chromosome_name = intersect_file.split('_')[-2]
    if (chromosome_name != 'chrM'):
        print(chromosome_name)
        df = pd.read_csv(intersect_file, header=None, sep= '\t')
        print(df.shape)
        df = df[df[15]<3].reset_index(drop=True)
        print(df.shape)
        df = df.loc[df[14] != '.'].reset_index(drop=True)
        print(df.shape)
        genome = pysam.FastaFile(reference_path+chromosome_name+".fa")
        
        # Lists to hold the reference and alternative sequences
        ref_sequences = []
        alt_sequences = []
        df = df[df[15]>1]
        df = df.head(10)
        # Iterate over DataFrame rows
        for idx, row in df.iterrows():
            try:
                ref_sequence = genome.fetch(row[0], row[1], row[2])
                ref_sequences.append(ref_sequence)
                print(ref_sequence)

                # Calculate the variant position relative to the fetched sequence
                variant_pos = row['VCF_position'] - row['start']


                # Replace the reference nucleotide with the alternate nucleotide to get the alternate sequence
                alt_sequence = ref_sequence[:variant_pos] + row['alt_nucleotide'] + ref_sequence[variant_pos + len(row['ref_nucleotide']):]

                # Fetch the reference sequence
                

                # Generate the alternative sequence
                alt_sequence = ref_sequence[:row[11] - row[1] - 1] + row[16].replace(",", "") + ref_sequence[row[11] - row[1]:]
                alt_sequences.append(alt_sequence)
                print(ref_sequence)
                print(alt_sequence)
            except KeyError:
                # If sequence is not present, append a default value
                ref_sequences.append("NA")
                alt_sequences.append("NA")

        # Add new columns to the DataFrame
        df['ref_sequence'] = ref_sequences
        df['alt_sequence'] = alt_sequences
        merged_list = [i for pair in zip(ref_sequences, alt_sequences) for i in pair]
        # Apply function to all items in the list
        kmer_lst = list(map(seq2kmer, merged_list))

        # Convert to DataFrame
        df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
        values = [0] * (len(df_kmer) // 2) + [1] * (len(df_kmer) // 2)

        # If the DataFrame has an odd number of rows, add one more 0 or 1 to make the length match
        if len(df_kmer) % 2:
            values += [np.random.choice([0, 1])]
        df_kmer['Label'] = values
        print(df)
        print(df_kmer)
        out_folder_path = output_path+chromosome_name
        if not os.path.exists(out_folder_path):
            os.makedirs(out_folder_path)
        df.to_csv(out_folder_path + "/all_data.tsv", sep="\t", index= False)
        df_kmer.to_csv(out_folder_path + "/dev.tsv", sep="\t", index= False)
        print("All the files of ", chromosome_name, "are saved !!!\n ")
        input()
        # Close the reference genome
        genome.close()

chr1
(41016818, 17)


KeyboardInterrupt: Interrupted by user

In [8]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,chr1,24401417,24401496,ENST00000003583.12,-,chr1,24401079,24410779,ENST00000475760,-1,chr1,24401420,24401421,rs767109914,T,1,"C,"
1,chr1,24401417,24401496,ENST00000003583.12,-,chr1,24401079,24410779,ENST00000475760,-1,chr1,24401421,24401421,rs1557461252,.,1,CCACTCCTCAGGTATAGACCCATGAAATACATAAATATGTGTCCCA...
2,chr1,24401417,24401496,ENST00000003583.12,-,chr1,24401079,24410779,ENST00000475760,-1,chr1,24401424,24401425,rs1643224572,T,1,"C,"
3,chr1,24401417,24401496,ENST00000003583.12,-,chr1,24401079,24410779,ENST00000475760,-1,chr1,24401427,24401428,rs1402144573,A,1,"G,"
4,chr1,24401417,24401496,ENST00000003583.12,-,chr1,24401079,24410779,ENST00000475760,-1,chr1,24401433,24401434,rs1643224756,C,1,"G,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41016813,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686682,236764595,ENST00000683111,1,chr1,236762488,236762489,rs727502888,G,1,"A,"
41016814,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686682,236764595,ENST00000683111,1,chr1,236762489,236762490,rs1362873015,G,1,"A,"
41016815,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686682,236764595,ENST00000683111,1,chr1,236762493,236762494,rs370992948,C,1,"T,"
41016816,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686682,236764595,ENST00000683111,1,chr1,236762496,236762497,rs746887157,C,1,"T,"


In [77]:
df[df[16]=='C,T,']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
182,chr1,24383964,24384043,ENST00000003583.12,-,chr1,24357005,24413725,ENST00000003583,-1,chr1,24383964,24383965,rs1642394632,G,2,"C,T,"
200,chr1,24383964,24384043,ENST00000003583.12,-,chr1,24357005,24413725,ENST00000003583,-1,chr1,24384014,24384015,rs771170905,G,2,"C,T,"
213,chr1,24383964,24384043,ENST00000003583.12,-,chr1,24356999,24413782,ENST00000337248,-1,chr1,24383964,24383965,rs1642394632,G,2,"C,T,"
231,chr1,24383964,24384043,ENST00000003583.12,-,chr1,24356999,24413782,ENST00000337248,-1,chr1,24384014,24384015,rs771170905,G,2,"C,T,"
244,chr1,24383964,24384043,ENST00000003583.12,-,chr1,24356999,24415097,ENST00000374409,-1,chr1,24383964,24383965,rs1642394632,G,2,"C,T,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41016626,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686639,236764595,ENST00000684050,1,chr1,236762446,236762447,rs753022801,G,2,"C,T,"
41016667,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686639,236764595,ENST00000684502,1,chr1,236762446,236762447,rs753022801,G,2,"C,T,"
41016708,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686674,236764595,ENST00000682692,1,chr1,236762446,236762447,rs753022801,G,2,"C,T,"
41016749,chr1,236762420,236762499,ENST00000684763.1,+,chr1,236686675,236764595,ENST00000682966,1,chr1,236762446,236762447,rs753022801,G,2,"C,T,"


In [12]:
num_instances = (df[12] >df[2]).sum()
num_instances

175344

In [67]:
df[df[12] >df[2]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
1061,chr1,24361002,24361081,ENST00000003583.12,-,chr1,24319425,24364482,ENST00000350501,1,chr1,24361078,24361086,rs1641082067,AGTCTAGT,1,"AGT,"
1088,chr1,24361002,24361081,ENST00000003583.12,-,chr1,24357005,24413725,ENST00000003583,-1,chr1,24361078,24361086,rs1641082067,AGTCTAGT,1,"AGT,"
1115,chr1,24361002,24361081,ENST00000003583.12,-,chr1,24356999,24413782,ENST00000337248,-1,chr1,24361078,24361086,rs1641082067,AGTCTAGT,1,"AGT,"
1142,chr1,24361002,24361081,ENST00000003583.12,-,chr1,24356999,24415097,ENST00000374409,-1,chr1,24361078,24361086,rs1641082067,AGTCTAGT,1,"AGT,"
1169,chr1,24361002,24361081,ENST00000003583.12,-,chr1,24357005,24416934,ENST00000468303,-1,chr1,24361078,24361086,rs1641082067,AGTCTAGT,1,"AGT,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41014654,chr1,236757445,236757524,ENST00000684763.1,+,chr1,236686639,236764595,ENST00000684050,1,chr1,236757521,236757557,rs1572148914,ATCGCCAGAACCATCAATGAGGTGGAGACTCAGATC,1,"ATC,"
41014680,chr1,236757445,236757524,ENST00000684763.1,+,chr1,236686639,236764595,ENST00000684502,1,chr1,236757521,236757557,rs1572148914,ATCGCCAGAACCATCAATGAGGTGGAGACTCAGATC,1,"ATC,"
41014706,chr1,236757445,236757524,ENST00000684763.1,+,chr1,236686674,236764595,ENST00000682692,1,chr1,236757521,236757557,rs1572148914,ATCGCCAGAACCATCAATGAGGTGGAGACTCAGATC,1,"ATC,"
41014732,chr1,236757445,236757524,ENST00000684763.1,+,chr1,236686675,236764595,ENST00000682966,1,chr1,236757521,236757557,rs1572148914,ATCGCCAGAACCATCAATGAGGTGGAGACTCAGATC,1,"ATC,"


In [42]:
df = df[df[15]<3].reset_index(drop=True)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,chrY,2953868,2953947,ENST00000155093.8,+,chrY,2935381,2982506,ENST00000155093,1,chrY,2953879,2953880,rs2051285710,T,1,"C,"
1,chrY,2953868,2953947,ENST00000155093.8,+,chrY,2935281,2982506,ENST00000383052,1,chrY,2953879,2953880,rs2051285710,T,1,"C,"
2,chrY,2953868,2953947,ENST00000155093.8,+,chrY,2935505,2961286,ENST00000443793,1,chrY,2953879,2953880,rs2051285710,T,1,"C,"
3,chrY,2953868,2953947,ENST00000155093.8,+,chrY,2935651,2981146,ENST00000449237,1,chrY,2953879,2953880,rs2051285710,T,1,"C,"
4,chrY,2953868,2953947,ENST00000155093.8,+,chrY,2935500,2978053,ENST00000469869,1,chrY,2953879,2953880,rs2051285710,T,1,"C,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107283,chrY,13234787,13234866,ENST00000684326.1,-,chrY,13234599,13336129,ENST00000682761,-1,chrY,13234794,13234795,rs1422913673,G,1,"A,"
107284,chrY,13234787,13234866,ENST00000684326.1,-,chrY,13234599,13336129,ENST00000682761,-1,chrY,13234797,13234798,rs1162204554,G,1,"A,"
107285,chrY,13234787,13234866,ENST00000684326.1,-,chrY,13234599,13336129,ENST00000682761,-1,chrY,13234801,13234802,rs1355901236,G,1,"C,"
107286,chrY,13234787,13234866,ENST00000684326.1,-,chrY,13234599,13336129,ENST00000682761,-1,chrY,13234814,13234815,rs1462345242,C,1,"A,"


In [61]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,ref_sequence,alt_sequence
0,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230760,7230761,rs1296032425,G,1,"A,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AAGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
1,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230761,7230762,rs143043931,C,1,"T,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGTCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
2,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230762,7230763,rs758501117,G,1,"A,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGAGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
3,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230764,7230765,rs1264988936,G,1,"T,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGTGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
4,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230765,7230766,rs1282730801,C,1,"T,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGGTCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
5,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230766,7230767,rs766636917,G,1,"A,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGGGAGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
6,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230767,7230768,rs2071530777,T,1,"C,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGGGCCTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
7,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230773,7230774,rs1215823324,T,1,"C,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGGGCGTTGTCCTCTGAAATTTCTTCCTTCACCACCCTGCCAA...
8,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230775,7230776,rs1253332796,T,1,"C,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...
9,chr17,7230758,7230837,ENST00000005340.10,-,chr17,7228007,7234061,ENST00000574143,-1,chr17,7230777,7230778,rs989449469,A,1,"T,",AGGCGGGCGTTGTCATCTGAAATTTCTTCCTTCACCACCCTGCCAA...,AGGCGGGCGTTGTCATCTTAAATTTCTTCCTTCACCACCCTGCCAA...


In [60]:
# Load the reference genome
genome = pysam.FastaFile(reference_path+chromosome_name+".fa")
print(reference_path+intersect_file.split('_')[-2]+".fa")
# This is just for the first row (or the first variant) as an example
row = df.iloc[344]
print(row[0], row[1], row[2], row[16])
# Fetch the reference sequence
ref_sequence = genome.fetch(row[0], row[1], row[2])

# Now replace the reference nucleotide at the VCF position with the alternative nucleotide to get the alternative sequence
alt_sequence = ref_sequence[:row[11] - row[1] - 1] + row[16].replace(",", "") + ref_sequence[row[11] - row[1]:]

print('Ref:', ref_sequence)
print('Alt:', alt_sequence)

genome.close()

/data/projects/Resources/HumanReferenceGenome/chr5.fa


IndexError: single positional indexer is out-of-bounds

In [48]:
list1 = [1, 2, 3, 4, 5]
list2 = ['a', 'b', 'c', 'd', 'e']

# Merge lists so the result contains alternating elements from both lists
merged_list = [i for pair in zip(list1, list2) for i in pair]

print(merged_list)

[1, 'a', 2, 'b', 3, 'c', 4, 'd', 5, 'e']


In [73]:
import pandas as pd
import pysam

# Sample DataFrame
df = pd.DataFrame({
    'chromosome': ['chr1', 'chr2', 'chr3'],
    'start': [110000, 200200, 200300],
    'end': [110100, 200300, 200400],
    'VCF_position': [110002, 200201, 200301],
    'ref_nucleotide': ['C', 'GC', 'A'],
    'alt_nucleotide': ['G', 'T', 'TG']
})

# Load the reference genome
genome = pysam.FastaFile("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa")

# Store the sequences in lists
ref_sequences = []
alt_sequences = []

for _, row in df.iterrows():
    # Fetch the reference sequence from start to end
    ref_sequence = genome.fetch(row['chromosome'], row['start'], row['end'])
    print(ref_sequence)
    
    # Calculate the variant position relative to the fetched sequence
    variant_pos = row['VCF_position'] - row['start']
    

    # Replace the reference nucleotide with the alternate nucleotide to get the alternate sequence
    alt_sequence = ref_sequence[:variant_pos] + row['alt_nucleotide'] + ref_sequence[variant_pos + len(row['ref_nucleotide']):]
    print(alt_sequence)
    print(variant_pos)
    
    # Store the sequences
    ref_sequences.append(ref_sequence)
    alt_sequences.append(alt_sequence)

# Add the sequences to the dataframe
df['ref_sequence'] = ref_sequences
df['alt_sequence'] = alt_sequences

# Print the updated dataframe

genome.close()

ACCATGTCCACCTTTATGCTTTTTAAAGTGAAAAACCATACTAAGAATGAGGCAGCTCAACTTAATAATAAAAACATTTCAAATGTAAAGAAATTTACAA
ACGATGTCCACCTTTATGCTTTTTAAAGTGAAAAACCATACTAAGAATGAGGCAGCTCAACTTAATAATAAAAACATTTCAAATGTAAAGAAATTTACAA
2
GGCAGGCATGGGAAATACAGTGCGTTAAAGGTAGGTTCAGATTGAGTGGAGCAGCTTTAAAGGACTAAGTGTCTGGCCAGTCTTCAATAAACATGAAACT
GTAGGCATGGGAAATACAGTGCGTTAAAGGTAGGTTCAGATTGAGTGGAGCAGCTTTAAAGGACTAAGTGTCTGGCCAGTCTTCAATAAACATGAAACT
1
AAGAGTAGGAAAACATGACAGATTGGGAGGTGTTGAGTTCTGAATTTGGGGACCTAATTGAAATGCAAAGGTATAATCAAGAGAATGAACTTGTCTCTGT
ATGGAGTAGGAAAACATGACAGATTGGGAGGTGTTGAGTTCTGAATTTGGGGACCTAATTGAAATGCAAAGGTATAATCAAGAGAATGAACTTGTCTCTGT
1


In [71]:
df

Unnamed: 0,chromosome,start,end,VCF_position,ref_nucleotide,alt_nucleotide,ref_sequence,alt_sequence
0,chr1,100000,100200,100102,T,G,ACTAAGCACACAGAGAATAATGTCTAGAATCTGAGTGCCATGTTAT...,ACTAAGCACACAGAGAATAATGTCTAGAATCTGAGTGCCATGTTAT...
1,chr2,200200,200300,200201,GC,T,GGCAGGCATGGGAAATACAGTGCGTTAAAGGTAGGTTCAGATTGAG...,GTAGGCATGGGAAATACAGTGCGTTAAAGGTAGGTTCAGATTGAGT...
2,chr3,200300,200400,200301,A,TG,AAGAGTAGGAAAACATGACAGATTGGGAGGTGTTGAGTTCTGAATT...,ATGGAGTAGGAAAACATGACAGATTGGGAGGTGTTGAGTTCTGAAT...


In [22]:
import pandas as pd
from Bio import SeqIO

In [23]:
# Load the genome
genome = SeqIO.to_dict(SeqIO.parse("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa", "fasta"))

In [24]:
def get_sequences(df):
    sequences = []
    for _, row in df.iterrows():
        chrom = row['chromosome-number']
        ref_start = row['varinat_start']
        ref_end = row['variant_end']
        ref_nucleotide = row['ref_neucleotide']
        alts = row['alternative_neucleotide']
        number_of_alts = row['number_of_alts']
        
        # Get reference sequence
        ref_seq = str(genome[chrom].seq[ref_start:ref_end])

        # Ensure the reference sequence matches what we expect
        if ref_seq != ref_nucleotide:
            print(f"Warning: expected reference sequence {ref_nucleotide}, but got {ref_seq}")

        # Replace reference nucleotide with each alternative
        for i in range(number_of_alts):
            alt_seq = ref_seq.replace(ref_nucleotide, alts.split(',')[i])
            sequences.append((ref_seq, alt_seq))

    return sequences

In [100]:
# Your dataframe (sample)
df1 = pd.DataFrame({
    'chromosome-number': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    'chromosome_start': [24401417, 24383964, 234429638, 67049811, 85183161],
    'chromosome_end': [24401496, 24384043, 234429717, 67049890, 85183240],
    'varinat_start': [24401424, 24383964, 234429682, 67049880, 85183235],
    'variant_end': [24401425, 24383965, 234429684, 67049882, 85183239],
    'ref_neucleotide': ['T', 'G', 'AT', 'AT', 'ATAT'],
    'number_of_alts': [1, 2, 1,1,2],
    'alternative_neucleotide': ['C,', 'C,T,', 'ATAT,', ',', 'AT,CGT,']
})

In [101]:
df1

Unnamed: 0,chromosome-number,chromosome_start,chromosome_end,varinat_start,variant_end,ref_neucleotide,number_of_alts,alternative_neucleotide
0,chr1,24401417,24401496,24401424,24401425,T,1,"C,"
1,chr1,24383964,24384043,24383964,24383965,G,2,"C,T,"
2,chr1,234429638,234429717,234429682,234429684,AT,1,"ATAT,"
3,chr1,67049811,67049890,67049880,67049882,AT,1,","
4,chr1,85183161,85183240,85183235,85183239,ATAT,2,"AT,CGT,"


In [102]:
def get_sequences(df):
    sequences = []
    for _, row in df.iterrows():
        chrom = row['chromosome-number']
        ref_start = row['chromosome_start']
        ref_end = row['chromosome_end']
        variant_start = row['varinat_start']
        variant_end = row['variant_end']
        ref_nucleotide = row['ref_neucleotide']
        alts = row['alternative_neucleotide'].rstrip(',').split(',')
        
        # Get reference sequence
        ref_seq = str(genome[chrom].seq[ref_start:ref_end])

        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start

        # Replace reference nucleotide with each alternative
        for alt in alts:
            if alt == "":  # Deletion
                # Fetch additional bases from genome to maintain sequence length
                extra_bases = str(genome[chrom].seq[ref_end:ref_end + len(ref_nucleotide)])
                alt_seq = ref_seq[:variant_pos_start] + ref_seq[variant_pos_end:] + extra_bases
            else:  # SNPs, Insertions, and partial Deletions
                extra_bases = str(genome[chrom].seq[ref_end:ref_end + len(ref_nucleotide) - len(alt)])
                alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases
            sequences.append((ref_seq, alt_seq))

    return sequences

In [103]:
sequences = get_sequences(df1)
for seq in sequences:
    print(seq[0])
    print(seq[1],"\n")

TGTTTGATGAAAAGTTCTACTGCATGTTCTCCTAAGCACCTGAAACAGCAAAACACAGCATTTGTAGAGATCATTTCAC
TGTTTGACGAAAAGTTCTACTGCATGTTCTCCTAAGCACCTGAAACAGCAAAACACAGCATTTGTAGAGATCATTTCAC 

GTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAGGGAAGAGAAGGTGGTGTCATCGAGATACTTCAA
CTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAGGGAAGAGAAGGTGGTGTCATCGAGATACTTCAA 

GTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAGGGAAGAGAAGGTGGTGTCATCGAGATACTTCAA
TTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAGGGAAGAGAAGGTGGTGTCATCGAGATACTTCAA 

TTTCCCCATCCTTGGGAAGATGACGATTCTTCCAAAACACTGAAATAAAAAATAAAGTTACTAGGCCATACTTCCCCAA
TTTCCCCATCCTTGGGAAGATGACGATTCTTCCAAAACACTGAAATATAAAAAATAAAGTTACTAGGCCATACTTCCCCAA 

GCAAATACAGTCATTTTAATACCCCAAGAAAAAGTCTTCCTACAAAACAAAAAATTTTAAAATACACACATGACTTTAT
GCAAATACAGTCATTTTAATACCCCAAGAAAAAGTCTTCCTACAAAACAAAAAATTTTAAAATACACACGACTTTATTC 

CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAAGAAAGAAAGAAAAATACGTTATAAAGCAATATG
CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAAGAAAGAAAGAAAAATACGTTATAAAGCAATGTT 

CCAATTCAGTACTATTTGTAGCAGCC

In [117]:
def get_sequences_new(df):
    new_df = pd.DataFrame(columns=['reference_id', 'varinat_start', 'variant_end', 
                                   'ref_neucleotide', 'alternative_neucleotide', 
                                   'reference_seq', 'alt_seq'])

    for idx, row in df.iterrows():
        chrom = row['chromosome-number']
        ref_start = row['chromosome_start']
        ref_end = row['chromosome_end']
        variant_start = row['varinat_start']
        variant_end = row['variant_end']
        ref_nucleotide = row['ref_neucleotide']
        alts = row['alternative_neucleotide'].rstrip(',').split(',')
        
        # Get reference sequence
        ref_seq = str(genome[chrom].seq[ref_start:ref_end])

        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start

        # Replace reference nucleotide with each alternative
        for alt in alts:
            if alt == "":  # Deletion
                # Fetch additional bases from genome to maintain sequence length
                extra_bases = str(genome[chrom].seq[ref_end:ref_end + len(ref_nucleotide)])
                alt_seq = ref_seq[:variant_pos_start] + ref_seq[variant_pos_end:] + extra_bases
            else:  # SNPs, Insertions, and partial Deletions
                extra_bases = str(genome[chrom].seq[ref_end:ref_end + len(ref_nucleotide) - len(alt)])
                alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases
                
            new_df = new_df.append({'reference_id': chrom,
                                    'varinat_start': variant_start,
                                    'variant_end': variant_end,
                                    'ref_neucleotide': ref_nucleotide,
                                    'alternative_neucleotide': alt,
                                    'reference_seq': ref_seq,
                                    'alt_seq': alt_seq}, ignore_index=True)
            
    return new_df

In [119]:
new_df = get_sequences_new(df1)

  new_df = new_df.append({'reference_id': chrom,
  new_df = new_df.append({'reference_id': chrom,
  new_df = new_df.append({'reference_id': chrom,
  new_df = new_df.append({'reference_id': chrom,
  new_df = new_df.append({'reference_id': chrom,
  new_df = new_df.append({'reference_id': chrom,
  new_df = new_df.append({'reference_id': chrom,


In [120]:
new_df

Unnamed: 0,reference_id,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,chr1,24401424,24401425,T,C,TGTTTGATGAAAAGTTCTACTGCATGTTCTCCTAAGCACCTGAAAC...,TGTTTGACGAAAAGTTCTACTGCATGTTCTCCTAAGCACCTGAAAC...
1,chr1,24383964,24383965,G,C,GTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...,CTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...
2,chr1,24383964,24383965,G,T,GTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...,TTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...
3,chr1,234429682,234429684,AT,ATAT,TTTCCCCATCCTTGGGAAGATGACGATTCTTCCAAAACACTGAAAT...,TTTCCCCATCCTTGGGAAGATGACGATTCTTCCAAAACACTGAAAT...
4,chr1,67049880,67049882,AT,,GCAAATACAGTCATTTTAATACCCCAAGAAAAAGTCTTCCTACAAA...,GCAAATACAGTCATTTTAATACCCCAAGAAAAAGTCTTCCTACAAA...
5,chr1,85183235,85183239,ATAT,AT,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...
6,chr1,85183235,85183239,ATAT,CGT,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...


In [123]:
import pandas as pd
from Bio import SeqIO

# Load the genome
genome = SeqIO.to_dict(SeqIO.parse("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa", "fasta"))


def get_sequences(df):
    data = []
    for idx, row in df.iterrows():
        chrom = row['chromosome-number']
        ref_start = row['chromosome_start']
        ref_end = row['chromosome_end']
        variant_start = row['varinat_start']
        variant_end = row['variant_end']
        ref_nucleotide = row['ref_neucleotide']
        alts = row['alternative_neucleotide'].rstrip(',').split(',')
        
        # Get reference sequence
        ref_seq = str(genome[chrom].seq[ref_start:ref_end])

        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start

        # Replace reference nucleotide with each alternative
        for alt in alts:
            if alt == "":  # Deletion
                # Fetch additional bases from genome to maintain sequence length
                extra_bases = str(genome[chrom].seq[ref_end:ref_end + len(ref_nucleotide)])
                alt_seq = ref_seq[:variant_pos_start] + ref_seq[variant_pos_end:] + extra_bases
            else:  # SNPs, Insertions, and partial Deletions
                extra_bases = str(genome[chrom].seq[ref_end:ref_end + len(ref_nucleotide) - len(alt)])
                alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases
                
            # Append to the list as a dictionary
            data.append({
                'reference_id': chrom,
                'varinat_start': variant_start,
                'variant_end': variant_end,
                'ref_neucleotide': ref_nucleotide,
                'alternative_neucleotide': alt,
                'reference_seq': ref_seq,
                'alt_seq': alt_seq
            })
    
    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(data)
    return new_df

# Your dataframe (sample)
df = pd.DataFrame({
    'chromosome-number': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'],
    'chromosome_start': [24401417, 24383964, 234429638, 67049811, 85183161],
    'chromosome_end': [24401496, 24384043, 234429717, 67049890, 85183240],
    'varinat_start': [24401424, 24383964, 234429682, 67049880, 85183235],
    'variant_end': [24401425, 24383965, 234429684, 67049882, 85183239],
    'ref_neucleotide': ['T', 'G', 'AT', 'AT', 'ATAT'],
    'number_of_alts': [1, 2, 1,1,2],
    'alternative_neucleotide': ['C,', 'C,T,', 'ATAT,', ',', 'AT,CGT,']
})


new_df = get_sequences(df)

In [124]:
new_df

Unnamed: 0,reference_id,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,chr1,24401424,24401425,T,C,TGTTTGATGAAAAGTTCTACTGCATGTTCTCCTAAGCACCTGAAAC...,TGTTTGACGAAAAGTTCTACTGCATGTTCTCCTAAGCACCTGAAAC...
1,chr1,24383964,24383965,G,C,GTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...,CTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...
2,chr1,24383964,24383965,G,T,GTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...,TTGAATAACATTGTAGAACCCAGGTCCTGGGATATCATTCTGAAAG...
3,chr1,234429682,234429684,AT,ATAT,TTTCCCCATCCTTGGGAAGATGACGATTCTTCCAAAACACTGAAAT...,TTTCCCCATCCTTGGGAAGATGACGATTCTTCCAAAACACTGAAAT...
4,chr1,67049880,67049882,AT,,GCAAATACAGTCATTTTAATACCCCAAGAAAAAGTCTTCCTACAAA...,GCAAATACAGTCATTTTAATACCCCAAGAAAAAGTCTTCCTACAAA...
5,chr1,85183235,85183239,ATAT,AT,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...
6,chr1,85183235,85183239,ATAT,CGT,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...,CCAATTCAGTACTATTTGTAGCAGCCAGGATCCCAGAACCTTAAAA...
