In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [96]:
# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Forward slashes keep the relative path portable: they resolve on Windows as
# well as on Linux/macOS, whereas backslash separators are Windows-only.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks
# like a saved index -- consider index_col=0 after confirming against the CSV.
df = pd.read_csv('../datasets/aku_prin_v2.0_cleaned.csv')
df.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2
0,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
1,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13
2,S47L,c.140C>T,ex3,S47L,c.140C>T,ex3
3,W97C,c.291G>C,ex5,W97C,c.291G>C,ex5
4,R225P,c.674G>C,ex10,I216T,c.647T>C,ex9


In [97]:
# Protein changes recorded for allele 1. The column key includes a trailing
# space (presumably matching the CSV header -- verify against the file).
mutations_allele1 = df['Protein change allele 1 ']

# Example mutation in "<original residue><position><new residue>" notation.
mutation = 'G161R'

# Only the last expression of a cell is displayed, so the preview goes here.
mutations_allele1.head()

In [98]:
import Bio
from Bio import SeqIO

from Bio.Seq import Seq, MutableSeq

def read_faa_file(file_path):
    """Load every record of a FASTA-formatted file.

    Args:
        file_path: Location of the file handed to ``SeqIO.parse``.

    Returns:
        list: The parsed records, in the order the parser yields them.
    """
    return list(SeqIO.parse(file_path, "fasta"))

# Path to the protein FASTA (.faa) file. Forward slashes keep the relative
# path portable across Windows, Linux and macOS; backslashes are Windows-only.
file_path = "../datasets/HGD_datasets/ncbi_dataset/data/protein.faa"

# Read the .faa file
sequences = read_faa_file(file_path)

sequences

[SeqRecord(seq=Seq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...EPN'), id='NP_000178.2', name='NP_000178.2', description='NP_000178.2 HGD [organism=Homo sapiens] [GeneID=3081]', dbxrefs=[]),
 SeqRecord(seq=Seq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...EPN'), id='XP_005247469.1', name='XP_005247469.1', description='XP_005247469.1 HGD [organism=Homo sapiens] [GeneID=3081] [isoform=X1]', dbxrefs=[]),
 SeqRecord(seq=Seq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...NLA'), id='XP_005247470.1', name='XP_005247470.1', description='XP_005247470.1 HGD [organism=Homo sapiens] [GeneID=3081] [isoform=X2]', dbxrefs=[]),
 SeqRecord(seq=Seq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...VNL'), id='XP_005247471.1', name='XP_005247471.1', description='XP_005247471.1 HGD [organism=Homo sapiens] [GeneID=3081] [isoform=X6]', dbxrefs=[]),
 SeqRecord(seq=Seq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...HAI'), id='XP_011511048.1', name='XP_0115

In [99]:
# Wrap the first record's sequence in a MutableSeq so that single residues
# can be reassigned by item assignment.
hgd_protein = MutableSeq(sequences[0].seq)
hgd_protein


MutableSeq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...EPN')

In [100]:
def replace(sequence: "MutableSeq", mutation: str, verbose=False) -> "MutableSeq | None":
    """
    Apply a single-residue substitution to a copy of ``sequence``.

    ``mutation`` uses the notation ``A000B``: ``A`` is the residue expected in
    the sequence, ``000`` is its 1-based position and ``B`` is the replacement
    residue (e.g. ``G161R``). Both letters are upper-cased before use.

    Args:
        sequence: Mutable sequence to mutate. It is never modified; a slice
            copy is edited and returned instead.
        mutation: Substitution in ``A000B`` form.
        verbose: When true, print the reason a mutation was rejected.

    Returns:
        The mutated copy, or ``None`` when the mutation is malformed, its
        position lies outside the sequence, or the residue found at that
        position differs from the one named in ``mutation``.
    """
    invalid_format = 'Invalid mutation format: should be AnB where n is an integer, representing the position, A and B two letters representing an aminoacid'

    # The shortest valid form is "A1B"; this also rejects non-strings (e.g. NaN)
    # and empty strings, which would otherwise fail on mutation[0].
    if not isinstance(mutation, str) or len(mutation) < 3:
        if verbose:
            print(invalid_format)
        return None

    original = mutation[0].upper()
    mut = mutation[-1].upper()

    try:
        position = int(mutation[1:-1])
    except ValueError:
        # Only a non-numeric position is expected here; anything else should surface.
        if verbose:
            print(invalid_format)
        return None

    # Positions are 1-based. Zero or negative values must be rejected explicitly,
    # otherwise position - 1 would wrap around through negative indexing.
    if position < 1:
        if verbose:
            print('Invalid position: positions are 1-based')
        return None

    # position == len(sequence) is the last valid residue; anything larger is out of range.
    if position > len(sequence):
        if verbose:
            print('Sequence too short')
        return None

    if sequence[position - 1] != original:
        if verbose:
            print(f' aminoacid in the sequence and in the mutation do not correspond: original aminoacid: {sequence[position-1]}')
        return None

    # Copy only once the mutation is known to be applicable.
    sequence_copy = sequence[:]
    sequence_copy[position - 1] = mut
    return sequence_copy

In [101]:
# Stack the protein changes of both alleles into one Series (allele 1 first,
# then allele 2) and discard missing entries.
mutations = pd.concat(
    [df[column] for column in ('Protein change allele 1 ', 'Protein change allele 2')]
).dropna()

print(mutations)

0     G161R
1     G161R
2      S47L
3      W97C
4     R225P
      ...  
48     R53Q
49    Y272C
50     D18Y
51    E401Q
52    G360R
Length: 106, dtype: object


In [120]:
# Map each mutation to its mutated sequence, keeping only the mutations that
# replace() could apply (it returns None for the ones it rejects).
mutations_dict = {
    name: mutated
    for name, mutated in ((entry, replace(hgd_protein, entry)) for entry in mutations)
    if mutated is not None
}

In [126]:
# Lift the pandas column-width limit for displayed frames.
pd.set_option('display.max_colwidth', None)

# Print every mutation next to its full mutated sequence as plain text.
for key, seq in mutations_dict.items():
    seq = str(seq)  # MutableSeq -> str
    print(f'{key}: {seq}')

# Number of mutations that were applied successfully.
len(mutations_dict)

G161R: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKRNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN
S47L: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGLAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN
W97C: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVD

44

In [130]:
# Save the mutated sequences, one "<mutation>,<sequence>" row per line.
# Forward slashes keep the relative path portable across Windows, Linux and macOS.
with open('../datasets/HGD_datasets/ncbi_dataset/data/mutated_sequences.csv', 'w') as f:
    for key, seq in mutations_dict.items():
        # Convert the MutableSeq to a plain string.
        seq = str(seq)
        # No space after the comma: a CSV reader would otherwise treat it as
        # the first character of the sequence field.
        f.write(f'{key},{seq}\n')