In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import data_cleanup as dc
import data_exploration as de
from Bio.Seq import Seq
import severity_score as ses
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
dna_sequence = "ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGAC"
# Transcribe the coding DNA strand into RNA by swapping every thymine (T) for uracil (U).
rna_sequence = dna_sequence.translate(str.maketrans("T", "U"))

In [None]:
# Cut the RNA sequence into consecutive, non-overlapping triplets (codons).
# The chunk count is rounded up, so a trailing chunk shorter than three bases
# is kept if the sequence length is not a multiple of three.
p53_codons = [
    rna_sequence[3 * chunk:3 * chunk + 3]
    for chunk in range((len(rna_sequence) + 2) // 3)
]

p53_codons

In [None]:
# The RNA sequence is sliced into codons and stored as a list in p53_codons.

In [None]:
# All possible codons for TP53: codon variations generated from the wild-type
# codon list by ses.generate_codon_variations (see severity_score for details).
mutated_p53 = ses.generate_codon_variations(p53_codons)
mutated_p53

In [None]:
# Translate all possible codons
aa = ses.translate_codons_df(mutated_p53)
# Show the result of ses.prob_aa_position for position 0
# (presumably the amino-acid probabilities at the first codon — confirm in severity_score).
ses.prob_aa_position(0, aa)

In [None]:
##### Important
## Function was updated on 26.06.23 -> slightly different parameters needed. Usage → see in codon.py
# The DMS score DataFrame is to be converted into a DataFrame that contains only the single mutations.
# In this cell I first try this for a single row before iterating over all of them.

# Variation matrix: codon variations of ses.p53_codons_gia, translated and then cleaned.
p53_var_frame_raw: pd.DataFrame = ses.translate_codons_df(ses.generate_codon_variations(ses.p53_codons_gia))
p53_var_frame = ses.clean_variation_matrix(p53_var_frame_raw)

# Raw DMS data set (path is relative to the notebook's working directory).
gia_null_eto: pd.DataFrame = pd.read_csv('../../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv')
# Transform the raw frame, transpose it, and min-max normalise it (helpers live in data_cleanup).
df = dc.min_max_norm(dc.df_transform(gia_null_eto).T)

# Select the single-mutation entries and order the result by column label.
sel_mut: pd.DataFrame = ses.select_smut(df, p53_var_frame).sort_index(axis=1)

sel_mut

In [None]:
# DMS score matrix: the raw frame is split with dc.df_split and then normalised with dc.norm
# (exact layout and normalisation are defined in data_cleanup).
dms_scores = dc.norm(dc.df_split(gia_null_eto))
dms_scores

In [None]:
# Zero-filled accumulator carrying exactly the same row and column labels as dms_scores.
res = pd.DataFrame(
    np.zeros(dms_scores.shape),
    index=dms_scores.index,
    columns=dms_scores.columns,
)
res

In [None]:
# NOTE(review): `position` is rebound by the later `for position in prob_dict.keys()` loop
# before it is read anywhere in the visible cells — looks like a leftover from
# single-position experiments; confirm before removing.
position = 0
# Alias for the cleaned variation matrix (p53_var_frame).
var_mat_clean = p53_var_frame
# Presumably maps each position to its amino-acid exchange probabilities — verify in severity_score.
prob_dict = ses.exchange_prob_dict(var_mat_clean)

In [None]:
# Display dms_scores again for reference (unchanged since it was computed).
dms_scores

In [None]:
# Add the entry stored for each position in prob_dict onto the matching row of res.
# NOTE: this accumulates in place — re-running the cell without recreating res
# adds the values a second time.
for position, exchange_probs in prob_dict.items():
    res.loc[position] = res.loc[position].add(exchange_probs)

res

In [None]:
# Combine the variation matrix with the DMS scores via ses.prob_smut.
# NOTE(review): the meaning of the positional `True` flag is defined in
# severity_score.prob_smut — consider passing it by keyword for readability.
prob_mut = ses.prob_smut(var_mat_clean, dms_scores, True)

In [None]:
# Severity scores computed directly from the codon list and the raw DMS frame.
# The semantics of bias_dms / include_original are defined in severity_score.dms_smut.
# The variable shares its name with the imported module, but the module is bound
# only as `ses`, so nothing is shadowed here.
severity_score = ses.dms_smut(ses.p53_codons_gia, gia_null_eto, bias_dms=True, include_original=True)

In [None]:
# Sum of prob_mut along axis 1 (one total per row label).
prob_mut.sum(axis=1)

In [None]:
# Sum of severity_score along axis 1 (one total per row label).
severity_score.sum(axis=1)

In [None]:
# Put the severity scores ('smut') next to the DMS scores ('dms'); keep_shape / keep_equal
# retain every row, column and value instead of showing only the differences.
comp = severity_score.compare(
    dms_scores,
    result_names=('smut', 'dms'),
    keep_shape=True,
    keep_equal=True,
)

# Draw the comparison as one wide heatmap, save it next to the notebook, then show it.
fig, ax = plt.subplots(figsize=(50, 10))
sns.heatmap(comp, cmap='seismic', xticklabels=True, ax=ax)
fig.savefig('./comparison_smu_dms.png')
plt.show()

In [None]:
#Updating pre-existing functions to eliminate Bio package

In [None]:
# Standard genetic code: RNA codon -> one-letter amino-acid code ('*' marks a stop codon).
# Defined once here instead of being rebuilt on every call of translate_codon_to_aa.
_CODON_TABLE = {
    'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
    'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'UAU': 'Y', 'UAC': 'Y', 'UAA': '*', 'UAG': '*',
    'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'UGU': 'C', 'UGC': 'C', 'UGA': '*', 'UGG': 'W',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
}


def translate_codon_to_aa(codon: str) -> str:
    """Translate a single codon into its one-letter amino-acid code.

    The lookup is case-insensitive and also accepts DNA codons: the input is
    upper-cased and every 'T' is read as 'U' before the standard genetic code
    table is consulted.

    Parameters
    ----------
    codon : str
        Three-letter RNA or DNA codon, e.g. 'AUG', 'atg' or 'ATG'.

    Returns
    -------
    str
        The one-letter amino-acid code, '*' for a stop codon, or 'Unknown'
        when the input is not a string or is not one of the 64 standard
        codons (wrong length, ambiguous bases, missing values, ...).
    """
    # Missing values (None / NaN) and other non-strings cannot be codons.
    if not isinstance(codon, str):
        return 'Unknown'

    rna_codon = codon.upper().replace('T', 'U')
    return _CODON_TABLE.get(rna_codon, 'Unknown')

In [None]:
def translate_codons_df(df: pd.DataFrame) -> pd.DataFrame:
    """Translate every codon in a DataFrame into its one-letter amino-acid code.

    Each cell of ``df`` is passed through ``translate_codon_to_aa``. All columns
    are translated first and the result is assembled in a single constructor
    call; inserting columns one by one into an empty frame fragments it and
    makes pandas emit ``PerformanceWarning`` on wide inputs.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose cells are codons; one column per sequence position.

    Returns
    -------
    pd.DataFrame
        Frame with the same column labels (in the same order) and the same
        number of rows as ``df``, holding the translated amino acids. The rows
        get a fresh 0..n-1 index; the row labels of ``df`` are not carried over.
    """
    translated_columns = {
        column: [translate_codon_to_aa(codon) for codon in df[column]]
        for column in df.columns
    }
    return pd.DataFrame(translated_columns)

In [None]:
# Sanity check: run the Bio-free translate_codons_df defined in this notebook on the variation frame.
translate_codons_df(mutated_p53)

In [None]:
# Check the amino-acid sequence obtained from the DNA sequence that was taken from an online source.
amino_acids_string = ses.translate_codons_to_string(p53_codons)

print(amino_acids_string)

# BLAST result: the DNA sequence encodes the amino-acid sequence that the Kotler dataset uses as its reference.