## IDEAS FOR VISUALIZATION

-> 1.) Which amino acid caused the largest drops in DMS scores in different datasets?

-> 2.) Which amino acid was the most resistant across the datasets?

-> 3.) Can a trend be observed within the best/worst substitution pairs? Could this be represented in a matrix?

In [None]:
import pandas as pd
import data_cleanup as dc
import matplotlib.pyplot as plt
import seaborn as sns
import domain_comparison as domc
import data_exploration as de

In [None]:
# Load the four p53 deep-mutational-scanning (DMS) datasets from CSV:
# three Giacomelli 2018 conditions (NULL/Etoposide, NULL/Nutlin, WT/Nutlin)
# and the Kotler 2018 dataset. The paths are relative, so they resolve
# against the current working directory.
gia_null_eto: pd.DataFrame = pd.read_csv('../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv')
gia_null_nut: pd.DataFrame = pd.read_csv('../DMS_data/P53_HUMAN_Giacomelli_NULL_Nutlin_2018.csv')
gia_wt_nut: pd.DataFrame = pd.read_csv('../DMS_data/P53_HUMAN_Giacomelli_WT_Nutlin_2018.csv')
kot_hum: pd.DataFrame = pd.read_csv('../DMS_data/P53_HUMAN_Kotler_2018.csv')

In [None]:
# Run every raw dataset through dc.aufteilung_mut_pos. Later cells group the
# resulting frames by their 'AS_old' / 'AS_new' columns and aggregate
# 'DMS_score'.
# NOTE(review): presumably the helper splits the mutation string into old
# residue, position and new residue - confirm in data_cleanup.
gia_null_eto_amp: pd.DataFrame = dc.aufteilung_mut_pos(gia_null_eto)
# Fixed: this frame was previously built from gia_null_eto, which made the
# NULL/Nutlin results a silent copy of the NULL/Etoposide ones.
gia_null_nut_amp: pd.DataFrame = dc.aufteilung_mut_pos(gia_null_nut)
gia_wt_nut_amp: pd.DataFrame = dc.aufteilung_mut_pos(gia_wt_nut)
kot_hum_amp: pd.DataFrame = dc.aufteilung_mut_pos(kot_hum)

-> 1.)

In [None]:
def calculate_average_dms_score(df: pd.DataFrame) -> pd.Series:
    """Return the mean DMS score for every 'AS_new' group.

    Args:
        df: Frame with an 'AS_new' column (grouping key) and a numeric
            'DMS_score' column.

    Returns:
        Series named 'DMS_score', indexed by the distinct 'AS_new' values,
        holding the mean score of each group. Missing scores are ignored,
        exactly as with the former sum / count computation.
    """
    # GroupBy.mean() is the built-in equivalent of sum() / count().
    return df.groupby('AS_new')['DMS_score'].mean()

In [None]:
# Show the result of the data_exploration helper for the Giacomelli
# NULL/Etoposide frame (displayed as the cell output).
# NOTE(review): presumably the average DMS score per new amino acid, judging
# by the helper's name - confirm in data_exploration.
de.calculate_average_dms_score_new(gia_null_eto_amp)

In [None]:
# Same data_exploration helper, applied to the Kotler frame.
de.calculate_average_dms_score_new(kot_hum_amp)

In [None]:
# Same data_exploration helper, applied to the gia_null_nut_amp frame.
de.calculate_average_dms_score_new(gia_null_nut_amp)

In [None]:
# DNA-binding domain of p53: positions handed to domc.slice_domain as
# start/end in the next cell.
# NOTE(review): whether the end position is inclusive depends on
# slice_domain - confirm in domain_comparison.
dna_start = 102
dna_end = 292

In [None]:
# Restrict the Giacomelli NULL/Etoposide frame to the DNA-binding domain.
# NOTE(review): despite the name, dna_list is later grouped like a DataFrame
# (it is passed to calculate_average_dms_score) - confirm slice_domain's
# return type.
dna_list = domc.slice_domain(gia_null_eto_amp, start= dna_start, end= dna_end)

In [None]:
# Mean DMS score per 'AS_new' value, restricted to the DNA-binding-domain slice.
calculate_average_dms_score(dna_list)

-> 3.)

## GIA NULL ETO

In [None]:
# Mean-DMS-score substitution matrix for the Giacomelli NULL/Etoposide data.
fpathGNE = '../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv'
dfGNE = pd.read_csv(fpathGNE)
mutations_dfGNE = dc.aufteilung_mut_pos(dfGNE)

# One group per (AS_old, AS_new) pair, reduced to its mean score.
subs_df = mutations_dfGNE.groupby(["AS_old", "AS_new"])
mean_scoresGNE = subs_df["DMS_score"].mean()
mean_scores_dfGNE = mean_scoresGNE.reset_index()

# Wide layout: rows are AS_old, columns are AS_new, cells are mean scores.
mean_substitutionsGNE = mean_scores_dfGNE.pivot(
    index="AS_old",
    columns="AS_new",
    values="DMS_score",
)
# NOTE(review): the return value is only displayed, not assigned; if
# dc.rmv_na does not work in place, mean_substitutionsGNE keeps its NaNs.
dc.rmv_na(mean_substitutionsGNE)

## GIA NULL NUT

In [None]:
# Mean-DMS-score substitution matrix for the Giacomelli NULL/Nutlin data.
fpathGNN = '../DMS_data/P53_HUMAN_Giacomelli_NULL_Nutlin_2018.csv'
dfGNN = pd.read_csv(fpathGNN)
mutations_dfGNN = dc.aufteilung_mut_pos(dfGNN)

# One group per (AS_old, AS_new) pair, reduced to its mean score.
subs_df = mutations_dfGNN.groupby(["AS_old", "AS_new"])
mean_scoresGNN = subs_df["DMS_score"].mean()
mean_scores_dfGNN = mean_scoresGNN.reset_index()

# Wide layout: rows are AS_old, columns are AS_new, cells are mean scores.
mean_substitutionsGNN = mean_scores_dfGNN.pivot(
    index="AS_old",
    columns="AS_new",
    values="DMS_score",
)
# NOTE(review): the return value is only displayed, not assigned; if
# dc.rmv_na does not work in place, mean_substitutionsGNN keeps its NaNs.
dc.rmv_na(mean_substitutionsGNN)

## GIA WT NUT

In [None]:
# Mean-DMS-score substitution matrix for the Giacomelli WT/Nutlin data.
fpathGWN = '../DMS_data/P53_HUMAN_Giacomelli_WT_Nutlin_2018.csv'
dfGWN = pd.read_csv(fpathGWN)
mutations_dfGWN = dc.aufteilung_mut_pos(dfGWN)

# One group per (AS_old, AS_new) pair, reduced to its mean score.
subs_df = mutations_dfGWN.groupby(["AS_old", "AS_new"])
mean_scoresGWN = subs_df["DMS_score"].mean()
mean_scores_dfGWN = mean_scoresGWN.reset_index()

# Wide layout: rows are AS_old, columns are AS_new, cells are mean scores.
mean_substitutionsGWN = mean_scores_dfGWN.pivot(
    index="AS_old",
    columns="AS_new",
    values="DMS_score",
)
# NOTE(review): the return value is only displayed, not assigned; if
# dc.rmv_na does not work in place, mean_substitutionsGWN keeps its NaNs.
dc.rmv_na(mean_substitutionsGWN)

## KOT HUM

In [None]:
# Mean-DMS-score substitution matrix for the Kotler 2018 data.
# Fixed: the path previously pointed at the Giacomelli WT/Nutlin CSV, so the
# "KOT HUM" matrix was a duplicate of the GWN one.
fpathKH = '../DMS_data/P53_HUMAN_Kotler_2018.csv'
dfKH = pd.read_csv(fpathKH)
mutations_dfKH = dc.aufteilung_mut_pos(dfKH)

# One group per (AS_old, AS_new) pair, reduced to its mean score.
subs_df = mutations_dfKH.groupby(["AS_old", "AS_new"])
mean_scoresKH = subs_df.DMS_score.mean()
mean_scores_dfKH = mean_scoresKH.reset_index()

# Wide layout: rows are AS_old, columns are AS_new, cells are mean scores.
mean_substitutionsKH = mean_scores_dfKH.pivot(index="AS_old", columns="AS_new", values="DMS_score")
# NOTE(review): the return value is only displayed, not assigned; if
# dc.rmv_na does not work in place, mean_substitutionsKH keeps its NaNs.
dc.rmv_na(mean_substitutionsKH)

In [None]:
def hmap(frame: pd.DataFrame) -> None:
    """Plot DMS scores as a heatmap of 'AS_new' against (position, 'AS_old').

    Args:
        frame: Frame that, after ``reset_index()``, provides the columns
            'AS_new', 'position_mut', 'AS_old' and 'DMS_score'.

    The figure is shown immediately. Note that ``sns.set`` changes the
    global seaborn font scale as a side effect.
    """
    # Reshape first, so a failing pivot never leaves an empty figure behind.
    wide_scores = frame.reset_index().pivot(
        index='AS_new',
        columns=['position_mut', 'AS_old'],
        values='DMS_score',
    )

    plt.figure(figsize=(50, 8))
    sns.set(font_scale=2)
    sns.heatmap(wide_scores, cmap='seismic')
    plt.title('DMS Scores for Mutations')
    plt.show()



In [None]:
# Preview the first 20 rows of the NULL/Etoposide substitution matrix.
mean_substitutionsGNE.head(20)

In [None]:
# Annotated heatmap of the NULL/Etoposide substitution matrix. The same
# plotting steps are wrapped in hmap_mean_variance further down.
df = mean_substitutionsGNE
plt.figure(figsize=(50, 10))
# annot/fmt print every cell value with two decimals.
sns.heatmap(df, cmap='coolwarm', annot=True, fmt=".2f", linewidths=0.5)
plt.title('Heatmap of AS_new vs AS_old')
# The matrix was pivoted with AS_old as index (rows) and AS_new as columns.
plt.xlabel('AS_new')
plt.ylabel('AS_old')

plt.show()

In [None]:
def hmap_mean_variance(df: pd.DataFrame) -> None:
    """Draw an annotated heatmap of a substitution matrix and show it.

    Args:
        df: Matrix to plot. The axis labels assume its rows are 'AS_old'
            values and its columns are 'AS_new' values.

    The parameter used to be declared as ``df = pd.DataFrame``, which made
    the DataFrame class a default value instead of a type annotation. It is
    now a required, annotated argument.
    """
    plt.figure(figsize=(50, 10))
    # annot/fmt print every cell value with two decimals.
    sns.heatmap(df, cmap='coolwarm', annot=True, fmt=".2f", linewidths=0.5)
    plt.title('Heatmap of AS_new vs AS_old')
    plt.xlabel('AS_new')
    plt.ylabel('AS_old')
    plt.show()

In [None]:
# One annotated heatmap per substitution matrix built above
# (GNE, GNN, GWN, KH), shown in that order.
hmap_mean_variance(mean_substitutionsGNE)
hmap_mean_variance(mean_substitutionsGNN)
hmap_mean_variance(mean_substitutionsGWN)
hmap_mean_variance(mean_substitutionsKH)

In [None]:
# Preview the first 20 rows of the NULL/Etoposide frame returned by
# dc.aufteilung_mut_pos.
gia_null_eto_amp.head(20)

In [None]:
def calculate_average_dms_score_old(df: pd.DataFrame) -> pd.Series:
    """Return the mean DMS score for every 'AS_old' group.

    Args:
        df: Frame with an 'AS_old' column (grouping key) and a numeric
            'DMS_score' column.

    Returns:
        Series named 'DMS_score', indexed by the distinct 'AS_old' values,
        holding the mean score of each group. Missing scores are ignored,
        exactly as with the former sum / count computation.
    """
    # GroupBy.mean() is the built-in equivalent of sum() / count().
    return df.groupby('AS_old')['DMS_score'].mean()


In [None]:
# Mean DMS score per 'AS_old' value for the NULL/Etoposide frame.
# This call uses the single-DataFrame signature, so it has to run before the
# next cell redefines calculate_average_dms_score_old to take (name, frame)
# pairs.
calculate_average_dms_score_old(gia_null_eto_amp)

In [None]:
def calculate_average_dms_score_old(*args):
    """Return per-'AS_old' mean DMS scores of several datasets side by side.

    NOTE: this reuses the name of the single-DataFrame function defined
    earlier in the file and replaces it once this definition has run.

    Args:
        *args: ``(label, frame)`` pairs. ``label`` becomes the column name
            and ``frame`` must provide 'AS_old' and 'DMS_score' columns.

    Returns:
        DataFrame with one column per label, aligned on the 'AS_old' values.
        Calling it without arguments yields an empty DataFrame.
    """
    # Index access (rather than unpacking) keeps accepting entries that carry
    # more than two items, as the original loop did.
    results = {
        entry[0]: entry[1].groupby('AS_old')['DMS_score'].mean()
        for entry in args
    }
    return pd.DataFrame(results)

In [None]:
# Compare the per-'AS_old' mean DMS scores of all four datasets in one table
# (one column per dataset label).
calculate_average_dms_score_old(('GNE', gia_null_eto_amp), ('GNN', gia_null_nut_amp), ('GWN', gia_wt_nut_amp), ('KH', kot_hum_amp))


In [None]:
# This function shows the amino acids that, when mutated into, caused the greatest drop or increase in DMS score.