In [None]:
import pandas as pd
import data_cleanup as dc
import domain_comparison as doc
import data_exploration as de
import seaborn as sns
import matplotlib.pyplot as plt
import severity_score as ses
import numpy as np
from scipy.stats import gaussian_kde

### In this notebook, the same operations as in all_dms are performed. Here, the amino acid substitutions that cannot be reached by a single mutation of the DNA are removed beforehand.

We start by generating the new DataFrame that contains only the single mutations.

In [None]:
fpath = '../../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv'

# Read the raw DMS table and normalise it. aufteilung_mut_pos is called with its
# return value discarded; presumably it adds the position_mut / AS_old / AS_new
# columns in place, which the row filter below reads -- TODO confirm.
gia_null_eto = dc.norm(pd.read_csv(fpath))
dc.aufteilung_mut_pos(gia_null_eto)

# Codon sequence -> codon variations -> translated table -> cleaned table.
p53_seq = ses.p53_codons_gia
translated_p53 = ses.clean_variation_matrix(
    ses.translate_codons_df(
        ses.generate_codon_variations(p53_seq)
    )
)

# NOTE(review): translated_p53 has already gone through clean_variation_matrix
# once; this second pass is kept as-is because it is not visible from here
# whether the cleaning is idempotent.
cleaned_p53 = ses.clean_variation_matrix(translated_p53)

# Column names 'Variation 1' ... 'Variation 9' of the cleaned table.
variation_columns = [f'Variation {number}' for number in range(1, 10)]


def _reachable_by_single_mutation(row):
    """Tell whether one DMS row is listed in the cleaned variation table.

    A row is kept when the table's 'Original' entry at index
    ``position_mut - 1`` equals the row's ``AS_old`` and the row's ``AS_new``
    occurs among that index's 'Variation 1'..'Variation 9' entries.
    Assumes position_mut is 1-based and the table is 0-indexed -- TODO confirm.
    """
    table_index = row['position_mut'] - 1
    if not (cleaned_p53.loc[table_index, 'Original'] == row['AS_old']):
        return False
    return row['AS_new'] in cleaned_p53.loc[table_index, variation_columns].values


df = gia_null_eto[gia_null_eto.apply(_reachable_by_single_mutation, axis=1)]

In [None]:
# Domain lookup taken from domain_comparison; adjust_domain reads items [0] and
# [1] of each entry as the start and end passed to doc.slice_domain.
domains_regulatory = doc.domains_regulatory

def adjust_domain(regulatory: dict, name: str, frame: pd.DataFrame) -> dict:
    """Slice one domain out of ``frame`` and bundle the derived tables.

    Parameters
    ----------
    regulatory : dict
        Lookup whose entry ``regulatory[name]`` provides the slice bounds:
        item [0] is passed as ``start`` and item [1] as ``end``.
    name : str
        Key of the domain to extract from ``regulatory``.
    frame : pd.DataFrame
        Table handed to ``doc.slice_domain``.

    Returns
    -------
    dict
        'domain'      -- the slice after ``dc.df_transform`` and ``dc.rmv_na``,
        'domain_list' -- the slice as returned by ``doc.slice_domain``,
        'mean'        -- ``domain.mean()`` renamed to 'mean'.
    """
    lower, upper = regulatory[name][0], regulatory[name][1]

    # NOTE(review): the slice is taken twice so that the transform pipeline
    # never receives the object returned as 'domain_list'; if dc.df_transform
    # does not modify its input, a single slice could be shared -- confirm.
    matrix = dc.rmv_na(
        dc.df_transform(doc.slice_domain(frame, start=lower, end=upper))
    )
    long_table = doc.slice_domain(frame, start=lower, end=upper)

    return {
        'domain': matrix,
        'domain_list': long_table,
        'mean': matrix.mean().rename('mean'),
    }


In [None]:
# Build one adjust_domain bundle per entry of domains_regulatory, always from
# the single-mutation frame `df`.
t1_domain, t2_domain, pr_domain, dna_domain, tetra_domain, reg_domain = (
    adjust_domain(domains_regulatory, name=domain_key, frame=df)
    for domain_key in (
        't1_domain',
        't2_domain',
        'pr_domain',
        'dna_b_domain',
        'tetra_domain',
        'reg_domain',
    )
)



## Mean values

In [None]:

# Pull the 'mean' series out of every domain bundle.
t1_mean, t2_mean, pr_mean, dna_mean, tetra_mean, reg_mean = (
    bundle['mean']
    for bundle in (
        t1_domain,
        t2_domain,
        pr_domain,
        dna_domain,
        tetra_domain,
        reg_domain,
    )
)


## Distribution and Heatmaps
### Transactivation Domain 1

In [None]:
def distr_and_hmap(domain_reg: dict, domain_name: str) -> None:
    """Show the DMS-score histogram, its mean/median, and the score heatmap.

    Parameters
    ----------
    domain_reg : dict
        Bundle with a 'domain_list' entry exposing a 'DMS_score' column and a
        'domain' entry that seaborn can draw as a heatmap (the structure
        returned by ``adjust_domain``).
    domain_name : str
        Name inserted into the plot titles.

    Returns
    -------
    None
        The function only draws two figures and prints the mean and median.
    """
    dms_scores = domain_reg['domain_list']['DMS_score']

    # Each plot gets its own explicit figure/axes. Relying on the implicit
    # "current figure" left the heatmap unshown, so a second call in the same
    # cell would draw its histogram on top of the previous heatmap.
    _, hist_ax = plt.subplots()
    hist_ax.hist(dms_scores, bins=50)
    hist_ax.set_xlabel('DMS Score')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Distribution of DMS Scores in the {domain_name} Domain')
    plt.show()

    print(f'Mean: {dms_scores.mean()}')
    print(f'Median: {dms_scores.median()}')

    _, heat_ax = plt.subplots()
    sns.heatmap(domain_reg['domain'], ax=heat_ax)
    # A title lets the heatmap stand alone when the notebook is skimmed.
    heat_ax.set_title(f'DMS Scores in the {domain_name} Domain')
    plt.show()

In [None]:
# Histogram, mean/median printout and heatmap for the t1_domain bundle.
distr_and_hmap(t1_domain, 't1')

### Transactivation Domain 2

In [None]:
# Reuse the bundle already built for 't2_domain' from the same inputs instead
# of recomputing it; `t2` stays available as an alias.
t2 = t2_domain
distr_and_hmap(t2, 'T2')

### Proline-rich Region

In [None]:
# Reuse the bundle already built for 'pr_domain' from the same inputs instead
# of recomputing it; `pr` stays available as an alias.
pr = pr_domain
distr_and_hmap(pr, 'Proline-rich Region')

### DNA Binding Domain

In [None]:
# `dna_domain` already holds the bundle for the 'dna_b_domain' key built from
# the same inputs; alias it rather than recomputing.
dna_b_domain = dna_domain
distr_and_hmap(dna_b_domain, 'DNA Binding')

### Tetramerization Domain

In [None]:
# Reuse the bundle already built for 'tetra_domain' from the same inputs
# instead of recomputing it; `tetra` stays available as an alias.
tetra = tetra_domain
distr_and_hmap(tetra, 'Tetramerisation')

### Regulatory Domain

In [None]:
# reg_domain was already built from the same inputs in the bundle cell, so it
# is plotted directly instead of being rebuilt and rebound here.
distr_and_hmap(reg_domain, 'Regulatory')

In [None]:
# Inspect the 'domain' matrix that adjust_domain built for the t1_domain entry.
t1_domain['domain']

## Comparing all DMS score distributions

In [None]:
def comp_dms_distr(regulatory: dict, frame, labels: dict = None) -> None:
    """Overlay a Gaussian KDE of the DMS scores of every domain in one figure.

    Parameters
    ----------
    regulatory : dict
        Domain lookup; each key is passed to ``adjust_domain`` together with
        ``frame`` and the resulting 'domain_list' -> 'DMS_score' values are
        used as that domain's sample.
    frame
        Table forwarded unchanged to ``adjust_domain``.
    labels : dict, optional
        Extra or overriding ``{domain key: legend label}`` entries. Keys
        without a label fall back to the key itself.

    Returns
    -------
    None
        The function only draws and shows the figure.
    """
    # Legend text is looked up by key. Zipping a fixed label list against the
    # dict order would mislabel curves when the key order differs and would
    # silently drop curves when there are more keys than labels.
    label_for = {
        't1_domain': 'Transactivation domain 1',
        't2_domain': 'Transactivation domain 2',
        'pr_domain': 'Proline rich region',
        'dna_b_domain': 'DNA binding domain',
        'tetra_domain': 'Tetramerization domain',
        'reg_domain': 'Regulatory domain',
        **(labels or {}),
    }

    domain_keys = list(regulatory.keys())
    regions = [
        adjust_domain(regulatory, elem, frame)['domain_list']['DMS_score']
        for elem in domain_keys
    ]
    dms_scores = np.concatenate(regions)

    # Common evaluation grid spanning all domains; identical for every curve,
    # so it is computed once outside the loop.
    x_vals = np.linspace(np.min(dms_scores), np.max(dms_scores), 1000)

    fig, ax = plt.subplots(figsize=(10, 6))
    for elem, data in zip(domain_keys, regions):
        kde = gaussian_kde(data)
        ax.plot(x_vals, kde(x_vals), linewidth=2, label=label_for.get(elem, elem))

    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
    ax.set_title('Different Domains with only Single Mutation DMS Scores')
    ax.legend()

    plt.show()
    return None

In [None]:
# Overlay the per-domain KDE curves for the single-mutation frame `df`.
comp_dms_distr(domains_regulatory, df)

## Standard deviation


In [None]:
# NOTE(review): each value is the standard deviation of a domain's 'mean'
# series (the column means of its matrix), not of the raw DMS scores --
# confirm this is the intended quantity.
t1_std, t2_std, pr_std, dna_std, tetra_std, reg_std = (
    mean_series.std()
    for mean_series in (t1_mean, t2_mean, pr_mean, dna_mean, tetra_mean, reg_mean)
)

# One-row summary table with one column per domain.
std_combined = pd.DataFrame({
    column: [value]
    for column, value in zip(
        ('t1_std', 't2_std', 'pr_std', 'dna_std', 'tetra_std', 'reg_std'),
        (t1_std, t2_std, pr_std, dna_std, tetra_std, reg_std),
    )
})

std_combined

## Mean value matrix

This section creates a matrix showing the mean substitution DMS score for each amino acid. For further information, see the data_exploration folder.

In [None]:
# Heatmap of de.mean_substitutions (after dc.rmv_na) for the t1_domain rows.
sns.heatmap(dc.rmv_na(de.mean_substitutions(t1_domain['domain_list'])))

In [None]:
# Heatmap of de.mean_substitutions (after dc.rmv_na) for the t2_domain rows.
sns.heatmap(dc.rmv_na(de.mean_substitutions(t2_domain['domain_list'])))

In [None]:
# Heatmap of de.mean_substitutions (after dc.rmv_na) for the pr_domain rows.
sns.heatmap(dc.rmv_na(de.mean_substitutions(pr_domain['domain_list'])))

In [None]:
# Heatmap of de.mean_substitutions (after dc.rmv_na) for the dna_domain rows.
sns.heatmap(dc.rmv_na(de.mean_substitutions(dna_domain['domain_list'])))

In [None]:
# Heatmap of de.mean_substitutions (after dc.rmv_na) for the tetra_domain rows.
sns.heatmap(dc.rmv_na(de.mean_substitutions(tetra_domain['domain_list'])))

In [None]:
# Heatmap of de.mean_substitutions (after dc.rmv_na) for the reg_domain rows.
sns.heatmap(dc.rmv_na(de.mean_substitutions(reg_domain['domain_list'])))

## Shapiro Wilk test

In [None]:
# Display the domain lookup whose keys the normality test iterates over.
domains_regulatory

In [None]:
def shapiro_wilk_test(regulatory: dict, frame) -> None:
    regions = [adjust_domain(regulatory, elem, frame)['domain_list']['DMS_score'] for elem in regulatory.keys()]
    reg_names = list(regulatory.keys())
    counter = 0
    for region in regions:
        print(reg_names[counter])
        counter += 1
        print(doc.test_normality(region), '\n')
    return None

In [None]:
# Print the doc.test_normality result of every domain for the single-mutation frame `df`.
shapiro_wilk_test(domains_regulatory, df)