#### PCA, Hierarchical clustering and plotting implemented for each domain
In this notebook, we perform a PCA dimensionality reduction followed by hierarchical clustering and plot the results for the different domains. To plot a particular domain, select it in the cell labeled "### EDIT FOR THE DOMAIN OF YOUR LIKING ###".

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import data_exploration as de
import data_cleanup as dc
import domain_comparison as doc

In [None]:
# Table read from aminoacids.csv.
aa = pd.read_csv(
    '../DMS_data/aminoacids.csv'
)

# DMS data set for p53, passed through dc.norm directly after loading.
df: pd.DataFrame = dc.norm(
    pd.read_csv('../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv')
)

# NOTE(review): `df` is already the output of dc.norm, so dc.norm is applied a
# second time here. Confirm that dc.norm is idempotent or that this is intended.
# dc.aufteilung_mut_pos presumably splits mutation and position information
# ("Aufteilung" = "splitting") - verify in data_cleanup.
norm_frame = dc.aufteilung_mut_pos(
    dc.norm(df)
)

In [None]:
# Start/end boundaries of the p53 domains analysed in this notebook.
# The individual *_start / *_end names are kept so other cells can keep using them.

# Transactivation Domain 1
t1_start = 1
t1_end = 43

# Transactivation Domain 2
t2_start = 44
t2_end = 63

# Proline-rich Region
pr_start = 64
pr_end = 92

# DNA Binding Domain
dna_start = 102
dna_end = 292

# Tetramerization Domain
tetra_start = 320
tetra_end = 355

# Regulatory Domain
reg_start = 356
reg_end = 393

# Single lookup table of all domains: short key -> (start, end).
# Supporting a further domain only needs one more entry here.
DOMAIN_BOUNDS = {
    "t1": (t1_start, t1_end),
    "t2": (t2_start, t2_end),
    "pr": (pr_start, pr_end),
    "dna": (dna_start, dna_end),
    "tetra": (tetra_start, tetra_end),
    "reg": (reg_start, reg_end),
}


def build_domain_data(dms_frame: pd.DataFrame, start: int, end: int):
    """Slice one domain out of ``dms_frame`` and derive its three working objects.

    Parameters
    ----------
    dms_frame
        Frame handed to ``doc.slice_domain`` (in this notebook: ``df``).
    start, end
        Boundaries forwarded unchanged to ``doc.slice_domain``.
        NOTE(review): whether ``end`` is inclusive is decided inside
        ``doc.slice_domain`` - confirm there.

    Returns
    -------
    tuple
        ``(domain, sliced, dist)``: ``domain`` is the slice after
        ``dc.df_transform`` and ``dc.rmv_na``, ``sliced`` is the plain slice, and
        ``dist`` is ``de.dms_distance_matrix_wt(sliced)``.
    """
    transformed = dc.rmv_na(
        dc.df_transform(doc.slice_domain(dms_frame, start=start, end=end))
    )
    # Sliced a second time on purpose: the distance matrix is computed from a
    # fresh slice, exactly as before the refactoring. Whether dc.df_transform
    # modifies its argument in place is not visible from this notebook, so the
    # first slice is not reused.
    sliced = doc.slice_domain(dms_frame, start=start, end=end)
    return transformed, sliced, de.dms_distance_matrix_wt(sliced)


# All domains are processed in the order of DOMAIN_BOUNDS (dicts keep insertion order).
# A comprehension is used so that no loop variables leak into the notebook namespace.
domain_data = {
    key: build_domain_data(df, bounds[0], bounds[1])
    for key, bounds in DOMAIN_BOUNDS.items()
}

# Individual names kept for the cells further down (e.g. `dna_dist`, `t2_dist`).
t1_domain, t1_list, t1_dist = domain_data["t1"]
t2_domain, t2_list, t2_dist = domain_data["t2"]
pr_domain, pr_list, pr_dist = domain_data["pr"]
dna_domain, dna_list, dna_dist = domain_data["dna"]
tetra_domain, tetra_list, tetra_dist = domain_data["tetra"]
reg_domain, reg_list, reg_dist = domain_data["reg"]

In [None]:
# Note: some domains don't contain all substitutions, so their distance matrices
# are of lower dimension than 20x20. Thus, there cannot be 20 data points in the
# plots created further down.
t2_dist.shape

In [None]:
### EDIT FOR THE DOMAIN OF YOUR LIKING ###

# Text used in the titles of the plots produced by the following cells.
title = "DNA-binding domain of p53"

# Distance matrix the following cells work on. The matrices built above are
# t1_dist, t2_dist, pr_dist, dna_dist, tetra_dist and reg_dist; keep `title`
# in sync with the matrix chosen here.
domain = dna_dist


In [None]:
# Hierarchical-clustering plot for the selected domain (drawn by data_exploration.plot_hier_clust).
de.plot_hier_clust(domain, title=title)

In [None]:
# Number of clusters used for the PCA plot in the next cell.
# NOTE(review): presumably chosen via silhouette scores, going by the helper's
# name - confirm in data_exploration.determine_clusters_silhouette.
best_clusters = de.determine_clusters_silhouette(domain)
print(best_clusters)

In [None]:
# PCA / hierarchical-clustering plot of the selected domain, using the cluster count determined above.
de.pca_hierarchical_plot(domain, optimal_num_cluster=best_clusters, title=title)