#### Distance matrix
In this notebook, we use the function "mean_substitutions" as the basis for a new function that computes a distance matrix from our DMS data.

In [41]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import data_cleanup as dc
from sklearn.metrics.pairwise import euclidean_distances
import data_exploration as de

In [30]:
# Read the DMS data set from CSV into a DataFrame.
# The path is relative, so the notebook must be run from its own directory.
fpath = '../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv'
df = pd.read_csv(fpath)


In [31]:
# Split each mutant into its old and its new amino acid (AA).
# The resulting frame is grouped by the columns "AS_old" and "AS_new" in the next cell.
mutations_df = dc.aufteilung_mut_pos(df)

In [32]:
# Group the substitutions by old AA first and by new AA second.
# groupby accepts a list of keys, so a single call covers both levels
# (approach suggested by ChatGPT).
subs_df = mutations_df.groupby(by=["AS_old", "AS_new"])

# Mean DMS score of every (old AA, new AA) combination.
# 20 amino acids yield 20 * 19 = 380 possible ordered pairs without repetition;
# the means of these substitutions are the input for the distance matrix.
mean_scores = subs_df["DMS_score"].mean()

In [46]:
# "mean_scores" is a Series indexed by the (AS_old, AS_new) group keys.
# reset_index() turns those keys back into regular columns, giving a DataFrame again.
mean_scores_df = mean_scores.reset_index()

# pivot() reshapes the long list of substitutions into a table of mean scores
# with the old AAs as rows and the new AAs as columns.
# The trailing .T then swaps the axes, so the final table has the new AAs as rows
# and the old AAs as columns (see the printed column index below).
mean_substitutions = mean_scores_df.pivot(index="AS_old", columns="AS_new", values= "DMS_score").T # the .T decides whether distance_matrix_wt or _mutated is calculated
# Called only for its effect on mean_substitutions; the return value is not used.
# NOTE(review): presumably replaces the missing values in place - confirm in data_cleanup.
dc.rmv_na(mean_substitutions)

# Check which amino acids ended up on the column axis.
print(mean_substitutions.columns)

Index(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
       'R', 'S', 'T', 'V', 'W', 'Y'],
      dtype='object', name='AS_old')


*Up to this point, we have calculated the mean score of each substitution. Next, the distance matrix is computed from these means. Based on this code, the functions "de.dms_distance_matrix_mutated" and "de.dms_distance_matrix_wt" were created.*

In [44]:
# Inspect the table of mean DMS scores (rows: new AA, columns: old AA).
mean_substitutions.head(20)

AS_old,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
AS_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,0.0,-0.984361,0.33642,0.439452,-0.689239,0.269993,0.068617,-1.301357,0.564866,-0.054848,0.366844,0.525371,-0.113301,0.326615,-0.269422,0.110171,0.175812,0.122737,-0.15374,-0.866431
C,0.398729,0.0,0.36449,0.382776,-0.378202,0.523857,0.244989,-0.320707,0.674043,-0.049377,-0.025214,0.525193,0.373813,0.715331,0.209994,0.488338,0.575852,-0.188178,-0.051575,-0.729975
D,0.515101,-1.510386,0.0,0.376245,-1.36794,0.214476,-0.059473,-1.656635,0.163895,-0.272737,-0.502976,0.130449,0.29861,0.428674,-0.737071,0.253284,-0.323039,-0.92588,-0.41439,-0.761619
E,0.457134,-1.416325,0.538215,0.0,-0.864939,0.018696,-0.11844,-1.631776,0.157561,-0.207789,-0.26267,0.017914,0.306073,0.418482,-0.720329,0.341156,-0.073779,-0.521313,-0.308171,-1.27006
F,0.381672,-0.191154,0.343919,0.349576,0.0,0.199421,0.304758,-0.231583,0.335243,0.121967,0.099787,0.209884,0.178908,0.627402,-0.217364,0.194215,0.039907,-0.406127,-0.199816,0.182602
G,0.012152,-0.942953,0.203707,0.067008,-0.744887,0.0,-0.290995,-1.367628,0.352034,-0.55996,-0.232334,0.465068,0.28423,0.661938,-0.234577,0.374783,-0.166369,-0.761413,-1.039825,-1.295045
H,0.10708,-1.161374,0.489931,0.507057,-0.501397,0.287239,0.0,-1.444026,0.540112,0.152569,-0.372969,0.158096,0.358459,0.51874,-0.128117,0.272749,-0.323128,-0.809327,-0.039015,-0.301181
I,0.322948,-0.309752,0.10119,0.371153,-0.138207,-0.06806,-0.201286,0.0,0.516667,0.638786,0.155454,0.09802,0.318722,0.451678,-0.049724,0.387556,0.271527,0.307803,-0.121015,-0.918069
K,0.181169,-1.349437,0.321578,0.112546,-0.800865,0.080282,-0.285608,-1.430235,0.0,-0.225543,0.047197,0.208273,0.320253,0.596154,0.309015,0.313318,0.154649,-0.805664,-0.314064,-1.342734
L,0.010173,-1.29337,0.181842,0.306692,8.7e-05,0.01271,0.012453,0.06451,0.58271,0.0,0.415377,0.308124,0.056795,0.52528,0.129087,-0.026329,0.146178,-0.199019,-0.25241,-1.060839


In [45]:
# Pairwise Euclidean distance between the rows of the mean-score table,
# i.e. between the amino acids on its index.
dms_distances = euclidean_distances(mean_substitutions.to_numpy())

# Wrap the raw array in a DataFrame again and label both axes with the amino acids.
# Note: despite its name, mean_substitutions_df holds the distance matrix.
mean_substitutions_df = pd.DataFrame(
    data=dms_distances,
    index=mean_substitutions.index,
    columns=mean_substitutions.index,
)

# Show the first 20 rows of the distance matrix
mean_substitutions_df.head(20)


AS_new,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
AS_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,0.0,1.870447,2.057424,1.677953,2.074712,1.800098,1.61668,1.972078,1.663155,1.731438,2.464359,1.605218,2.599323,1.324979,1.734199,1.334446,1.719908,2.085699,1.770016,1.858589
C,1.870447,0.0,2.932487,2.646722,1.435872,2.400857,2.179488,1.457241,2.341538,1.861285,1.543734,1.546109,3.430827,2.037637,2.128601,1.433622,1.808142,1.330392,2.08656,1.847943
D,2.057424,2.932487,0.0,1.147741,2.851973,1.623479,1.562002,3.116516,1.675408,2.846908,3.331162,1.705688,1.851973,1.845346,1.894457,2.039805,2.226007,3.020563,1.91252,2.303482
E,1.677953,2.646722,1.147741,0.0,2.716283,1.383653,1.591554,2.687395,1.2667,2.444568,3.072152,1.613702,1.688013,1.448361,1.69737,1.790569,1.874593,2.744142,1.711126,2.160131
F,2.074712,1.435872,2.851973,2.716283,0.0,2.604831,1.940718,1.640068,2.634391,1.889827,1.739786,1.771891,3.318476,2.28617,2.310283,1.752132,2.09508,1.503799,1.734475,1.324595
G,1.800098,2.400857,1.623479,1.383653,2.604831,0.0,1.812648,2.646116,1.24179,2.267662,2.792097,1.59564,1.66955,1.808504,1.539402,1.626238,1.9248,2.621017,1.822127,2.155682
H,1.61668,2.179488,1.562002,1.591554,1.940718,1.812648,0.0,2.420617,1.625366,2.21261,2.62243,1.331438,2.415598,1.438266,1.746828,1.538785,1.981881,2.435773,1.111196,1.259381
I,1.972078,1.457241,3.116516,2.687395,1.640068,2.646116,2.420617,0.0,2.528704,1.50437,1.357431,2.109293,3.210016,2.202334,2.317928,1.955986,1.579251,0.918103,2.373551,2.249416
K,1.663155,2.341538,1.675408,1.2667,2.634391,1.24179,1.625366,2.528704,0.0,2.065978,2.711528,1.464649,2.097893,1.391438,1.339549,1.518363,1.716352,2.545937,1.81454,2.160693
L,1.731438,1.861285,2.846908,2.444568,1.889827,2.267662,2.21261,1.50437,2.065978,0.0,1.329744,1.942971,2.858723,2.105607,2.167016,1.824881,1.656438,1.319766,2.24797,2.243036


In [42]:
# We created two functions out of this: one for the distance matrix of the mutated AAs,
# one for the distance matrix of the wild-type AAs.
# This call builds the matrix for the mutated (new) AAs from the DMS DataFrame.
de.dms_distance_matrix_mutated(df)

AS_new,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
AS_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,0.0,1.870447,2.057424,1.677953,2.074712,1.800098,1.61668,1.972078,1.663155,1.731438,2.464359,1.605218,2.599323,1.324979,1.734199,1.334446,1.719908,2.085699,1.770016,1.858589
C,1.870447,0.0,2.932487,2.646722,1.435872,2.400857,2.179488,1.457241,2.341538,1.861285,1.543734,1.546109,3.430827,2.037637,2.128601,1.433622,1.808142,1.330392,2.08656,1.847943
D,2.057424,2.932487,0.0,1.147741,2.851973,1.623479,1.562002,3.116516,1.675408,2.846908,3.331162,1.705688,1.851973,1.845346,1.894457,2.039805,2.226007,3.020563,1.91252,2.303482
E,1.677953,2.646722,1.147741,0.0,2.716283,1.383653,1.591554,2.687395,1.2667,2.444568,3.072152,1.613702,1.688013,1.448361,1.69737,1.790569,1.874593,2.744142,1.711126,2.160131
F,2.074712,1.435872,2.851973,2.716283,0.0,2.604831,1.940718,1.640068,2.634391,1.889827,1.739786,1.771891,3.318476,2.28617,2.310283,1.752132,2.09508,1.503799,1.734475,1.324595
G,1.800098,2.400857,1.623479,1.383653,2.604831,0.0,1.812648,2.646116,1.24179,2.267662,2.792097,1.59564,1.66955,1.808504,1.539402,1.626238,1.9248,2.621017,1.822127,2.155682
H,1.61668,2.179488,1.562002,1.591554,1.940718,1.812648,0.0,2.420617,1.625366,2.21261,2.62243,1.331438,2.415598,1.438266,1.746828,1.538785,1.981881,2.435773,1.111196,1.259381
I,1.972078,1.457241,3.116516,2.687395,1.640068,2.646116,2.420617,0.0,2.528704,1.50437,1.357431,2.109293,3.210016,2.202334,2.317928,1.955986,1.579251,0.918103,2.373551,2.249416
K,1.663155,2.341538,1.675408,1.2667,2.634391,1.24179,1.625366,2.528704,0.0,2.065978,2.711528,1.464649,2.097893,1.391438,1.339549,1.518363,1.716352,2.545937,1.81454,2.160693
L,1.731438,1.861285,2.846908,2.444568,1.889827,2.267662,2.21261,1.50437,2.065978,0.0,1.329744,1.942971,2.858723,2.105607,2.167016,1.824881,1.656438,1.319766,2.24797,2.243036


In [37]:
# Distance matrix between the wild-type (old) AAs; the result is indexed by AS_old.
de.dms_distance_matrix_wt(df)

AS_old,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
AS_old,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,0.0,5.118419,1.001127,0.950791,3.865506,1.093999,1.727396,5.869602,1.354001,1.858657,1.982457,1.449749,0.619728,1.422936,2.382516,0.879574,1.428805,3.844111,2.678816,5.224924
C,5.118419,0.0,5.157128,5.225621,2.266408,4.671423,4.215585,2.799431,5.807398,4.01894,3.87878,4.844397,5.130724,6.064423,3.461968,5.276711,4.375534,2.45743,3.015105,2.006232
D,1.001127,5.157128,0.0,0.98128,3.75842,1.082337,1.65667,6.041829,1.354857,2.016012,1.948455,1.268876,0.886359,1.481424,2.328884,0.981295,1.516787,3.841171,2.628414,5.168214
E,0.950791,5.225621,0.98128,0.0,3.851885,0.958826,1.639293,5.929282,1.079586,1.820398,1.895125,1.302363,0.923046,1.366785,2.30771,1.034484,1.386531,3.846769,2.680332,5.261142
F,3.865506,2.266408,3.75842,3.851885,0.0,3.462147,2.805199,3.117083,4.394297,2.539148,2.594098,3.492566,3.820806,4.605781,2.295875,3.973795,3.162375,1.985099,2.09192,2.351548
G,1.093999,4.671423,1.082337,0.958826,3.462147,0.0,1.322393,5.478072,1.584928,1.630219,1.579012,1.216704,0.958718,1.847809,1.846426,1.157093,1.073299,3.380896,2.270993,4.789685
H,1.727396,4.215585,1.65667,1.639293,2.805199,1.322393,0.0,5.192499,2.217342,1.468733,1.719725,1.441529,1.681892,2.346398,1.849399,1.679055,1.373227,3.28327,1.932457,4.188192
I,5.869602,2.799431,6.041829,5.929282,3.117083,5.478072,5.192499,0.0,6.404813,4.5936,4.560649,5.714493,5.889325,6.628173,4.191128,6.130616,5.075546,2.83029,4.26468,3.487852
K,1.354001,5.807398,1.354857,1.079586,4.394297,1.584928,2.217342,6.404813,0.0,2.334108,2.444159,1.628534,1.270695,1.17366,2.85919,1.243476,1.835478,4.438679,3.420946,5.98473
L,1.858657,4.01894,2.016012,1.820398,2.539148,1.630219,1.468733,4.5936,2.334108,0.0,1.411209,1.904752,1.82147,2.641414,1.651514,1.970667,1.508891,2.683475,1.860435,4.18585
