In [11]:
# Hier berechne ich eine Distanzmatrix aus den Datensätzen

In [12]:
import pandas as pd
import data_cleanup as dc
import data_exploration as de

In [5]:
# Relative path to the CSV file with the DMS data set (P53_HUMAN, Giacomelli 2018, per the file name).
fpath = '../DMS_data/P53_HUMAN_Giacomelli_NULL_Etoposide_2018.csv'
# Read the CSV into a DataFrame; `df` is the input of the following cell.
df = pd.read_csv(fpath)


In [6]:
# Split each mutant into its old (original) and new (substituted) amino acid ("AS").
# NOTE(review): `dc.aufteilung_mut_pos` is a project helper not shown here; the cells below
# expect its result to provide the columns "AS_old", "AS_new" and "DMS_score" — confirm.
mutations_df = dc.aufteilung_mut_pos(df)

In [7]:
# Group the substitutions first by the old and then by the new amino acid. Passing a list
# of column names to .groupby groups by every combination of those columns (idea taken
# from ChatGPT). Note that, despite its name, `subs_df` is a GroupBy object, not a DataFrame.
subs_df = mutations_df.groupby(["AS_old", "AS_new"])

# Mean DMS score for every (AS_old, AS_new) combination. There are 20 * 19 = 380 ordered
# pairs of distinct amino acids; the means of these 380 possible substitutions are the
# basis for the distance matrix built further down.
mean_scores = subs_df["DMS_score"].mean()

In [8]:
# "mean_scores" is a Series indexed by (AS_old, AS_new); reset_index() turns those index
# levels back into ordinary columns so that the result is a DataFrame again.
mean_scores_df = mean_scores.reset_index()

# pivot() reshapes the long list of substitutions into a table of mean scores:
# the rows are the old amino acids (AS_old), the columns are the new amino acids (AS_new).
mean_substitutions = mean_scores_df.pivot(index="AS_old", columns="AS_new", values= "DMS_score")
# NOTE(review): the return value is discarded, so this helper presumably handles missing
# values in place (the table displayed later shows 0.0 on the diagonal) — confirm in data_cleanup.
dc.rmv_na(mean_substitutions)

# Show the column labels of the pivot table (the new amino acids).
print(mean_substitutions.columns)

Index(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
       'R', 'S', 'T', 'V', 'W', 'Y'],
      dtype='object', name='AS_new')


### Ergänzungen von Frido für Distanzmatrix

In [10]:
# Display the pivot table of mean DMS scores (rows: AS_old, columns: AS_new).
mean_substitutions

AS_new,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
AS_old,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,0.0,0.398729,0.515101,0.457134,0.381672,0.012152,0.10708,0.322948,0.181169,0.010173,0.290941,0.243262,-0.118068,0.124772,0.116215,0.280331,0.476545,0.486577,0.223571,0.370961
C,-0.984361,0.0,-1.510386,-1.416325,-0.191154,-0.942953,-1.161374,-0.309752,-1.349437,-1.29337,-0.735466,-0.871731,-1.376094,-0.883989,-0.514196,-0.447143,-0.752822,-0.561304,-0.678823,-0.587628
D,0.33642,0.36449,0.0,0.538215,0.343919,0.203707,0.489931,0.10119,0.321578,0.181842,0.021296,0.259362,-0.054858,0.215632,-0.151917,0.347274,0.24609,0.173693,0.364983,0.389608
E,0.439452,0.382776,0.376245,0.0,0.349576,0.067008,0.507057,0.371153,0.112546,0.306692,0.375952,0.362416,-0.331368,0.437464,-0.094437,0.272662,0.433548,0.375813,0.214914,0.379793
F,-0.689239,-0.378202,-1.36794,-0.864939,0.0,-0.744887,-0.501397,-0.138207,-0.800865,8.7e-05,0.048709,-0.793803,-0.833767,-0.505582,-0.768096,-0.607131,-0.852016,-0.389713,-0.112035,0.159711
G,0.269993,0.523857,0.214476,0.018696,0.199421,0.0,0.287239,-0.06806,0.080282,0.01271,0.233815,0.335321,-0.176705,0.319435,-0.072564,0.217142,0.137428,0.222302,-0.286193,0.127274
H,0.068617,0.244989,-0.059473,-0.11844,0.304758,-0.290995,0.0,-0.201286,-0.285608,0.012453,0.339755,0.453583,-0.435685,0.313392,0.042533,-0.135786,-0.556795,-0.016774,0.313742,0.405657
I,-1.301357,-0.320707,-1.656635,-1.631776,-0.231583,-1.367628,-1.444026,0.0,-1.430235,0.06451,0.607118,-0.923276,-1.309344,-1.55046,-1.285973,-0.986439,-0.416796,0.413857,-1.451584,-1.333389
K,0.564866,0.674043,0.163895,0.157561,0.335243,0.352034,0.540112,0.516667,0.0,0.58271,0.639539,0.391722,0.327485,0.355389,0.278626,0.494709,0.537629,0.611187,0.388272,0.460607
L,-0.054848,-0.049377,-0.272737,-0.207789,0.121967,-0.55996,0.152569,0.638786,-0.225543,0.0,0.49989,-0.21044,-0.272881,0.178392,-0.015421,-0.435978,-0.231729,0.374415,-0.156716,0.12548


In [9]:
from sklearn.metrics.pairwise import euclidean_distances


def substitution_distance_matrix(score_table):
    """Return the pairwise Euclidean distances between the columns of ``score_table``.

    Each column of ``score_table`` is treated as one vector (its values across all
    rows); the result holds the Euclidean distance between every pair of such vectors.

    Parameters
    ----------
    score_table : pandas.DataFrame
        Numeric table, e.g. the pivot table of mean DMS scores with AS_old as rows
        and AS_new as columns. It should not contain missing values, since
        scikit-learn's ``euclidean_distances`` rejects NaN input.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both the column labels of
        ``score_table`` and whose entries are the distances (0 on the diagonal).
    """
    # Transpose so that every column of the input becomes one row, because
    # euclidean_distances computes distances between the rows of its argument.
    profiles = score_table.T
    distances = euclidean_distances(profiles.values)
    # Re-attach the labels so the matrix can be read by amino acid.
    return pd.DataFrame(distances, index=profiles.index, columns=profiles.index)


# Name of the column axis of the pivot table.
# NOTE(review): not used anywhere in this cell; kept only in case other cells refer to it.
labels_column = 'AS_new'

# Transposed pivot table: one row per new amino acid, one column per old amino acid.
mean_substitutions_t = mean_substitutions.T

# Distance matrix between the new amino acids as a labelled DataFrame.
# NOTE(review): despite its name this variable holds distances, not mean scores;
# the name is kept so that existing references keep working.
mean_substitutions_df = substitution_distance_matrix(mean_substitutions)

# The same distances as a plain NumPy array.
dms_distances = mean_substitutions_df.to_numpy()

# Show the first 20 rows of the distance matrix.
mean_substitutions_df.head(20)


AS_new,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
AS_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,0.0,1.870447,2.057424,1.677953,2.074712,1.800098,1.61668,1.972078,1.663155,1.731438,2.464359,1.605218,2.599323,1.324979,1.734199,1.334446,1.719908,2.085699,1.770016,1.858589
C,1.870447,0.0,2.932487,2.646722,1.435872,2.400857,2.179488,1.457241,2.341538,1.861285,1.543734,1.546109,3.430827,2.037637,2.128601,1.433622,1.808142,1.330392,2.08656,1.847943
D,2.057424,2.932487,0.0,1.147741,2.851973,1.623479,1.562002,3.116516,1.675408,2.846908,3.331162,1.705688,1.851973,1.845346,1.894457,2.039805,2.226007,3.020563,1.91252,2.303482
E,1.677953,2.646722,1.147741,0.0,2.716283,1.383653,1.591554,2.687395,1.2667,2.444568,3.072152,1.613702,1.688013,1.448361,1.69737,1.790569,1.874593,2.744142,1.711126,2.160131
F,2.074712,1.435872,2.851973,2.716283,0.0,2.604831,1.940718,1.640068,2.634391,1.889827,1.739786,1.771891,3.318476,2.28617,2.310283,1.752132,2.09508,1.503799,1.734475,1.324595
G,1.800098,2.400857,1.623479,1.383653,2.604831,0.0,1.812648,2.646116,1.24179,2.267662,2.792097,1.59564,1.66955,1.808504,1.539402,1.626238,1.9248,2.621017,1.822127,2.155682
H,1.61668,2.179488,1.562002,1.591554,1.940718,1.812648,0.0,2.420617,1.625366,2.21261,2.62243,1.331438,2.415598,1.438266,1.746828,1.538785,1.981881,2.435773,1.111196,1.259381
I,1.972078,1.457241,3.116516,2.687395,1.640068,2.646116,2.420617,0.0,2.528704,1.50437,1.357431,2.109293,3.210016,2.202334,2.317928,1.955986,1.579251,0.918103,2.373551,2.249416
K,1.663155,2.341538,1.675408,1.2667,2.634391,1.24179,1.625366,2.528704,0.0,2.065978,2.711528,1.464649,2.097893,1.391438,1.339549,1.518363,1.716352,2.545937,1.81454,2.160693
L,1.731438,1.861285,2.846908,2.444568,1.889827,2.267662,2.21261,1.50437,2.065978,0.0,1.329744,1.942971,2.858723,2.105607,2.167016,1.824881,1.656438,1.319766,2.24797,2.243036


### Kommentar von Enno:
Ich glaube, du kannst statt mit mean_distances.T einfach mit der Funktion mean_distances_inverted von Dario arbeiten... Ach, und kannst du das Ganze noch in eine Funktion umschreiben und in statistical functions importieren? Für das AS-Datenset habe ich das schon gemacht. Das brauche ich, damit ich das dann schön und sleek in dem Documentation-File einfach als eine Funktion benutzen kann. Das wäre niceeee