In [1]:
import os

# Working directory that holds the input tables read below and the relative `plots/`
# folder the figure cells write into. It defaults to the original location; set the
# ZSUZSA_LAB_DIR environment variable to run the notebook from another checkout
# without editing this cell.
WORK_DIR = os.environ.get("ZSUZSA_LAB_DIR", "/biodata/franco/zsuzsa_lab/jupyter")
os.chdir(WORK_DIR)
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
# NOTE(review): kept after the chdir, as in the original cell — presumably mpl_stylesheet
# is resolved from the working directory; confirm before moving this import.
import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'mono', fontsize = 20, colors = 'banskt', dpi = 300)

# Tab-separated tables; the first line of each file provides the column names.
df_monomer = pd.read_table("monomer_outcontext_regions_distances_df.txt", header=0)
df_disprot = pd.read_table("disprot_outcontext_regions_distances_df.txt", header=0)

In [None]:
# Distinct values of the "SS" column in the DisProt table (first occurrence of each is kept).
df_disprot.loc[:, "SS"].drop_duplicates()

In [46]:
# import collections
# from Bio import SeqIO

# disprot_regions_dict = collections.defaultdict(dict)
# disprot_dict = collections.defaultdict(dict)
# with open("/biodata/franco/datasets/disprot/disprot_regions_seq_2023_12_Structure_namespaces_CDHIT_1.0_OK.fasta") as instrm:
#     for record in SeqIO.parse(instrm, "fasta"):
#         unip = record.id.split('|')[0]
#         disprot_regions_dict[unip]['seq'] = str(record.seq)
# with open("/biodata/franco/datasets/disprot/disprot_regions_annot_2023_12_Structure_namespaces_CDHIT_1.0_OK.fasta") as instrm:
#     for record in SeqIO.parse(instrm, "fasta"):
#         unip = record.id.split('|')[0]
#         disprot_regions_dict[unip]['disorder'] = str(record.seq)


# with open("/biodata/franco/datasets/disprot/disprot_OK_fullset_2023_12.fasta") as instrm:
#     for record in SeqIO.parse(instrm, "fasta"):
#             disprot_dict[record.id]['seq'] = str(record.seq)
# with open("/biodata/franco/datasets/disprot/disprot_OK_fullset_annotations_2023_12.fasta") as instrm:
#     for record in SeqIO.parse(instrm, "fasta"):
#             disprot_dict[record.id]['disorder'] = str(record.seq)


# with open("disprot_regions_sequences_and_annots_concat.txt") as instrm:
#     sequence_concat = instrm.readline().strip()
#     disorder_concat = instrm.readline().strip()
#     concat_order = [x.strip().split() for x in instrm.readlines()]


In [None]:
# Coverage values, keeping one row per distinct (rid, coverage) pair.
disprot_coverages = df_disprot.drop_duplicates(subset=["rid", "coverage"])["coverage"]
monomer_coverages = df_monomer.drop_duplicates(subset=["rid", "coverage"])["coverage"]

plt.figure(figsize=(6, 6), dpi=72)
for dataset_label, coverage_values in (
    ("disprot", disprot_coverages),
    ("monomer", monomer_coverages),
):
    plt.hist(coverage_values, alpha=0.4, label=dataset_label)
plt.xlabel("Coverage of region in protein")
plt.legend()
plt.show()

# Region lengths, keeping one row per distinct (rid, aa_reg_len) pair.
disprot_lengths = df_disprot.drop_duplicates(subset=["rid", "aa_reg_len"])["aa_reg_len"]
monomer_lengths = df_monomer.drop_duplicates(subset=["rid", "aa_reg_len"])["aa_reg_len"]

# Unit-width bins with edges 10, 11, ..., 49; lengths outside that range are not drawn.
length_bin_edges = np.arange(10, 50)

plt.figure(figsize=(6, 6), dpi=72)
for dataset_label, length_values in (
    ("disprot", disprot_lengths),
    ("monomer", monomer_lengths),
):
    plt.hist(length_values, bins=length_bin_edges, alpha=0.4, label=dataset_label)
plt.xlabel("Protein region length")
plt.legend()
plt.show()

In [None]:
# Estimate the length of the parent protein of every row as region length / coverage,
# rounded to the nearest integer, and store it as a new "prot_len" column on both tables.
# Later cells (the joint filtering and the final scatter) read this column.
df_disprot["prot_len"] = np.round(df_disprot["aa_reg_len"] / df_disprot["coverage"])
df_monomer["prot_len"] = np.round(df_monomer["aa_reg_len"] / df_monomer["coverage"])

# 25-wide bins from 0 to 1025; estimated lengths above that are not drawn.
prot_len_bins = range(0, 1050, 25)

# NOTE(review): unlike the coverage / region-length histograms, this one counts every row
# of the tables rather than de-duplicated regions — confirm that weighting is intended.
plt.figure(figsize=(6, 6), dpi=72)
plt.hist(df_disprot["prot_len"], bins=prot_len_bins, alpha=0.4, label="disprot")
plt.hist(df_monomer["prot_len"], bins=prot_len_bins, alpha=0.4, label="monomer")

plt.xlabel("Protein length")
# The labels passed to plt.hist are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# DisProt: drop rows whose residue is "Z" and set SS to "D" on every remaining row
# ("D" presumably stands for disordered — confirm). .assign() returns a new frame, so the
# column is not written onto a boolean-filtered slice of the loaded table (the
# chained-assignment pattern that pandas may flag with SettingWithCopyWarning).
# NOTE(review): "X" residues are removed from the monomer table below but not here —
# confirm the asymmetry is intended.
df_disprot = df_disprot[df_disprot["AA"] != "Z"].assign(SS="D")

# Monomer: drop "Z" first (kept as its own frame), then "X".
df_monomer_noZ = df_monomer[df_monomer["AA"] != "Z"]
df_monomer = df_monomer_noZ[df_monomer_noZ["AA"] != "X"]

# Stack both tables; the original row indices are kept, so index values may repeat.
df_joint = pd.concat([df_disprot, df_monomer])
# Regions of at most 50 residues.
df_joint_50 = df_joint[df_joint["aa_reg_len"] <= 50]
# ...further restricted to proteins with an estimated length of at most 400
# (needs the "prot_len" column added by the protein-length cell).
df_joint_50_L400 = df_joint_50[df_joint_50["prot_len"] <= 400 ]

In [None]:

from IPython.display import Image

# DisProt: cosine distance against relative position in the region, one LOWESS
# trendline per amino acid.
disprot_axis_labels = {
    "cos_distances": "Cosine Distance",
    "aa_reg_relativelen": "Relative position in region",
    "AA": "Aminoacid",
}
fig = px.scatter(
    df_disprot,
    x="aa_reg_relativelen",
    y="cos_distances",
    color="AA",
    size="aa_reg_len",
    trendline="lowess",
    trendline_options={"frac": 0.1},
    width=1000,
    height=400,
    labels=disprot_axis_labels,
)

# Hide the raw marker traces and list only the trendline traces in the legend.
fig.update_traces(visible=False, selector={"mode": "markers"})
fig.update_traces(showlegend=True, selector={"mode": "lines"})

# Save a PNG to disk, then render a static PNG inline as the cell output.
fig.write_image("plots/disorder_regions_context_cos_distance_to_original_per_aa.png", engine="kaleido")
img_bytes = fig.to_image(format="png")
Image(img_bytes)


#### WITH EUCLIDEAN DISTANCE BELOW

# fig = px.scatter(df_disprot, x='aa_reg_relativelen', y='euc_distances', color='AA', size='aa_reg_len', trendline="lowess", 
#                  width=1000,
#                  height=400,
#                  trendline_options=dict(frac=0.1),
#                  labels={
#                      "euc_distances": "Euclidean Distance",
#                      "aa_reg_relativelen": "Relative position in region",
#                      "AA": "Aminoacid",
#                  })
# #fig.update_traces(marker=dict(opacity=0.3))
# fig.update_traces(visible=False, selector=dict(mode="markers"))
# fig.update_traces(showlegend=True, selector=dict(mode='lines'))
# fig.write_image(f"plots/disorder_regions_context_euc_distance_to_original_per_aa.png", engine="kaleido")
# fig.show()

# fig = px.scatter(df_monomer, x='aa_reg_relativelen', y='euc_distances', color='AA', size='aa_reg_len', trendline="lowess", 
#                  width=1000,
#                  height=400,
#                  trendline_options=dict(frac=0.1),
#                  labels={
#                      "euc_distances": "Euclidean Distance",
#                      "aa_reg_relativelen": "Relative position in region",
#                      "AA": "Aminoacid",
#                  })
# #fig.update_traces(marker=dict(opacity=0.3))
# fig.update_traces(visible=False, selector=dict(mode="markers"))
# fig.update_traces(showlegend=True, selector=dict(mode='lines'))
# fig.write_image(f"plots/monomer_regions_context_euc_distance_to_original_per_aa.png", engine="kaleido")
# fig.show()


In [None]:


from IPython.display import Image

# Monomer: cosine distance against relative position in the region, one LOWESS
# trendline per amino acid.
monomer_axis_labels = {
    "cos_distances": "Cosine Distance",
    "aa_reg_relativelen": "Relative position in region",
    "AA": "Aminoacid",
}
fig = px.scatter(
    df_monomer,
    x="aa_reg_relativelen",
    y="cos_distances",
    color="AA",
    size="aa_reg_len",
    trendline="lowess",
    trendline_options={"frac": 0.1},
    width=1000,
    height=400,
    labels=monomer_axis_labels,
)

# Hide the raw marker traces and list only the trendline traces in the legend.
fig.update_traces(visible=False, selector={"mode": "markers"})
fig.update_traces(showlegend=True, selector={"mode": "lines"})

# Save a PNG to disk, then render a static PNG inline as the cell output.
fig.write_image("plots/monomer_regions_context_cos_distance_to_original_per_aa.png", engine="kaleido")
img_bytes = fig.to_image(format="png")
Image(img_bytes)

In [None]:

# Joint table (regions of at most 50 residues): cosine distance against relative position
# in the region, one LOWESS trendline per value of the SS column.
fig = px.scatter(df_joint_50, x='aa_reg_relativelen', y='cos_distances', color='SS', size='aa_reg_len', trendline="lowess",
                 width=1000,
                 height=400,
                 trendline_options=dict(frac=0.1),
                 labels={
                     "cos_distances": "Cosine Distance",
                     "aa_reg_relativelen": "Relative position in region",
                     "SS": "Secondary Structure",
                 },
                 # df_joint_50 is built with aa_reg_len <= 50, so the title states "<=" rather than "<".
                 title="Embedding distance of regions with and without context. Lengths <= 50aa")
# Hide the raw marker traces and list only the trendline traces in the legend.
fig.update_traces(visible=False, selector=dict(mode="markers"))
fig.update_traces(showlegend=True, selector=dict(mode='lines'))
fig.write_image("plots/joint_regions_len50_context_cos_distance_to_original_per_aa.png", engine="kaleido")
fig.show()


In [32]:
# aa_target = "E"
# df_disprot_aa = df_disprot[df_disprot["AA"] == aa_target]
# df_disprot_aa = df_disprot_aa[df_disprot["aa_reg_len"] <= 50]
# min_len = df_disprot_aa["aa_reg_len"].min()
# fig = px.scatter(df_disprot_aa, x='aa_reg_relativelen', y='cos_distances', color='aa_reg_len',range_color=[min_len,50], size='aa_reg_len',
#                  width=1000,
#                  height=400,
#                  trendline="lowess", trendline_options=dict(frac=0.1), size_max=20,
#                  labels={
#                      "cos_distances": "Cosine Distance",
#                      "aa_reg_relativelen": "Relative position in region",
#                      "aa_reg_len": "Region length",
#                  },
#                  title=f"Disprot - Cosine distance of {aa_target} aminoacid in context vs out of context")
# fig.update_traces(marker=dict(opacity=0.3))
# fig.write_image(f"plots/disorder_regions_context_cos_distance_to_original_aatarget_{aa_target}.png", engine="kaleido")
# fig.show()

# #fig.update_layout(xaxis=dict(range=[-.02,1.02]), yaxis=dict(range=[-.1,13]))
# #fig.update_layout(legend= {'itemsizing': 'trace', 'marker_size':50})

# df_monomer_aa = df_monomer[df_monomer["AA"] == aa_target]
# df_monomer_aa = df_monomer_aa[df_monomer["aa_reg_len"] <= 50]
# min_len_mono = df_monomer_aa["aa_reg_len"].min()

# fig = px.scatter(df_monomer_aa, x='aa_reg_relativelen', y='cos_distances', color='aa_reg_len',range_color=[min_len_mono,50], size='aa_reg_len',
#                  width=1000,
#                  height=400,
#                  trendline="lowess", trendline_options=dict(frac=0.1), size_max=20,
#                  labels={
#                      "cos_distances": "Cosine Distance",
#                      "aa_reg_relativelen": "Relative position in region",
#                      "aa_reg_len": "Region length",
#                  },
#                  title=f"Monomer - Cosine distance of {aa_target} aminoacid in context vs out of context")
# fig.update_traces(marker=dict(opacity=0.3))
# fig.write_image(f"plots/monomer_regions_context_cos_distance_to_original_aatarget_{aa_target}.png", engine="kaleido")
# fig.show()


In [None]:
# Euclidean distance against region length, one LOWESS trendline per amino acid.
# The same figure is produced first for the DisProt table and then for the monomer table.
euc_by_length_labels = {
    "euc_distances": "Euclidean Distance",
    "AA": "Aminoacid",
    "aa_reg_len": "Region length",
}
for source_table, png_path in (
    (df_disprot, "plots/disorder_regions_context_euc_distance_to_original_per_aa_by_length.png"),
    (df_monomer, "plots/monomer_regions_context_euc_distance_to_original_per_aa_by_length.png"),
):
    fig = px.scatter(
        source_table,
        x="aa_reg_len",
        y="euc_distances",
        color="AA",
        hover_data=["aa_reg_relativelen"],
        trendline="lowess",
        trendline_options={"frac": 0.1},
        width=1000,
        height=400,
        labels=dict(euc_by_length_labels),  # fresh copy for each figure
    )
    # Hide the raw marker traces and list only the trendline traces in the legend.
    fig.update_traces(visible=False, selector={"mode": "markers"})
    fig.update_traces(showlegend=True, selector={"mode": "lines"})
    fig.write_image(png_path, engine="kaleido")
    fig.show()


In [None]:


# Joint table (regions of at most 50 residues): distance against region length, one LOWESS
# trendline per value of the SS column. Drawn first for the Euclidean distance, then for the
# cosine distance.
for distance_column, distance_label, png_path in (
    ("euc_distances", "Euclidean Distance",
     "plots/joint_regions_context_euc_distance_to_original_per_ss_by_length.png"),
    ("cos_distances", "Cosine Distance",
     "plots/joint_regions_context_cos_distance_to_original_per_ss_by_length.png"),
):
    fig = px.scatter(
        df_joint_50,
        x="aa_reg_len",
        y=distance_column,
        color="SS",
        hover_data=["aa_reg_relativelen"],
        trendline="lowess",
        trendline_options={"frac": 0.1},
        width=1000,
        height=400,
        labels={
            distance_column: distance_label,
            "SS": "Secondary Structure",
            "aa_reg_len": "Region length",
        },
    )
    # Hide the raw marker traces and list only the trendline traces in the legend.
    fig.update_traces(visible=False, selector={"mode": "markers"})
    fig.update_traces(showlegend=True, selector={"mode": "lines"})
    fig.write_image(png_path, engine="kaleido")
    fig.show()

In [31]:
### SAME AS ABOVE BUT WITH COVERAGE INSTEAD OF LENGTH

# fig = px.scatter(df_disprot, x="coverage", y="euc_distances", color='AA', hover_data=["aa_reg_relativelen"], 
#                  trendline="lowess", trendline_options=dict(frac=0.1),
#                  width=1000,
#                  height=400,
#                  labels={
#                      "euc_distances": "Euclidean Distance",
#                      "AA": "Aminoacid",
#                      "aa_reg_len": "Region length",
#                      "coverage": "Coverage of IDR in protein"
#                  })
# #fig.update_traces(marker=dict(opacity=0.3))
# fig.update_traces(visible=False, selector=dict(mode="markers"))
# fig.update_traces(showlegend=True, selector=dict(mode='lines'))
# fig.write_image(f"plots/disorder_regions_context_euc_distance_to_original_per_aa_by_coverage.png", engine="kaleido")

# fig.show()

# fig = px.scatter(df_monomer, x="coverage", y="euc_distances", color='AA', hover_data=["aa_reg_relativelen"], 
#                  trendline="lowess", trendline_options=dict(frac=0.1),
#                  width=1000,
#                  height=400,
#                  labels={
#                      "euc_distances": "Euclidean Distance",
#                      "AA": "Aminoacid",
#                      "aa_reg_len": "Region length",
#                      "coverage": "Coverage of SS in protein"
#                  })
# #fig.update_traces(marker=dict(opacity=0.3))
# fig.update_traces(visible=False, selector=dict(mode="markers"))
# fig.update_traces(showlegend=True, selector=dict(mode='lines'))
# fig.write_image(f"plots/monomer_regions_context_euc_distance_to_original_per_aa_by_coverage.png", engine="kaleido")

# fig.show()

In [None]:
# Distinct (region length, estimated protein length) pairs per dataset.
# "prot_len" is the column added earlier as round(aa_reg_len / coverage).
disprot_reg_prot_len = df_disprot[["aa_reg_len", "prot_len"]].drop_duplicates()
monomer_reg_prot_len = df_monomer[["aa_reg_len", "prot_len"]].drop_duplicates()

plt.figure(figsize=(8,8), dpi=72)
plt.scatter(disprot_reg_prot_len["prot_len"], disprot_reg_prot_len["aa_reg_len"], alpha=0.3, color="red", label="disprot")
plt.scatter(monomer_reg_prot_len["prot_len"], monomer_reg_prot_len["aa_reg_len"], alpha=0.3, color="blue", label="monomer")
# Axis labels and a legend so the red/blue series and both axes can be read without the code;
# the wording matches the histogram cells.
plt.xlabel("Protein length")
plt.ylabel("Protein region length")
plt.legend()
plt.show()