In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator
import pickle
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import FormatStrFormatter
import os
# Print current working directory
# curr_wd is the absolute path of the process working directory; the data and
# result paths used in this file are all built relative to it.
curr_wd = os.path.abspath(os.getcwd())
print(curr_wd)

def assign_group(protein, group1_name: list, group1_string, group2_name: list, group2_string):
    """Return the label of the first group whose member list contains ``protein``.

    Args:
        protein: Identifier to look up.
        group1_name: Members of the first group (checked first).
        group1_string: Label returned when ``protein`` is in ``group1_name``.
        group2_name: Members of the second group.
        group2_string: Label returned when ``protein`` is in ``group2_name``.

    Returns:
        The matching label, or ``'Not in any group'`` if neither list
        contains ``protein``.
    """
    labelled_groups = (
        (group1_name, group1_string),
        (group2_name, group2_string),
    )
    for members, label in labelled_groups:
        if protein in members:
            return label
    return 'Not in any group'

def assign_groups_advanced(protein, group1_name: list, group1_string, group2_name: list, group2_string, group3_name: list, group3_string, group4_name: list, group4_string):
    """Four-group variant of ``assign_group``.

    The groups are tested in the order 1, 2, 3, 4 and the label of the first
    group whose member list contains ``protein`` is returned.

    Returns:
        The matching label, or ``'Not in any group'`` when ``protein`` is in
        none of the four lists.
    """
    candidates = (
        (group1_name, group1_string),
        (group2_name, group2_string),
        (group3_name, group3_string),
        (group4_name, group4_string),
    )
    # The generator is lazy, so membership tests stop at the first hit.
    return next(
        (label for members, label in candidates if protein in members),
        'Not in any group',
    )


def count_consecutive_stretches_of_1(lst, label="IDR"):
    """Locate every maximal run of consecutive 1s in ``lst``.

    Despite the name, the function returns the runs themselves, not a count.

    Args:
        lst: Sequence of flags; an element belongs to a run when it equals 1.
        label: Tag stored as the third item of every returned tuple.

    Returns:
        A list of ``(start, stop, label)`` tuples in order of appearance.
        ``start`` is the index of the first 1 of the run and ``stop`` is the
        index just past its last 1 (``len(lst)`` for a run that reaches the
        end of the sequence).
    """
    stretches = []
    run_start = None  # index where the currently open run began, if any
    for position, value in enumerate(lst):
        if value == 1:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # First non-1 after a run: close it with an exclusive stop index.
            stretches.append((run_start, position, label))
            run_start = None
    if run_start is not None:
        # The sequence ended while a run was still open.
        stretches.append((run_start, len(lst), label))
    return stretches
### count_consecutive_stretches_of_1([1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0], "IDR") --> [(0, 4, 'IDR'), (13, 21, 'IDR')]

def is_either_between(low_range, high_range, a, b):
    """Tell whether at least one endpoint of ``(a, b)`` lies in ``[low_range, high_range]``.

    ``a`` and ``b`` are the start and end of the motif; ``low_range`` and
    ``high_range`` are the start and end of the IDR. Both range limits are
    inclusive, so a motif that merely touches the IDR boundary counts.

    Note that only the two endpoints are tested: a motif that strictly
    encloses the range (``a < low_range`` and ``b > high_range``) yields False.
    """
    start_inside = low_range <= a <= high_range
    end_inside = low_range <= b <= high_range
    return start_inside or end_inside

def create_boxplot_with_dots(data, xorder, hue_order, group_perspective, ax_object, xticks=None, scatter=True, style='b', **kwargs):
    """
    Creates a boxplot or violin plot with optional dot overlay and statistical annotation.

    Args:
        data: Long-format DataFrame with a 'metric' column (x axis), a 'data'
            column (y axis) and the column named by ``group_perspective``.
        xorder: Order of the 'metric' levels along the x axis.
        hue_order: Order of the group levels within each metric.
        group_perspective: Name of the column used as hue. The palette and the
            annotated pairs are fixed to the levels 'pos' and 'neg'.
        ax_object: Matplotlib axes that receives every artist.
        xticks: Optional x tick labels; when falsy the x ticks are removed.
        scatter: If True, overlay the individual observations as a strip plot.
        style: 'b' for a boxplot, 'v' for a violin plot. Any other value
            prints "Style not supported" and returns without plotting.
        **kwargs: Forwarded to ``sns.boxplot`` / ``sns.violinplot``.

    Returns:
        None. The plot is drawn on ``ax_object`` as a side effect.
    """

    # Select plot style
    palette = {'pos': '#8DB600', 'neg': '#FF4040'}
    if style == "b":
        sns.boxplot(
            data=data, y='data', x='metric', hue=group_perspective,
            width=0.8, hue_order=hue_order, order=xorder,
            palette=palette, showfliers=False, ax=ax_object, zorder=3,
            whiskerprops={'color': 'black'}, capprops={'color': 'black'},
            medianprops={'color': 'black'}, **kwargs
        )
    elif style == "v":
        sns.violinplot(
            data=data, y='data', x='metric', hue=group_perspective,
            width=0.99, hue_order=hue_order, order=xorder,
            palette=palette, ax=ax_object, zorder=3,
            native_scale=True, bw_adjust=0.1, split=False,
            gap=0.5, cut=0, common_norm=True, **kwargs
        )
    else:
        print("Style not supported")
        return

    # Overlay individual data points
    if scatter:
        # Use the same hue column and the same axes as the box/violin layer.
        # The previous hard-coded hue="Group" with no ax= targeted a column
        # that is not used anywhere else and drew on whatever axes happened
        # to be current. legend=False keeps the dots out of the legend
        # handles that callers collect from the axes.
        sns.stripplot(
            data=data, y='data', x='metric', hue=group_perspective,
            color='black', order=xorder, hue_order=hue_order,
            size=3, jitter=0.2, alpha=1, dodge=True, zorder=2,
            legend=False, ax=ax_object
        )

    # Add statistical annotations (Mann-Whitney U)
    pairs = [[(metric, 'pos'), (metric, 'neg')] for metric in data['metric'].unique()]
    # hue_order is passed so the annotator lays out the groups exactly as the
    # plot above does.
    annotator = Annotator(
        ax_object, pairs=pairs, data=data, y='data', x='metric',
        hue=group_perspective, order=xorder, hue_order=hue_order
    )
    annotator.configure(
        test='Mann-Whitney', text_format='star', loc='inside',
        hide_non_significant=False, verbose=0
    ).apply_test().annotate()

    # Axes cleanup
    ax_object.set_xlabel("")
    ax_object.legend().remove()
    ax_object.grid(axis='y', zorder=0)
    ax_object.set_axisbelow(True)
    ax_object.yaxis.set_major_locator(MaxNLocator(nbins=6))
    ax_object.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

    # Set x-tick labels
    if xticks:
        ax_object.set_xticks(range(len(xticks)))
        ax_object.set_xticklabels(xticks)
    else:
        ax_object.set_xticks([])

def parent_create_boxplot_with_dots(raw_data, categories_ordered, metrics_ordered, group_perspective,
                                     ytitle, ylim_bottom, ax_object, xticks=None, style='b', scatter=False):
    """
    Restrict ``raw_data`` to the requested groups and metrics, plot it, and format the y axis.

    Args:
        raw_data: Long-format DataFrame with a 'metric' column, a 'data'
            column and the column named by ``group_perspective``.
        categories_ordered: Group levels to keep; also used as hue order.
        metrics_ordered: Metric names to keep; also used as x order.
        group_perspective: Name of the grouping column.
        ytitle: Y axis label.
        ylim_bottom: Lower y limit; the upper limit chosen by the plot is kept.
        ax_object: Matplotlib axes to draw on.
        xticks, style, scatter: Forwarded to ``create_boxplot_with_dots``.
    """
    # Row masks for the two filters
    in_requested_groups = raw_data[group_perspective].isin(categories_ordered)
    in_requested_metrics = raw_data['metric'].isin(metrics_ordered)
    plot_data = raw_data[in_requested_groups & in_requested_metrics].infer_objects()

    # Draw the plot
    create_boxplot_with_dots(
        plot_data, metrics_ordered, categories_ordered, group_perspective, ax_object,
        xticks=xticks, scatter=scatter, style=style,
    )

    # Only the lower y limit is overridden; the current upper limit is re-applied
    _, current_top = ax_object.get_ylim()
    ax_object.set_ylim(bottom=ylim_bottom, top=current_top)
    ax_object.set_ylabel(ytitle, size=10)

# print(is_either_between(10,20, 5,50))
# print(is_either_between(10,70, 65,80))

# Read data
# All three inputs are parquet files under <curr_wd>/data/processed.
# Motif table: the code below reads its "UniqueID", "start", "end" and "full_seq" columns.
motif_info_set_df = pd.read_parquet(curr_wd + '/data/processed/GAR_motif_Wang_set_human_cleaned_annot_filtered.parquet')
# IDR annotation: the code below reads "protein_name" and "prediction-disorder-mobidb_lite".
annotated_IDR_df = pd.read_parquet(curr_wd + '/data/processed/annotation_datasets/all_IDR_human.parquet')
# Domain annotation: loaded here but not referenced in the visible part of this file.
annotated_domain_df = pd.read_parquet(curr_wd + '/data/processed/annotation_datasets/all_domains_human.parquet')



In [None]:
ver = "v3"

# Descriptive set name -> stems of the list files whose contents are
# concatenated (in the given order) to form that set.
set_definitions = {
    "GAR_full": ["GAR_subset_full"],
    "GAR_LLPS_pos": [
        "4_LLPS_positive_set_and_GAR_subset",
        "5_LLPS_positive_set_and_NA_positive_set_and_GAR_subset",
    ],
    "GAR_LLPS_pos_NA_neg": ["4_LLPS_positive_set_and_GAR_subset"],
    "GAR_LLPS_neg": [
        "6_NA_positive_set_and_GAR_subset",
        "7_GAR_subset_only",
    ],
    "GAR_LLPS_neg_NA_pos": ["6_NA_positive_set_and_GAR_subset"],
    "GAR_NA_pos": [
        "5_LLPS_positive_set_and_NA_positive_set_and_GAR_subset",
        "6_NA_positive_set_and_GAR_subset",
    ],
    "GAR_NA_neg": [
        "4_LLPS_positive_set_and_GAR_subset",
        "7_GAR_subset_only",
    ],
    "GAR_pos": ["5_LLPS_positive_set_and_NA_positive_set_and_GAR_subset"],
    "GAR_neg": ["7_GAR_subset_only"],
}


def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace stripped."""
    with open(path, "r") as fl:
        return [line.strip() for line in fl]


# One protein list per set, read from its associated file(s)
set_dict = {
    set_name: [
        protein
        for fname in file_names
        for protein in _read_stripped_lines(
            f"{curr_wd}/data/processed/final_set_lists/{fname}.txt"
        )
    ]
    for set_name, file_names in set_definitions.items()
}

# Add full proteome (every line of the file becomes one entry)
full_proteome_path = f"{curr_wd}/data/processed/list_of_human_proteins.csv"
full_proteome = _read_stripped_lines(full_proteome_path)
set_dict["full_proteome"] = full_proteome

# set_names and set_list are parallel: set_list[i] is the protein list of set_names[i]
set_names = list(set_dict.keys())
set_list = list(set_dict.values())

# Store all in versioned dictionary
proteins_sets_dict = {ver: set_dict}


In [None]:
def _idr_stretch_around(idr_flags, pos):
    """Return the half-open ``(low, high)`` bounds of the run of 1s containing index ``pos``.

    The caller must make sure that ``idr_flags[pos] == 1``.
    """
    low = pos
    # Bounded walk to the left. A reversed slice such as
    # ``idr_flags[pos - 1::-1]`` turns into ``idr_flags[-1::-1]`` for pos == 0
    # and would start scanning from the END of the sequence.
    while low > 0 and idr_flags[low - 1] == 1:
        low -= 1
    high = pos
    while high < len(idr_flags) and idr_flags[high] == 1:
        high += 1
    return low, high


# protein ID -> dict of IDR metrics. Proteins without usable IDR annotation
# keep an empty dict (they become all-NaN rows once converted to a DataFrame).
protein_dict_with_idr_metrics = {}

for curr_protein in proteins_sets_dict[ver]['full_proteome']:
    # Look the protein up once instead of re-filtering the table for every use
    motif_rows = motif_info_set_df[motif_info_set_df["UniqueID"] == curr_protein]

    # Skip proteins not present in motif info
    if motif_rows.empty:
        continue

    protein_dict_with_idr_metrics[curr_protein] = {}

    # Check if IDR info is available
    idr_rows = annotated_IDR_df[annotated_IDR_df["protein_name"] == curr_protein]
    if idr_rows.empty:
        flag_IDR_info_available = False
        continue

    IDR_info_curr = idr_rows["prediction-disorder-mobidb_lite"].tolist()[0].tolist()
    if -1 in IDR_info_curr:
        flag_IDR_info_available = False  # Invalid data marked with -1
        continue

    # NOTE(review): flag_IDR_info_available is set to False in several places
    # below but is never read in this cell, so metrics are stored for the
    # protein regardless - confirm whether such proteins should be excluded.
    flag_IDR_info_available = True

    all_0s = IDR_info_curr.count(0)
    all_1s = IDR_info_curr.count(1)
    motif_1s = 0
    motif_IDR_bounds, motif_range = [], []

    for _, m in motif_rows[["start", "end", "full_seq"]].iterrows():
        curr_seq = m['full_seq']
        motif_range.append((m['start'], m['end']))

        # Motif starts beyond the annotated positions: stop processing motifs
        if m['start'] > len(IDR_info_curr) - 1:
            flag_IDR_info_available = False
            break

        # Skip motifs that touch an IDR which was already recorded
        if any(is_either_between(lr, hr, m['start'], m['end']) for lr, hr in motif_IDR_bounds):
            continue

        # Determine the bounds of the IDR that holds the motif start or,
        # failing that, the last motif position (end - 1)
        if IDR_info_curr[m['start']] == 1:
            low_range, high_range = _idr_stretch_around(IDR_info_curr, m['start'])
        elif IDR_info_curr[m['end'] - 1] == 1:
            low_range, high_range = _idr_stretch_around(IDR_info_curr, m['end'] - 1)
        else:
            # Neither end of the motif lies in an IDR
            flag_IDR_info_available = False
            continue

        # Every position of the surrounding IDR counts as motif-IDR length
        motif_1s += high_range - low_range
        motif_IDR_bounds.append((low_range, high_range))

    # Relabel the IDR stretches that contain a motif (matched by start index)
    motif_idr_starts = {low for low, _ in motif_IDR_bounds}
    IDR_bounds = [
        (start, stop, "MOTIF" if start in motif_idr_starts else label)
        for start, stop, label in count_consecutive_stretches_of_1(IDR_info_curr)
    ]

    # NOTE(review): the five values below are computed but not stored in the
    # per-protein record in this cell.
    rg_count = sum(curr_seq[lr:hr].count("R") + curr_seq[lr:hr].count("G") for lr, hr in motif_IDR_bounds)

    len_motif_IDR = motif_1s
    len_other_IDR = all_1s - motif_1s
    len_non_IDR = all_0s
    len_motif_IDR_woRG = len_motif_IDR - rg_count

    curr_metrics = protein_dict_with_idr_metrics[curr_protein]
    curr_metrics['num_of_IDR_regions'] = len(IDR_bounds)
    curr_metrics['num_of_IDR_regions_w_motif'] = len(motif_IDR_bounds)
    curr_metrics['num_of_IDR_regions_wo_motif'] = len(IDR_bounds) - len(motif_IDR_bounds)
    curr_metrics['IDR_bounds'] = IDR_bounds
    curr_metrics['motif_range'] = motif_range

    # Protein length is taken from the full sequence stored with the motifs
    prot_length = len(motif_rows['full_seq'].tolist()[0])
    curr_metrics['protein_length'] = prot_length

# Convert to DataFrame: one row per protein, with the protein ID moved from
# the index into a 'proteins' column
proteins_with_idr_metrics_df = (
    pd.DataFrame(protein_dict_with_idr_metrics)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'proteins'})
)


def _members_of(set_name):
    """Return the protein list registered under ``set_name`` (set_list parallels set_names)."""
    return set_list[set_names.index(set_name)]


# Assign protein group labels
protein_ids = proteins_with_idr_metrics_df['proteins']

# Numeric subset label ("5", "7", "4", "6")
proteins_with_idr_metrics_df['Groups_num'] = protein_ids.apply(
    assign_groups_advanced,
    args=(
        _members_of("GAR_pos"), "5",
        _members_of("GAR_neg"), "7",
        _members_of("GAR_LLPS_pos_NA_neg"), "4",
        _members_of("GAR_LLPS_neg_NA_pos"), "6",
    ),
)

# Binary pos/neg label from three different perspectives
binary_groupings = (
    ('Groups_all', "GAR_pos", "GAR_neg"),
    ('Groups_LLPS', "GAR_LLPS_pos", "GAR_LLPS_neg"),
    ('Groups_NA', "GAR_NA_pos", "GAR_NA_neg"),
)
for group_column, pos_set_name, neg_set_name in binary_groupings:
    proteins_with_idr_metrics_df[group_column] = protein_ids.apply(
        assign_group,
        args=(_members_of(pos_set_name), "pos", _members_of(neg_set_name), "neg"),
    )

# Ensure correct datatypes
proteins_with_idr_metrics_df = proteins_with_idr_metrics_df.infer_objects()

#### save the DataFrame

output_path = f"{curr_wd}/data/results/protein_df_with_IDR_metrics_cleaned.pkl"
with open(output_path, "wb") as fp:
    pickle.dump(proteins_with_idr_metrics_df, fp)
    print("DataFrame saved successfully to file.")


In [None]:
# Load the cleaned protein DataFrame with IDR metrics
# (pickle executes code on load - only open files written by this notebook)
with open(os.path.join(curr_wd, 'data/results/protein_df_with_IDR_metrics_cleaned.pkl'), 'rb') as fp:
    proteins_with_idr_metrics_df = pickle.load(fp)

# Expand the IDR bounds into separate rows (one row per IDR of each protein)
expanded_df = proteins_with_idr_metrics_df.explode('IDR_bounds')
expanded_df.reset_index(drop=True, inplace=True)

# Compute IDR length and its relative length
expanded_df["length_IDR"] = expanded_df["IDR_bounds"].apply(lambda x: x[1] - x[0] if isinstance(x, tuple) else None)
expanded_df["length_IDR_rel"] = 100 * (expanded_df["length_IDR"] / expanded_df["protein_length"])
expanded_df["type_IDR"] = expanded_df["IDR_bounds"].apply(lambda x: x[2] if isinstance(x, tuple) else None)

# Separate motif-containing IDRs and ordinary IDRs
expanded_df['length_mIDR'] = np.where(expanded_df['type_IDR'] == 'MOTIF', expanded_df['length_IDR'], np.nan)
expanded_df['length_mIDR_rel'] = np.where(expanded_df['type_IDR'] == 'MOTIF', expanded_df['length_IDR_rel'], np.nan)

expanded_df['length_oIDR'] = np.where(expanded_df['type_IDR'] == 'IDR', expanded_df['length_IDR'], np.nan)
expanded_df['length_oIDR_rel'] = np.where(expanded_df['type_IDR'] == 'IDR', expanded_df['length_IDR_rel'], np.nan)

# Melt relevant columns for plotting or analysis.
# Per-protein metrics are melted from the un-exploded frame so that every
# protein contributes exactly one observation. Melting them from expanded_df
# would repeat each protein once per IDR and bias the per-protein
# distributions and the tests run on them. Per-IDR metrics keep one
# observation per IDR.
melt_id_vars = ['proteins', 'Groups_num', 'Groups_all', 'Groups_LLPS', 'Groups_NA']
per_protein_metrics = [
    'num_of_IDR_regions', 'num_of_IDR_regions_w_motif', 'num_of_IDR_regions_wo_motif',
    'motif_range', 'protein_length'
]
per_idr_metrics = [
    'IDR_bounds',
    'length_mIDR', 'length_mIDR_rel', 'length_oIDR', 'length_oIDR_rel'
]

expanded_df_melted = pd.concat(
    [
        proteins_with_idr_metrics_df.melt(
            id_vars=melt_id_vars, value_vars=per_protein_metrics,
            var_name='metric', value_name='data'
        ),
        expanded_df.melt(
            id_vars=melt_id_vars, value_vars=per_idr_metrics,
            var_name='metric', value_name='data'
        ),
    ],
    ignore_index=True
)

# Remove NaN values from the melted data
expanded_df_melted.dropna(subset=['data'], inplace=True)


In [None]:
# Column that defines the two groups being compared
group_perspective = "Groups_all"

# One row of four panels
fig, ax = plt.subplots(1, 4, sharex=False, figsize=(8, 3))
categories_ordered = ["neg", "pos"]

# One entry per panel, left to right:
# (metrics on the x axis, x tick labels, y axis title, lower y limit)
panel_specs = [
    # Panel 1: Protein length
    (["protein_length"], False, "length of protein [-]", -0.55),
    # Panel 2: Number of IDRs
    (["num_of_IDR_regions"], None, "# of IDRs per protein [-]", -0.55),
    # Panel 3: Absolute IDR lengths
    (["length_mIDR", "length_oIDR"], ["mIDR", "oIDR"], "absolute length of IDR [-]", -0.25),
    # Panel 4: Relative IDR lengths
    (["length_mIDR_rel", "length_oIDR_rel"], ["mIDR", "oIDR"], "relative length of IDRs [%]", -0.25),
]

for panel_ax, (metrics_ordered, xticks, ytitle, ylim_bottom) in zip(ax, panel_specs):
    parent_create_boxplot_with_dots(
        expanded_df_melted, categories_ordered, metrics_ordered,
        group_perspective, ytitle, ylim_bottom, panel_ax,
        xticks=xticks, style='b', scatter=False
    )

# Figure-level legend built from the handles of the last panel
handles, labels = ax[3].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.55, 1.08), ncol=2)

plt.tight_layout()

# Make sure the output folder exists, then write the figure as SVG
os.makedirs(os.path.join(curr_wd, "data/results/subfigures/"), exist_ok=True)
plt.savefig(os.path.join(curr_wd, "data/results/subfigures/fig1_F_G_H_I.svg"), transparent=True)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde, ttest_ind

# --- Extract motif location info from IDR bounds ---

def extract_motif_tuples(row):
    """Describe where a motif sits inside the motif-labelled IDR it touches.

    Args:
        row: Mapping/Series with 'motif_range' (a single ``(start, end)``
            pair) and 'IDR_bounds' (a sequence of ``(start, stop, label)``
            tuples).

    Returns:
        ``(category, left, right, idr_len, scale_pos)`` for the first IDR that
        is labelled "MOTIF" and touched by the motif, where

        * ``left`` / ``right`` are the distances from the motif start / end to
          the IDR start / stop,
        * ``idr_len`` is the IDR length,
        * ``scale_pos`` is the motif midpoint relative to the IDR (0 = IDR
          start, 1 = IDR stop),
        * ``category`` is "ENTIRE", "N-TERMINAL", "C-TERMINAL" or "CENTRAL",
          depending on which of the two distances are below one third of the
          IDR length.

        Five ``None`` values are returned when no such IDR exists, or when
        either input is missing (e.g. NaN for proteins without IDR metrics).
    """
    no_match = (None, None, None, None, None)
    motif_range = row['motif_range']
    IDR_bounds = row['IDR_bounds']

    # Rows of proteins without IDR metrics carry NaN instead of sequences;
    # iterating or indexing those would raise a TypeError.
    sequence_types = (list, tuple, np.ndarray)
    if not isinstance(motif_range, sequence_types) or not isinstance(IDR_bounds, sequence_types):
        return no_match

    for el in IDR_bounds:
        if el[2] == "MOTIF" and is_either_between(el[0], el[1], motif_range[0], motif_range[1]):
            idr_len = el[1] - el[0]
            left = motif_range[0] - el[0]
            right = el[1] - motif_range[1]
            scale_pos = ((motif_range[0] + motif_range[1]) / 2 - el[0]) / idr_len

            one_third = idr_len / 3
            if left < one_third and right < one_third:
                category = "ENTIRE"
            elif left < one_third:
                category = "N-TERMINAL"
            elif right < one_third:
                category = "C-TERMINAL"
            else:
                category = "CENTRAL"
            return category, left, right, idr_len, scale_pos
    return no_match

# --- Prepare data ---

# One row per motif of each protein
expanded_motifs_df = proteins_with_idr_metrics_df.explode('motif_range')

# Position of every motif inside its IDR (five values per row)
motif_position_columns = [
    "motif_pos_cat_in_IDR",
    "distance_to_NTend_of_IDR",
    "distance_to_CTend_of_IDR",
    "len_IDR",
    "motif_pos_scale_in_IDR",
]
motif_positions = expanded_motifs_df.apply(extract_motif_tuples, axis=1, result_type='expand')
expanded_motifs_df[motif_position_columns] = motif_positions

# Keep only motifs that could be placed inside a motif-labelled IDR
expanded_motifs_df = expanded_motifs_df.dropna(subset=["motif_pos_cat_in_IDR"])

# Distances to the IDR ends as a fraction of the IDR length
idr_lengths = expanded_motifs_df["len_IDR"]
expanded_motifs_df["distance_to_NTend_of_IDR_rel"] = expanded_motifs_df["distance_to_NTend_of_IDR"] / idr_lengths
expanded_motifs_df["distance_to_CTend_of_IDR_rel"] = expanded_motifs_df["distance_to_CTend_of_IDR"] / idr_lengths

# --- Group and data separation ---

group_perspective = 'Groups_all'
group_mapping = {'pos': 1, 'neg': 0}
expanded_motifs_df['Group_numeric'] = expanded_motifs_df[group_perspective].map(group_mapping)

is_positive = expanded_motifs_df['Group_numeric'] == 1
is_negative = expanded_motifs_df['Group_numeric'] == 0
positive_data = expanded_motifs_df[is_positive]['motif_pos_scale_in_IDR']
negative_data = expanded_motifs_df[is_negative]['motif_pos_scale_in_IDR']

# --- Plot setup ---

fig, ax = plt.subplots(figsize=(8, 4))

def plot_violin(ax, data, pos, width, color):
    """Draw a one-sided density curve ("half violin") above a baseline.

    A Gaussian KDE of ``data`` (bandwidth factor 0.1) is evaluated at 200
    points between -0.05 and 1.05, rescaled so that its peak equals
    ``width``, and filled between ``pos`` and ``pos`` + density on ``ax``.

    Args:
        ax: Axes-like object providing ``fill_between``.
        data: Sample passed to ``gaussian_kde``.
        pos: Baseline (y value) of the filled area.
        width: Height of the density peak above the baseline.
        color: Fill colour.
    """
    grid = np.linspace(-0.05, 1.05, 200)
    kde = gaussian_kde(data, bw_method=0.1)
    heights = kde(grid)
    # Normalise to the peak so both groups get the same maximum height
    scaled_heights = heights / heights.max() * width
    ax.fill_between(grid, scaled_heights + pos, pos, alpha=0.6, color=color)

positions = [0, 1]
width = 0.4
vshift = 0.15

# (values, baseline on the y axis, colour) for each group, negative first
group_layers = [
    (negative_data, positions[0], '#FF4040'),
    (positive_data, positions[1], '#8DB600'),
]

# Half violins, shifted above each baseline
for layer_values, layer_base, layer_color in group_layers:
    plot_violin(ax, layer_values, layer_base + vshift, width, layer_color)

# --- Add boxplots ---

boxplot_args = {
    'widths': 0.15,
    'vert': False,
    'boxprops': {'color': 'black', 'linewidth': 2},
    'whiskerprops': {'color': 'black', 'linewidth': 2},
    'capprops': {'color': 'black', 'linewidth': 2},
}

# Horizontal boxplots below each baseline (positive group is drawn first)
for layer_values, layer_base, layer_color in reversed(group_layers):
    ax.boxplot(layer_values, positions=[layer_base - vshift * 1.3],
               medianprops={'color': layer_color, 'linewidth': 1}, **boxplot_args)

# --- Add jittered scatter points ---

# Vertical jitter is drawn from an unseeded normal distribution, so the
# exact dot placement differs between runs
jitter = 0.025
for layer_values, layer_base, layer_color in group_layers:
    ax.scatter(layer_values, layer_base + np.random.normal(0, jitter, len(layer_values)),
               color=layer_color, alpha=0.6)

# --- Add sample size text ---

# Placed just to the right of the largest value of each group
for layer_values, layer_base, layer_color in group_layers:
    ax.text(max(layer_values)*1.02, layer_base, f'n={len(layer_values)}',
            fontsize=9, ha='left', va='center', color=layer_color)

# --- Stats test ---

# Independent two-sample t-test with scipy's default settings
t_stat, p_val = ttest_ind(negative_data, positive_data)
print(f"T-test: t = {t_stat:.3f}, p = {p_val:.3g}")

# --- Final touches ---

ax.set_yticks(positions)
ax.set_yticklabels(['negative subset', 'positive subset'], rotation=90, va='center')
ax.set_ylim(-0.3, 1.65)
ax.set_xlabel('relative position of the RG-motif inside the IDR')
plt.tight_layout()

os.makedirs(os.path.join(curr_wd, "data/results/subfigures/"), exist_ok=True)
plt.savefig(os.path.join(curr_wd, "data/results/subfigures/suppl_fig_S3.svg"), transparent=True)
