In [None]:
import os
import math
import pickle
from collections import Counter

import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from localcider.sequenceParameters import SequenceParameters
from scipy.stats import ttest_ind

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter
import seaborn as sns
from statannotations.Annotator import Annotator

# Resolve the current working directory once; every data path in this notebook
# is built relative to it, so print it to make a wrong start location obvious.
curr_wd = os.path.abspath(os.getcwd())
print(curr_wd)

def count_consecutive_stretches_of_1(lst, label="IDR"):
    """
    Locate every run of consecutive 1s in a binary sequence.

    Parameters:
        lst (list): Sequence of binary values (0s and 1s).
        label (str): Tag attached to every reported run.

    Returns:
        list of tuples: One (start, stop, label) tuple per run, where start is the
        index of the first 1 and stop is the index just past the last 1 of the run.
    """
    stretches = []
    run_start = None  # index at which the currently open run began; None outside a run
    for position, value in enumerate(lst):
        if value == 1:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # First non-1 after a run closes it
            stretches.append((run_start, position, label))
            run_start = None
    # A run that extends to the end of the input is closed at len(lst)
    if run_start is not None:
        stretches.append((run_start, len(lst), label))
    return stretches

def is_either_between(low_range, high_range, a, b):
    """
    Tell whether at least one of the two endpoints lies inside a closed interval.

    Parameters:
        low_range (int): Lower bound of the interval (inclusive).
        high_range (int): Upper bound of the interval (inclusive).
        a (int): First endpoint (motif start).
        b (int): Second endpoint (motif end).

    Returns:
        bool: True when a or b satisfies low_range <= value <= high_range.
    """
    a_inside = low_range <= a <= high_range
    b_inside = low_range <= b <= high_range
    return a_inside or b_inside

def assign_group(protein, group1_name, group1_string, group2_name, group2_string):
    """
    Map a protein to the label of the first group that contains it.

    Parameters:
        protein (str): Protein identifier.
        group1_name (list): Members of group 1 (checked first).
        group1_string (str): Label returned for group 1 members.
        group2_name (list): Members of group 2.
        group2_string (str): Label returned for group 2 members.

    Returns:
        str: The matching group label, or 'Not in any group' when neither group matches.
    """
    candidates = ((group1_name, group1_string), (group2_name, group2_string))
    for members, group_label in candidates:
        if protein in members:
            return group_label
    return 'Not in any group'

def extract_sequences_from_fasta(fasta_file):
    """
    Read a FASTA file into a mapping of accession to sequence string.

    The key is the second '|'-separated field of each record id, so record ids
    are expected to contain at least one '|' (an IndexError is raised otherwise).

    Parameters:
        fasta_file (str): Path to the FASTA file.

    Returns:
        dict: Mapping from the extracted identifier to the sequence as a str.
    """
    sequences_by_id = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        accession = record.id.split("|")[1]
        sequences_by_id[accession] = str(record.seq)
    return sequences_by_id

def filter_sequences(sequences, masks):
    """
    Keep only the residues of each sequence whose mask entry is truthy.

    Sequences without a mask, or whose mask length differs from the sequence
    length, are left out of the result.

    Parameters:
        sequences (dict): Mapping from IDs to sequences.
        masks (dict): Mapping from IDs to per-residue binary masks.

    Returns:
        dict: Mapping from IDs to the masked (possibly empty) sequences.
    """
    filtered = {}
    for identifier, seq in sequences.items():
        if identifier not in masks:
            continue
        mask = masks[identifier]
        if len(seq) != len(mask):
            continue
        kept_residues = [residue for residue, flag in zip(seq, mask) if flag]
        filtered[identifier] = ''.join(kept_residues)
    return filtered

def compute_aa_proportions(sequences):
    """
    Average the per-sequence amino acid fractions over a set of sequences.

    Empty sequences are skipped. Fractions are taken relative to the full
    sequence length and computed for the 20 standard amino acids only.

    Parameters:
        sequences (dict): Mapping from IDs to amino acid sequences.

    Returns:
        tuple:
            - dict of mean proportions (%) per amino acid
            - dict of standard error of the mean (SEM) of the proportions (%);
              an amino acid is only present when at least one sequence was used
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"

    # Count residues once per non-empty sequence, remembering its length
    counted = [(Counter(seq), len(seq)) for seq in sequences.values() if seq]

    proportions = {
        aa: [counts.get(aa, 0) / seq_len for counts, seq_len in counted]
        for aa in amino_acids
    }

    avg_proportions = {}
    sem_proportions = {}
    for aa, values in proportions.items():
        avg_proportions[aa] = 100 * np.mean(values)
        if values:
            sem_proportions[aa] = 100 * (np.std(values, ddof=1) / np.sqrt(len(values)))

    return avg_proportions, sem_proportions

def plot_group(ax, group_proportions, avg_proportions, group, els, bar_width, row, col):
    """
    Draw one column of the amino acid composition figure.

    The top axes of the column shows the "neg" and "pos" proportions side by side
    with the background proportion as a dotted outline; the bottom axes shows the
    pos - neg difference. Amino acids are ordered by decreasing difference.

    Parameters:
        ax (np.ndarray): 2D array of matplotlib axes (2 rows x N cols).
        group_proportions (pd.DataFrame): Rows "neg" and "pos", one column per amino acid.
        avg_proportions (dict): Background proportion per amino acid.
        group (str): Accepted for interface compatibility; not used by the function.
        els (list): Amino acids to include in this column.
        bar_width (float): Width of the bars.
        row (int): Accepted for interface compatibility; not used by the function.
        col (int): Column index of the axes to draw into.
    """
    top_ax = ax[0, col]
    diff_ax = ax[1, col]

    # Order the amino acids by decreasing (pos - neg) difference
    neg_values = group_proportions.loc["neg", els]
    pos_values = group_proportions.loc["pos", els]
    difference_sorted = (pos_values - neg_values).sort_values(ascending=False)
    keys_sorted = difference_sorted.index
    neg_values = neg_values.loc[keys_sorted]
    pos_values = pos_values.loc[keys_sorted]
    background = [avg_proportions[aa] for aa in keys_sorted]
    indices = np.arange(len(els))

    # Top plot: neg/pos bars next to each other, background as dotted outline
    top_ax.bar(indices - bar_width / 2, neg_values, bar_width,
               label="neg", color="#FF4040", edgecolor="black")
    top_ax.bar(indices + bar_width / 2, pos_values, bar_width,
               label="pos", color="#8DB600", edgecolor="black")
    top_ax.bar(indices, background,
               bar_width * 0.8, fill=False, edgecolor="black", linestyle="dotted")

    top_ax.set_xticks([])
    top_ax.grid(axis="y", zorder=0)
    top_ax.set_axisbelow(True)
    for axis_name in ('x', 'y'):
        top_ax.tick_params(axis=axis_name, length=0)
    # Only the left-most column carries the y label and visible y ticks
    ax[0, 0].set_ylabel("Proportion of amino acid\nin motif [%]")
    ax[0, 0].tick_params(axis='y', length=3)

    # Bottom plot: differences
    diff_ax.bar(indices, difference_sorted, color="#A9A9A9", edgecolor="black")
    diff_ax.set_xticks(indices)
    diff_ax.set_xticklabels(keys_sorted)
    diff_ax.set_ylim(-3, 3)
    diff_ax.grid(axis="y", zorder=0)
    diff_ax.set_axisbelow(True)
    for axis_name in ('x', 'y'):
        diff_ax.tick_params(axis=axis_name, length=0)
    ax[1, 0].tick_params(axis='y', length=3)

# Function to plot the separate R and G subplot
def plot_high_values(ax, group_proportions, avg_proportions, amino_acids, bar_width, col):
    """
    Draw the given amino acids on twin y-axes of one column of the composition figure.

    A twin axis of ax[0, col] receives the "neg"/"pos" bars and the dotted background
    outline; a twin axis of ax[1, col] receives the pos - neg difference bars. The
    amino acids keep the order in which they are passed.

    Parameters:
        ax (np.ndarray): 2D array of matplotlib axes (2 rows x N cols).
        group_proportions (pd.DataFrame): Rows "neg" and "pos", one column per amino acid.
        avg_proportions (dict): Background proportion per amino acid.
        amino_acids (list): Amino acids to plot.
        bar_width (float): Width of the bars.
        col (int): Column index of the axes to draw into.
    """
    positions = np.arange(len(amino_acids))
    neg_values = group_proportions.loc["neg", amino_acids]
    pos_values = group_proportions.loc["pos", amino_acids]
    background = [avg_proportions[aa] for aa in amino_acids]

    # Twin axis for proportion plot
    proportion_ax = ax[0, col].twinx()
    proportion_ax.bar(positions - bar_width / 2, neg_values, bar_width,
                      label="neg", color="#FF4040", edgecolor="black")
    proportion_ax.bar(positions + bar_width / 2, pos_values, bar_width,
                      label="pos", color="#8DB600", edgecolor="black")
    proportion_ax.bar(positions, background, bar_width * 0.8,
                      fill=False, edgecolor="black", linestyle="dotted")

    proportion_ax.set_ylabel("Proportion of R & G in motif [%]")
    proportion_ax.grid(axis="y", zorder=0)
    proportion_ax.set_axisbelow(True)

    # Twin axis for the difference plot in the second row
    difference_ax = ax[1, col].twinx()
    difference_ax.bar(positions, pos_values - neg_values, color="#A9A9A9", edgecolor="black")

    difference_ax.set_xticks(positions)
    difference_ax.set_xticklabels(amino_acids)
    difference_ax.grid(axis="y", zorder=0)
    difference_ax.set_axisbelow(True)
    difference_ax.set_ylim(-6, 6)


def create_boxplot_with_dots(data, xorder, hue_order, ax_object, xticks=None, scatter=True, style='b', **kwargs):
    """
    Draw a pos/neg comparison plot per metric on the given axes and annotate significance.

    Parameters:
        data (pd.DataFrame): Long-format frame with the columns 'data' (values),
            'metric' (x categories) and 'Group' (hue; 'pos' / 'neg').
        xorder (list): Order of the 'metric' categories along the x-axis.
        hue_order (list): Order of the 'Group' levels within each category.
        ax_object (matplotlib.axes.Axes): Axes that receives every artist.
        xticks (list | None): Tick labels for the x-axis; when falsy the x ticks are removed.
        scatter (bool): Overlay the individual data points as a strip plot.
        style (str): 'b' for a box plot, 'v' for a violin plot.
        **kwargs: Forwarded to the seaborn box/violin call.

    Raises:
        ValueError: If style is neither 'b' nor 'v'.
    """
    palette = {'pos': '#8DB600', 'neg': '#FF4040'}

    if style == "b":
        sns.boxplot(data=data, y='data', x='metric', hue='Group', width=0.8, order=xorder,
                    hue_order=hue_order, palette=palette, showfliers=False, ax=ax_object, zorder=3,
                    whiskerprops={'color': 'black'}, capprops={'color': 'black'},
                    medianprops={'color': 'black'}, **kwargs)

    elif style == "v":
        data = data[data["data"] < 6]  # Top cutoff
        # NOTE(review): showfliers / whiskerprops / capprops / medianprops are box-plot
        # options; confirm the installed seaborn version accepts them for violinplot.
        sns.violinplot(data=data, y='data', x='metric', hue='Group', width=0.99, order=xorder,
                       hue_order=hue_order, palette=palette, showfliers=False, ax=ax_object, zorder=3,
                       bw_adjust=0.1, cut=0, split=False, gap=0.5, native_scale=True, common_norm=True,
                       whiskerprops={'color': 'black'}, capprops={'color': 'black'},
                       medianprops={'color': 'black'}, **kwargs)
    else:
        raise ValueError("Style must be 'b' (boxplot) or 'v' (violinplot)")

    # Optional scatter overlay. ax must be given explicitly: without it seaborn draws on
    # the *current* axes, which in a multi-panel figure is usually not ax_object.
    if scatter:
        sns.stripplot(data=data, y='data', x='metric', hue='Group', color='black', order=xorder,
                      hue_order=hue_order, size=3, jitter=0.2, alpha=1, dodge=True, zorder=2,
                      ax=ax_object)

    # Statistical annotation: compare pos vs neg within every metric present in the data.
    # hue_order is passed so the annotator assumes the same box layout that was plotted.
    pairs = [[(metric, 'pos'), (metric, 'neg')] for metric in data.metric.unique()]
    annotator = Annotator(ax_object, pairs, data=data, y='data', x='metric', hue='Group',
                          order=xorder, hue_order=hue_order)
    annotator.configure(test='Mann-Whitney', text_format='star', loc='outside',
                        hide_non_significant=False, verbose=0)
    annotator.apply_test().annotate()

    ax_object.set_xlabel("")
    ax_object.legend().remove()
    ax_object.grid(axis='y', zorder=0)
    ax_object.set_axisbelow(True)
    ax_object.yaxis.set_major_locator(MaxNLocator(nbins=6))

    if xticks:
        ax_object.set_xticks(list(range(len(xticks))))
        ax_object.set_xticklabels(xticks)
    else:
        ax_object.set_xticks([])


def parent_create_boxplot_with_dots(protein_df_with_physchem_metrics_cleaned, categories_ordered,
                                     metrics_ordered, ytitle, ax_object,
                                     ylim_bottom=None, xticks=None, style='b',
                                     decimal_places=2, scatter=False):
    """
    Filter the long-format metrics table and plot the selection on one axes.

    Parameters:
        protein_df_with_physchem_metrics_cleaned (pd.DataFrame): Long-format table with
            at least the columns 'Group' and 'metric'.
        categories_ordered (list): 'Group' values to keep, in plotting (hue) order.
        metrics_ordered (list): 'metric' values to keep, in x-axis order.
        ytitle (str): Label of the y-axis.
        ax_object (matplotlib.axes.Axes): Axes to draw on.
        ylim_bottom (float | None): Lower y limit; None keeps the automatic limit.
        xticks (list | None): Tick labels passed on to create_boxplot_with_dots.
        style (str): Plot style passed on to create_boxplot_with_dots.
        decimal_places (int): Number of decimals shown on the y tick labels.
        scatter (bool): Whether individual points are overlaid.
    """
    source = protein_df_with_physchem_metrics_cleaned

    # Keep only the requested groups and metrics
    wanted_rows = source["Group"].isin(categories_ordered) & source['metric'].isin(metrics_ordered)
    cut_data = source[wanted_rows].infer_objects()

    create_boxplot_with_dots(cut_data, metrics_ordered, categories_ordered, ax_object,
                             xticks=xticks, scatter=scatter, style=style)

    # Override only the lower y limit; the upper limit stays as produced by the plot
    lower_ylim, upper_ylim = ax_object.get_ylim()
    if ylim_bottom is None:
        bottom = lower_ylim
    else:
        bottom = ylim_bottom
    ax_object.set_ylim(bottom=bottom, top=upper_ylim)

    tick_format = f'%.{decimal_places}f'
    ax_object.yaxis.set_major_formatter(FormatStrFormatter(tick_format))
    ax_object.set_ylabel(ytitle, size=10)


# Load the motif table and the per-protein IDR / domain annotations
motif_info_set_df = pd.read_parquet(
    f"{curr_wd}/data/processed/GAR_motif_Wang_set_human_cleaned_annot_filtered.parquet"
)
annotated_IDR_df = pd.read_parquet(
    f"{curr_wd}/data/processed/annotation_datasets/all_IDR_human.parquet"
)
annotated_domain_df = pd.read_parquet(
    f"{curr_wd}/data/processed/annotation_datasets/all_domains_human.parquet"
)


In [None]:
ver = "v3"

# Each descriptive set name maps to the list file(s) whose contents are concatenated
set_definitions = {
    "GAR_full": ["GAR_subset_full"],
    "GAR_LLPS_pos": [
        "4_LLPS_positive_set_and_GAR_subset",
        "5_LLPS_positive_set_and_NA_positive_set_and_GAR_subset"
    ],
    "GAR_LLPS_pos_NA_neg": [
        "4_LLPS_positive_set_and_GAR_subset"
    ],
    "GAR_LLPS_neg": [
        "6_NA_positive_set_and_GAR_subset",
        "7_GAR_subset_only"
    ],
    "GAR_LLPS_neg_NA_pos": [
        "6_NA_positive_set_and_GAR_subset"
    ],
    "GAR_NA_pos": [
        "5_LLPS_positive_set_and_NA_positive_set_and_GAR_subset",
        "6_NA_positive_set_and_GAR_subset"
    ],
    "GAR_NA_neg": [
        "4_LLPS_positive_set_and_GAR_subset",
        "7_GAR_subset_only"
    ],
    "GAR_pos": [
        "5_LLPS_positive_set_and_NA_positive_set_and_GAR_subset"
    ],
    "GAR_neg": [
        "7_GAR_subset_only"
    ],
}


def _read_stripped_lines(path):
    """Return the lines of a text file with surrounding whitespace removed."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


# Build every set by concatenating the contents of its file(s), in definition order
set_dict = {}
for set_name, file_names in set_definitions.items():
    members = []
    for fname in file_names:
        members += _read_stripped_lines(f"{curr_wd}/data/processed/final_set_lists/{fname}.txt")
    set_dict[set_name] = members

# The full proteome is stored as the last set
full_proteome_path = f"{curr_wd}/data/processed/list_of_human_proteins.csv"
full_proteome = _read_stripped_lines(full_proteome_path)
set_dict["full_proteome"] = full_proteome

# Parallel views of the same data: names and member lists share the dict's order
set_names = list(set_dict.keys())
set_list = list(set_dict.values())

# Versioned container for all sets
proteins_sets_dict = {ver: set_dict}


In [None]:
from collections import defaultdict

def compute_metrics_for_region(seq):
    """
    Compute the physicochemical metrics of one sequence region.

    Parameters:
        seq (str): Amino acid sequence of the region.

    Returns:
        dict: Values for the keys "ncpr", "hydropathy", "kappa", "aromaticity",
        "flexibility", "scd" and "disorder_promo" (in that order). "flexibility"
        is the mean of the per-window flexibility values.
    """
    cider_params = SequenceParameters(seq)
    protein_analysis = ProteinAnalysis(seq)

    region_metrics = {}
    region_metrics["ncpr"] = cider_params.get_NCPR()
    region_metrics["hydropathy"] = cider_params.get_mean_hydropathy()
    region_metrics["kappa"] = cider_params.get_kappa()
    region_metrics["aromaticity"] = protein_analysis.aromaticity()
    if len(seq) > 0:
        region_metrics["flexibility"] = np.mean(protein_analysis.flexibility())
    else:
        region_metrics["flexibility"] = np.nan
    region_metrics["scd"] = cider_params.get_SCD()
    region_metrics["disorder_promo"] = cider_params.get_fraction_disorder_promoting()
    return region_metrics

# Names of the per-region metrics, in the order compute_metrics_for_region returns them.
# Used to write NaN placeholders for regions that cannot be evaluated.
region_metric_names = ["ncpr", "hydropathy", "kappa", "aromaticity", "flexibility", "scd", "disorder_promo"]


def _store_region_metrics(metrics, suffix, seq=None):
    """Write the region metrics of seq into metrics as '<name>_<suffix>'; NaN for every name when seq is None."""
    if seq is None:
        values = dict.fromkeys(region_metric_names, np.nan)
    else:
        values = compute_metrics_for_region(seq)
    for name, value in values.items():
        metrics[f"{name}_{suffix}"] = value


protein_dict_with_physchem_metrics = defaultdict(dict)
# Use the version key defined with the sets instead of a second hard-coded 'v3'
full_proteome = proteins_sets_dict[ver]['full_proteome']
motif_grouped = motif_info_set_df.groupby("UniqueID")
idr_grouped = annotated_IDR_df.groupby("protein_name")

for i, curr_protein in enumerate(full_proteome):
    # Only proteins that have both motif and IDR annotations can be processed
    if curr_protein not in motif_grouped.groups or curr_protein not in idr_grouped.groups:
        continue

    print(i, "/", len(full_proteome), curr_protein)

    # Per-residue disorder mask of the protein -> (start, stop, label) IDR stretches
    IDR_info_curr = idr_grouped.get_group(curr_protein)["prediction-disorder-mobidb_lite"].tolist()[0]
    IDR_bounds = count_consecutive_stretches_of_1(IDR_info_curr)

    for i2, row2 in motif_grouped.get_group(curr_protein).iterrows():
        motif = row2["motif"]
        full_seq = row2["full_seq"]
        motif_start = row2["start"]
        motif_end = row2["end"]
        motif_len = len(motif)

        metrics = {
            "full_seq": full_seq,
            "motif_seq": motif,
            "RG_count": row2["RG_count"],
            "RG_count_rel": row2["RG_count"] / motif_len,
            "RGG_count": row2["RGG_count"],
            "RGG_count_rel": row2["RGG_count"] / motif_len,
            "perc_impure": 100 - (row2["r_perc"] + row2["g_perc"]),
            "len_motif": motif_len,
            "len_motif_rel": motif_len / len(full_seq),
            "other_aas_array": ProteinAnalysis(motif).count_amino_acids(),
        }

        # Main motif region
        _store_region_metrics(metrics, "motif", motif)

        # First IDR stretch that contains the motif start or end; when none does,
        # the motif's own bounds are used, which leaves no room for flanking windows.
        motif_IDR_bounds = next(((start, end) for start, end, _ in IDR_bounds
                                 if is_either_between(start, end, motif_start, motif_end)), (motif_start, motif_end))
        left_diff = motif_start - motif_IDR_bounds[0]
        right_diff = motif_IDR_bounds[1] - motif_end

        # Regions: -40, -30, -20, -10, +10, +20, +30, +40 (10-residue windows).
        # A window is only evaluated when it lies completely inside the IDR.
        for offset in [10, 20, 30, 40]:
            if left_diff >= offset:
                left_window = full_seq[motif_start - offset: motif_start - offset + 10]
            else:
                left_window = None
            _store_region_metrics(metrics, f"motif-{offset}", left_window)

            if right_diff >= offset:
                right_window = full_seq[motif_end + offset - 10: motif_end + offset]
            else:
                right_window = None
            _store_region_metrics(metrics, f"motif+{offset}", right_window)

        # Full IDR region if valid
        if motif_IDR_bounds[1] > motif_IDR_bounds[0]:
            idr_seq = full_seq[motif_IDR_bounds[0]:motif_IDR_bounds[1]]
        else:
            idr_seq = None
        _store_region_metrics(metrics, "motifIDR", idr_seq)

        protein_dict_with_physchem_metrics[curr_protein][i2] = metrics

# Flatten nested dictionary structure into (protein, motif, metric, value) records
records = [
    (prot, mot, metric, data)
    for prot, motifs in protein_dict_with_physchem_metrics.items()
    for mot, metrics in motifs.items()
    for metric, data in metrics.items()
]

# Create DataFrame
protein_df = pd.DataFrame(records, columns=["prot", "mot", "metric", "data"])
print(f"Total entries before cleaning: {len(protein_df)}")

# Drop entries with missing data and renumber the rows. Chaining returns a fresh
# DataFrame, so the column assignment below does not write into a filtered view.
protein_df_cleaned = protein_df.dropna(subset=["data"]).reset_index(drop=True)
print(f"Total entries after dropping NA: {len(protein_df_cleaned)}")

# Assign group labels using the "assign_group" function and GAR sets.
# Sets give O(1) membership tests; assign_group runs once per row.
pos_set = set(proteins_sets_dict[ver]["GAR_pos"])
neg_set = set(proteins_sets_dict[ver]["GAR_neg"])
protein_df_cleaned["Group"] = protein_df_cleaned["prot"].apply(assign_group, args=(pos_set, "pos", neg_set, "neg"))

# Convert dtypes to most suitable ones
protein_df_cleaned = protein_df_cleaned.infer_objects()

# Save the DataFrame
output_path = f"{curr_wd}/data/results/protein_df_with_physchem_metrics_cleaned.pkl"
with open(output_path, "wb") as fp:
    pickle.dump(protein_df_cleaned, fp)
    print("DataFrame saved successfully to file.")


In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned protein DataFrame with physicochemical metrics
# (file written by the metrics cell of this notebook)
with open(os.path.join(curr_wd, 'data/results/protein_df_with_physchem_metrics_cleaned.pkl'), 'rb') as fp:
    protein_df_with_physchem_metrics_cleaned = pickle.load(fp)

# Define amino acid groupings for plotting
amino_acid_groupings = [
    ["F", "Y", "W"],              # Aromatic
    ["E", "D", "N", "Q"],         # Acidic and Amides
    ["L", "V", "I", "M", "C"],    # Hydrophobic
    ["K", "H"],                   # Basic (excluding R)
    ["A", "S", "T", "P"]          # Small/Polar
]
amino_acids_high_values = ["R", "G"]  # Shown separately due to high relevance

# Specify group and metric filters
categories_ordered = ["neg", "pos"]
metrics_ordered = ["other_aas_array"]

# Filter data for selected groups and metrics
cut_data = protein_df_with_physchem_metrics_cleaned[
    protein_df_with_physchem_metrics_cleaned["Group"].isin(categories_ordered)
]
cut_data = cut_data[cut_data['metric'].isin(metrics_ordered)]
cut_data = cut_data.infer_objects()

# Drop unnecessary columns
df = cut_data.drop(columns=["prot", "mot"])

# Expand the 'data' dictionary column into separate columns for each amino acid
data_expanded = pd.json_normalize(df['data'])

# Merge expanded data with the rest of the DataFrame
df_expanded = pd.concat([df.reset_index(drop=True), data_expanded], axis=1).drop(columns=['data'])

# Group by 'Group' and compute total counts for each amino acid.
# numeric_only=True keeps the string column 'metric' out of the aggregation; with the
# pandas >= 2.0 default it would be concatenated and break the row sums below.
group_sums = df_expanded.groupby('Group').sum(numeric_only=True)
print(group_sums)

# Calculate relative proportions (%) for each amino acid in each group
group_proportions = group_sums.div(group_sums.sum(axis=1), axis=0) * 100

# --- Background composition: residues inside predicted disordered regions of the reference proteome ---

sequences = extract_sequences_from_fasta(os.path.join(curr_wd, 'data/external/UniProt/UP000005640_9606.fasta'))
masks = annotated_IDR_df.set_index("protein_name")["prediction-disorder-mobidb_lite"].to_dict()
filtered_sequences = filter_sequences(sequences, masks)
avg_proportions, sem_proportions = compute_aa_proportions(filtered_sequences)

print("Average Proportions:", avg_proportions)
print("Sum of proportions:", sum(avg_proportions.values()))
print("Standard Error:", sem_proportions)

# --- Plotting ---

# Set bar width for the plots
bar_width = 0.30

# Create a 2-row grid for grouped AA categories and R/G, with custom size ratios
fig, ax = plt.subplots(
    2, 6, sharey='row', figsize=(11, 3),
    gridspec_kw={
        'height_ratios': [3, 1],
        'width_ratios': [len(g) for g in amino_acid_groupings] + [2]  # R and G in a wide last column
    }
)

# Plot each amino acid group
for i, group in enumerate(amino_acid_groupings):
    plot_group(ax, group_proportions, avg_proportions, group, group, bar_width, row=0, col=i)

# Plot R and G separately (on twin y-axes, since their proportions are much higher)
plot_high_values(ax, group_proportions, avg_proportions, amino_acids_high_values, bar_width, col=5)

# Layout and aesthetics adjustments
fig.tight_layout()
plt.subplots_adjust(hspace=0.09, wspace=0)

# Slightly adjust position of the R/G subplot to make space
for row in [0, 1]:
    pos = ax[row, 5].get_position()
    ax[row, 5].set_position([pos.x0 + 0.01, pos.y0, pos.width, pos.height])
    ax[row, 5].tick_params(axis='y', left=False)

# Add common legend (handles come from the labelled "neg"/"pos" bars of the first panel)
handles, labels = ax[0, 0].get_legend_handles_labels()
fig.legend(handles, ["negative", "positive"], loc="upper center", bbox_to_anchor=(0.5, 1.07), ncol=2)

# Save figure
os.makedirs(os.path.join(curr_wd, "data/results/subfigures/"), exist_ok=True)
plt.savefig(os.path.join(curr_wd, "data/results/subfigures/fig3_A.svg"), transparent=True)
plt.show()


In [None]:
import os
import matplotlib.pyplot as plt

# 2x2 grid: NCPR, aromaticity, hydrophobicity and fraction of disorder-promoting residues
fig, ax = plt.subplots(2, 2, sharex=True, figsize=(10, 5))

# Motif-centered bins shared by all panels; "" denotes the motif itself
xticks = ["-30", "-20", "-10", "", "+10", "+20", "+30"]
xticks_labels = ["motif" if el == "" else el + "aa" for el in xticks]

categories_ordered = ["neg", "pos"]

# One entry per panel: (axes, metric name, y-axis title, lower y limit, tick decimals)
panel_specs = [
    (ax[0][0], "ncpr", "NCPR\n(net charge per residue)", -0.55, 2),
    (ax[0][1], "aromaticity", "Aromaticity\n(rel freq. of F+Y+W)", -0.01, 2),
    (ax[1][0], "hydropathy", "Hydrophobicity", 0.5, 0),
    (ax[1][1], "disorder_promo", "Fraction of disorder-\npromoting residues", 0.4, 2),
]

for panel_ax, metric_name, ytitle, ylim_bottom, decimals in panel_specs:
    # Metric keys follow the pattern "<metric>_motif<bin>", e.g. "ncpr_motif-30"
    metrics_ordered = [f"{metric_name}_motif{el}" for el in xticks]
    parent_create_boxplot_with_dots(
        protein_df_with_physchem_metrics_cleaned,
        categories_ordered,
        metrics_ordered,
        ytitle,
        panel_ax,
        ylim_bottom,
        xticks_labels,
        decimal_places=decimals
    )

# --- Legend (assumes all plots share the same handles/labels) ---
handles, labels = fig.gca().get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2)

# Adjust layout and save figure
plt.tight_layout()
os.makedirs(os.path.join(curr_wd, "data/results/subfigures/"), exist_ok=True)
plt.savefig(os.path.join(curr_wd, "data/results/subfigures/fig3_B_C_D_E.svg"), transparent=True)


In [None]:
import os
import matplotlib.pyplot as plt

# Create 1x4 subplot grid for motif-level metrics
fig, ax = plt.subplots(1, 4, sharex=False, figsize=(8, 3))

# No x-tick labels (metrics are scalar, not per-position)
xticks = False
categories_ordered = ["neg", "pos"]

# --- Panel A: Motif length ---
metrics_ordered = ["len_motif"]
ytitle = "# of residues in motif [-]"
ylim_bottom = -0.55
decimals = 0
parent_create_boxplot_with_dots(
    protein_df_with_physchem_metrics_cleaned,
    categories_ordered,
    metrics_ordered,
    ytitle,
    ax[0],
    ylim_bottom=ylim_bottom,
    xticks=xticks,
    decimal_places=decimals
)

# --- Panel B: Motif impurity (% of non-RG residues) ---
metrics_ordered = ["perc_impure"]
ytitle = "Motif impurity [%]\n(perc. of non-RG residues)"
ylim_bottom = -0.55
decimals = 0
parent_create_boxplot_with_dots(
    protein_df_with_physchem_metrics_cleaned,
    categories_ordered,
    metrics_ordered,
    ytitle,
    ax[1],
    ylim_bottom=ylim_bottom,
    xticks=xticks,
    decimal_places=decimals
)

# --- Panel C: Relative frequency of RG duplets ---
# The original call passed `xticks` (False) positionally, which bound it to the
# `ylim_bottom` parameter and thereby pinned the lower y limit to 0. The arguments
# are now bound by keyword and the lower limit of 0 is stated explicitly, so the
# rendered panel is unchanged.
metrics_ordered = ["RG_count_rel"]
ytitle = "Frequency of RG\nduplets in motif [-]"
ylim_bottom = 0
decimals = 1
parent_create_boxplot_with_dots(
    protein_df_with_physchem_metrics_cleaned,
    categories_ordered,
    metrics_ordered,
    ytitle,
    ax[2],
    ylim_bottom=ylim_bottom,
    xticks=xticks,
    style='b',
    decimal_places=decimals,
    scatter=False
)

# --- Panel D: Relative frequency of RGG triplets ---
# Same keyword binding and explicit lower limit of 0 as in panel C
metrics_ordered = ["RGG_count_rel"]
ytitle = "Frequency of RGG\ntriplets in motif [-]"
ylim_bottom = 0
decimals = 2
parent_create_boxplot_with_dots(
    protein_df_with_physchem_metrics_cleaned,
    categories_ordered,
    metrics_ordered,
    ytitle,
    ax[3],
    ylim_bottom=ylim_bottom,
    xticks=xticks,
    style='b',
    decimal_places=decimals,
    scatter=False
)

# --- Shared Legend ---
handles, labels = fig.gca().get_legend_handles_labels()
fig.legend(
    handles, ["negative", "positive"],
    loc='upper center',
    bbox_to_anchor=(0.55, 1.08),
    ncol=2
)

# Final layout adjustments and export
plt.tight_layout()
os.makedirs(os.path.join(curr_wd, "data/results/subfigures/"), exist_ok=True)
plt.savefig(os.path.join(curr_wd, "data/results/subfigures/fig1_B_C_D_E.svg"), transparent=True)
