# Condensate Feature Prediction -- Rotation Project

### Load in data

In [19]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset, DataLoader, Subset
import psutil
from sklearn.model_selection import KFold
from scipy.stats import spearmanr, pearsonr
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
import pickle



In [None]:
small_pl_data = pd.read_csv("/Users/clairehsieh/OneDrive/Documents/UCLA/rotations/kalli_kappel/data/small_pool_data_with_seq_info_202507.csv", 
                             index_col=False)
list(small_pl_data.columns)
small_pl_data.shape

# large pool data
large_pl_data = pd.read_csv("/Users/clairehsieh/OneDrive/Documents/UCLA/rotations/kalli_kappel/data/large_pool_data_with_seq_info_202507.csv")
large_pl_data.shape
list(large_pl_data.columns)

# check all are the same length
lengths_sml = [len(small_pl_data["protein_seq"][i]) for i in range(small_pl_data.shape[0])]
set(lengths_sml)



  large_pl_data = pd.read_csv("/Users/clairehsieh/OneDrive/Documents/UCLA/rotations/kalli kappel/data/large_pool_data_with_seq_info_202507.csv")


{66}

### Preprocessing

In [21]:
# one hot encode protein seq
import numpy as np

# 20 standard amino acids (IUPAC one-letter codes)
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_index = {aa: i for i, aa in enumerate(amino_acids)}

def one_hot_encode(seq):
    # shape: (length_of_seq, 20)
    encoding = np.zeros((len(seq), len(amino_acids)), dtype=np.int8)
    for i, aa in enumerate(seq):
        if aa in aa_to_index:  # skip if non-standard residue
            encoding[i, aa_to_index[aa]] = 1
    return encoding

# # Example
# seq = "MTEYK"
# one_hot = one_hot_encode(seq)
# print(one_hot.shape)   # (5, 20)
# print(one_hot)

subset_data = large_pl_data[["protein_seq", "medium_GFP_fraction_cells_with_condensates"]]
subset_data = subset_data.dropna(subset=[subset_data.columns[1]])

seq_data = np.array([one_hot_encode(i) for i in subset_data["protein_seq"]])
labels = subset_data["medium_GFP_fraction_cells_with_condensates"]
seq_data.shape

(6994, 66, 20)

In [None]:
# write all seqs to fasta file
large_pl_data = large_pl_data.rename(columns={large_pl_data.columns[0]: "id"})

output_file = "/Users/clairehsieh/Library/CloudStorage/OneDrive-Personal/Documents/UCLA/rotations/kalli_kappel/data/large_protein.fasta"

with open(output_file, "w") as w:
    for i, row in large_pl_data.iterrows():
        w.write(f">{row['id']}\n")
        w.write(f"{row['protein_seq']}\n")
    
# run mmseqs2
# mmseqs easy-cluster /Users/clairehsieh/Library/CloudStorage/OneDrive-Personal/Documents/UCLA/rotations/kalli_kappel/data/large_protein.fasta clusterRes tmp --min-seq-id 0.9 -c 0.9 --cov-mode 1


### Model

In [None]:

class Config:
    def __init__(self,
                 output_dir,
                 seed=0,
                 num_folds=5,
                 max_epochs=100,
                 lr=0.001,
                 batch_size=10,
                 input_dim=100,
                 hidden_dim=512,
                 output_dim=1,
                 num_epochs=500,
                 num_layers=3,
                 verbose=False,
                 layernorm=False,
                 dropout=False,
                 dropout_rate=0.05, 
                 early_stopping=True,
                 patience = 30,
                 delta = 0.05):       
        self.layernorm = layernorm
        self.dropout = dropout
        self.dropout_rate = dropout_rate
        if torch.backends.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")       
            
        # training parameters
        self.seed = seed
        self.num_folds = num_folds
        self.max_epochs = max_epochs
        self.lr = lr
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_epochs = num_epochs
        self.num_layers = num_layers
        self.verbose = verbose
        self.early_stopping = early_stopping
        self.patience = patience    
        self.delta = delta
        self.output_dir = f"{output_dir}/seed_{seed}/lr_{lr}/hidden_dim_{hidden_dim}/num_layers_{num_layers}/batch_size_{batch_size}/"
        if not os.path.exists(self.output_dir): os.makedirs(self.output_dir)
        if dropout: self.output_dir += f"dropout_{dropout_rate}/"
        if layernorm: self.output_dir += f"layernorm/"
        if early_stopping: self.output_dir += f"early_stopping_patience_{patience}_delta_{delta}/"

    def __str__(self):
        return '\n'.join(f'{k}: {v}' for k, v in vars(self).items())

    def __repr__(self):
        return self.__str__()

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_dim
        self.num_layers = config.num_layers
        self.input_dim = config.input_dim
        
        self.mlp = nn.Sequential(
            nn.Linear(self.input_dim , self.hidden_dim),
            nn.LeakyReLU()
        ).to(config.device)
        
        # final layer always outputs 1 value per sample
        self.final = nn.Linear(self.hidden_dim, 1).to(config.device)
        
        self.layer_norm = nn.LayerNorm(self.hidden_dim).to(config.device)
        self.logZ = nn.Parameter(torch.ones(1, device=config.device))

    def forward(self, x):
        output = self.mlp(x)        # [batch_size, hidden_dim]
        output = self.final(output) # [batch_size, 1]
        return output.squeeze(-1)   # [batch_size]

    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)


In [17]:
# data: (6994, 66, 20)
#       (batch, seq_len, amino acid)
# linearize: (6994, 1320)


config = Config(output_dir = "/Users/clairehsieh/OneDrive/Documents/UCLA/rotations/kalli kappel/results",
                batch_size=10, input_dim=1320, output_dim=1)

class CondensateDataset(Dataset):
    def __init__(self, x, y):
        # x: 2D array-like [n_samples, n_features]
        # y: 1D array-like [n_samples]
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

#### UTILS ##### 
class EarlyStopping:
    def __init__(self, patience=15, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_model_state = None
    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.best_model_state = model.state_dict()
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_model_state = model.state_dict()
            self.counter = 0
    def load_best_model(self, model):
        model.load_state_dict(self.best_model_state)

def log_memory_usage():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    print(f"Memory Usage: {mem_info.rss / (1024 ** 2):.2f} MB")

def mean(ls):
    return sum(ls) / len(ls)

def compute_spearman_correlation(actuals, predictions):
    try:
        if np.all(actuals == actuals[0]) or np.all(predictions == predictions[0]):
            print("Warning: Constant array detected - correlation undefined")
            return 0          
        correlation, _ = spearmanr(actuals, predictions)
        return correlation
    except Exception as e:
        print(f"Error computing correlation: {e}")
        return None

def compute_pearson_correlation(actuals, predictions):
    if isinstance(actuals, torch.Tensor):
        actuals = actuals.cpu().numpy()
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    correlation, _ = pearsonr(actuals, predictions)
    return correlation

def plot_epoch_losses(epoch_train_losses, epoch_val_losses, filename, output_dir):
    print(f"Train Losses: {epoch_train_losses}")
    print(f"Val Losses: {epoch_val_losses}")
    plt.figure()
    plt.plot(epoch_train_losses, label='Training Loss')
    plt.plot(epoch_val_losses, label='Validation Loss')
    plt.title(f'Training and Validation Loss | Fold {filename}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    plt.savefig(output_dir + f"Fold{filename}_loss_plot.jpg")
    plt.close()

def plot_fold_scatter(val_actual, val_pred, train_actual, train_pred, correlations, fold, output_dir):   
    # Plot scatterplot for validation data
    plt.figure(figsize=(8, 6))
    plt.scatter(val_actual, val_pred, alpha=0.6, label="Validation Data", color="orange")
    plt.plot([min(val_actual), max(val_actual)], [min(val_actual), max(val_actual)], 'r--', label="Ideal Fit")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title("Validation Data: Actual vs Predicted")
    plt.legend()
    # Add correlations as text on the validation plot
    plt.text(
        0.95, 0.05,
        f"Spearman: {correlations['val_spearman']:.2f}\nPearson: {correlations['val_pearson']:.2f}",
        transform=plt.gca().transAxes,
        fontsize=10,
        verticalalignment='bottom',
        horizontalalignment='right',
        bbox=dict(boxstyle="round", facecolor="white", alpha=0.5)
    )
    plt.savefig(os.path.join(output_dir, f"validation_fold_{fold}.png"))
    plt.close()

    # Plot scatterplot for training data
    plt.figure(figsize=(8, 6))
    plt.scatter(train_actual, train_pred, alpha=0.6, label="Training Data", color="blue")
    plt.plot([min(train_actual), max(train_actual)], [min(train_actual), max(train_actual)], 'r--', label="Ideal Fit")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title("Training Data: Actual vs Predicted")
    plt.legend()
     # Add correlations as text on the training plot
    plt.text(
        0.95, 0.05,
        f"Spearman: {correlations['train_spearman']:.2f}\nPearson: {correlations['train_pearson']:.2f}",
        transform=plt.gca().transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox=dict(boxstyle="round", facecolor="white", alpha=0.5)
    )
    plt.savefig(os.path.join(output_dir, f"training_fold_{fold}.jpg"))
    plt.close()

def plot_all_folds_losses(losses, output_dir, filename="all_folds"):
    """
    losses: dict with keys 'train_losses' and 'val_losses'
            values are lists of lists, one list per fold
            e.g. losses['train_losses'][i] = list of training losses for fold i
    output_dir: directory to save plot
    filename: filename prefix for saved plot
    """
    train_folds = losses["train_losses"]
    val_folds = losses["val_losses"]

    n_folds = len(train_folds)
    fig, axes = plt.subplots(1, n_folds, figsize=(5 * n_folds, 4), sharey=True)

    if n_folds == 1:  # make axes iterable if only one subplot
        axes = [axes]

    for i, ax in enumerate(axes):
        ax.plot(train_folds[i], label="Training Loss")
        ax.plot(val_folds[i], label="Validation Loss")
        ax.set_title(f"Fold {i+1}")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        ax.legend()

    plt.tight_layout()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    plt.savefig(os.path.join(output_dir, f"{filename}_loss_plot.jpg"))
    plt.close()

import matplotlib.pyplot as plt
import os

import matplotlib.pyplot as plt
import os

def plot_all_folds_scatter(fold_outputs, all_correlations, output_dir, filename="all_folds_scatter"):
    """
    fold_outputs: dict with keys
        'fold_train_actual', 'fold_train_pred',
        'fold_val_actual', 'fold_val_pred'
        Each is a list of lists (one per fold).

    all_correlations: list of dicts (one per fold), each dict has keys
        'train_spearman', 'val_spearman', 'train_pearson', 'val_pearson'

    output_dir: directory to save the figure
    filename: filename prefix for saved plot
    """

    n_folds = len(fold_outputs["fold_train_actual"])
    fig, axes = plt.subplots(2, n_folds, figsize=(5 * n_folds, 10), sharey=True)

    if n_folds == 1:  # axes will be 1D
        axes = axes[:, None]  # make it 2x1 array

    for i in range(n_folds):
        # Training subplot (top row)
        ax_train = axes[0, i]
        train_actual = fold_outputs["fold_train_actual"][i]
        train_pred = fold_outputs["fold_train_pred"][i]
        ax_train.scatter(train_actual, train_pred, alpha=0.6, color="blue", label="Training")
        ax_train.plot([min(train_actual), max(train_actual)],
                      [min(train_actual), max(train_actual)],
                      'r--', label="Ideal Fit")
        ax_train.set_xlabel("Actual Values")
        ax_train.set_ylabel("Predicted Values")
        ax_train.set_title(f"Fold {i+1} Training")
        ax_train.legend()
        # Correlation text
        corr_val = all_correlations[i]
        ax_train.text(
            0.95, 0.05,
            f"Spearman: {corr_val['train_spearman']:.2f}\nPearson: {corr_val['train_pearson']:.2f}",
            transform=ax_train.transAxes,
            fontsize=10,
            verticalalignment='bottom',
            horizontalalignment='right',
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.5)
        )

        # Validation subplot (bottom row)
        ax_val = axes[1, i]
        val_actual = fold_outputs["fold_val_actual"][i]
        val_pred = fold_outputs["fold_val_pred"][i]
        ax_val.scatter(val_actual, val_pred, alpha=0.6, color="orange", label="Validation")
        ax_val.plot([min(val_actual), max(val_actual)],
                    [min(val_actual), max(val_actual)],
                    'r--', label="Ideal Fit")
        ax_val.set_xlabel("Actual Values")
        ax_val.set_ylabel("Predicted Values")
        ax_val.set_title(f"Fold {i+1} Validation")
        ax_val.legend()
        # Correlation text
        ax_val.text(
            0.95, 0.05,
            f"Spearman: {corr_val['val_spearman']:.2f}\nPearson: {corr_val['val_pearson']:.2f}",
            transform=ax_val.transAxes,
            fontsize=10,
            verticalalignment='bottom',
            horizontalalignment='right',
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.5)
        )

    plt.tight_layout()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    plt.savefig(os.path.join(output_dir, f"{filename}.png"))
    plt.close()

def get_predictions(model, dataloader):
    model.eval()
    all_actuals = []
    all_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            x,y = batch
            output = model(x)
            all_actuals.extend(y.cpu().numpy())
            all_predictions.extend(output.cpu().numpy())
    all_actuals = np.array(all_actuals)
    all_predictions = np.array(all_predictions)
    return all_actuals, all_predictions

def plot_prediction_results(actual, predicted, filename = None):
    plt.scatter(actual, predicted)
    plt.xlabel("actual")
    plt.ylabel("predicted")
    plt.title("Actual vs Predicted")
    plt.show()
    if filename: plt.savefig(filename)

def train(dataset, config):    
    output_dir = config.output_dir
    fold_outputs = {"fold_train_actual":[], 
                    "fold_train_pred":[],
                     "fold_val_actual":[], 
                     "fold_val_pred":[]}
    logs = {"fold_actuals":[],
            "fold_preds":[],
            "fold_train_actuals":[],
            "fold_train_preds":[],
            "fold_train_losses":[],
            "fold_val_losses":[],
            "train_num_genes":[],
            "all_train_labels":[],
            "val_num_genes":[],
            "all_val_labels":[],
            "f1_accuracy":{},
            "all_correlation":[]}
    loss_fn = nn.MSELoss()
    training_logs = {"train_actual" : [],
                    "train_pred" : [],
                    "val_actual" : [],
                    "val_pred" : [],
                    "epoch_train_losses" : [],
                    "epoch_val_losses" : []}
    model_output_dir = output_dir + "/model/"
    if not os.path.exists(model_output_dir): os.makedirs(model_output_dir)

    losses = {"train_losses":[], "val_losses":[]}
    all_correlations = []
    #### Training loop ##### 

    # kfold split
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kf.split(range(len(dataset)))):
        model = MLP(config)
        train_dataloader = DataLoader(Subset(dataset, train_idx), batch_size=config.batch_size, shuffle=True, num_workers=0)
        val_dataloader = DataLoader(Subset(dataset, val_idx), batch_size=config.batch_size, shuffle=False, num_workers=0)
        epoch_train_losses = []
        epoch_val_losses = []
        val_labels = []
        # for fold in range(num_folds):
        filename = fold
        optimizer = optim.Adam(model.parameters(), lr=config.lr)
        if config.early_stopping:
            early_stopping =  EarlyStopping(patience=config.patience, delta=config.delta)
        print(f'Fold {fold+1}/{config.num_folds}')
        log_memory_usage()

        for epoch in tqdm(range(config.max_epochs)):
            model.train()
            train_losses = []
            val_losses = []
            for batch in train_dataloader:
                x, y = batch
                x = x.to(config.device)
                y = y.to(config.device)
                optimizer.zero_grad()
                device = next(model.parameters()).device  
                output = model(x)
                output.requires_grad_(True)  
                loss = loss_fn(output.float(), y.float())
                train_losses.append(loss.item())
                loss.backward()
                optimizer.step()
            model.eval()
            with torch.no_grad():
                for batch in val_dataloader:
                    x, y = batch
                    x = x.to(config.device)
                    y = y.to(config.device)
                    output = model(x)
                    loss = loss_fn(output.float(), y.float())                
                    val_losses.append(loss.item())
                epoch_train_losses.append(mean(train_losses))
                epoch_val_losses.append(mean(val_losses))
                if config.early_stopping:
                    early_stopping(mean(val_losses), model)
                    if early_stopping.early_stop:
                        print(f"Early stopping at epoch {epoch}")
                        early_stopping.load_best_model(model)
                        break
        
        logs["all_val_labels"].append(val_labels)
        logs["fold_train_losses"].append(epoch_train_losses)
        logs["fold_val_losses"].append(epoch_val_losses)
        train_actual, train_pred = get_predictions(model, train_dataloader)
        val_actual, val_pred = get_predictions(model, val_dataloader)
        logs["fold_actuals"].append(val_actual)
        logs["fold_preds"].append(val_pred)
        logs["fold_train_actuals"].append(train_actual)
        logs["fold_train_preds"].append(train_pred)
        correlations = {
            "train_spearman": compute_spearman_correlation(train_actual, train_pred) or 0,
            "val_spearman": compute_spearman_correlation(val_actual, val_pred) or 0,
            "train_pearson": compute_pearson_correlation(train_actual, train_pred) or 0,
            "val_pearson": compute_pearson_correlation(val_actual, val_pred) or 0
        }
        all_correlations.append(correlations)
        logs["all_correlation"].append(correlations)
        # plot_epoch_losses(epoch_train_losses, epoch_val_losses, filename, output_dir)
        # plot_fold_scatter(val_actual, val_pred, train_actual, train_pred, correlations, fold, output_dir)        

        losses["train_losses"].append(epoch_train_losses), losses["val_losses"].append(epoch_val_losses)
        fold_outputs["fold_train_actual"].append(train_actual), fold_outputs["fold_train_pred"].append(train_pred), fold_outputs["fold_val_actual"].append(val_actual), fold_outputs["fold_val_pred"].append(val_pred), 
        log_memory_usage()
        torch.save(model.state_dict(), os.path.join(model_output_dir, f"model_fold_{fold}.pth"))
        # plot_correlations(logs["all_correlation"], output_dir)

    # Save logs and fold_outputs as pkl
    plot_all_folds_losses(losses, output_dir, filename="all_folds")
    plot_all_folds_scatter(fold_outputs, all_correlations, output_dir, filename="all_folds_scatter")

    logs_file = os.path.join(output_dir, "logs.pkl")
    with open(logs_file, "wb") as f:
        pickle.dump(logs, f)
    print(f"Logs outputs saved to {logs_file}")
    fold_outputs_file = os.path.join(output_dir, "fold_outputs.pkl")
    with open(fold_outputs_file, "wb") as f:
        pickle.dump(fold_outputs, f)
    print(f"Fold outputs saved to {fold_outputs_file}")

    return model, logs, fold_outputs, losses


In [18]:
seq_data = torch.tensor(seq_data.reshape(seq_data.shape[0], -1), dtype=torch.float32).to(config.device)
labels   = torch.tensor(labels, dtype=torch.float32).to(config.device)
dataset = CondensateDataset(seq_data, labels)

model = MLP(config)
model, logs, fold_outputs, losses = train(dataset, config)


  seq_data = torch.tensor(seq_data.reshape(seq_data.shape[0], -1), dtype=torch.float32).to(config.device)
  labels   = torch.tensor(labels, dtype=torch.float32).to(config.device)
  self.x = torch.tensor(x, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.float32)


Fold 1/5
Memory Usage: 98.75 MB


 30%|███       | 30/100 [00:40<01:35,  1.36s/it]

Early stopping at epoch 30





Memory Usage: 136.86 MB
Fold 2/5
Memory Usage: 143.36 MB


 30%|███       | 30/100 [00:40<01:34,  1.35s/it]

Early stopping at epoch 30





Memory Usage: 144.91 MB
Fold 3/5
Memory Usage: 147.48 MB


 30%|███       | 30/100 [00:40<01:35,  1.36s/it]

Early stopping at epoch 30





Memory Usage: 142.53 MB
Fold 4/5
Memory Usage: 145.12 MB


 30%|███       | 30/100 [00:40<01:35,  1.36s/it]

Early stopping at epoch 30





Memory Usage: 118.77 MB
Fold 5/5
Memory Usage: 123.75 MB


 30%|███       | 30/100 [00:41<01:37,  1.39s/it]

Early stopping at epoch 30





Memory Usage: 110.41 MB
Logs outputs saved to /Users/clairehsieh/OneDrive/Documents/UCLA/rotations/kalli kappel/results/seed_0/lr_0.001/hidden_dim_512/num_layers_3/batch_size_10/early_stopping_patience_30_delta_0.05/logs.pkl
Fold outputs saved to /Users/clairehsieh/OneDrive/Documents/UCLA/rotations/kalli kappel/results/seed_0/lr_0.001/hidden_dim_512/num_layers_3/batch_size_10/early_stopping_patience_30_delta_0.05/fold_outputs.pkl


# Notes
- First, a set of 79 “base sequences” was selected for extensive
mutagenesis. These sequences include 43 of the fragments of natural protein sequences
and one designed variant from the small sequence library, for a total of 44 “Class 1 base
sequences.” Another 35 fragments of natural protein sequences (“Class 2 base
sequences”) were further selected from PhasePro, excluding protein sequences that were
annotated as partner dependent. To this end, all protein regions annotated to drive phase
separation were examined, and all possible 66 amino acid fragments with amino acid
composition and dipeptide composition similarity (Pearson correlation coefficient (r2)) to
the Class 1 base sequences and to each other of less than 0.6 (“Class 3 sequences”)
were identified. 35 of these sequences, including at most one sequence fragment per
protein and prioritizing sequence fragments with the highest amino acid composition
similarity to the full protein region as well as regions that were predicted to be more
disordered.

- The natural protein sequence fragment set includes: (1) all base sequences, (2) all
remaining Class 3 sequences (283 sequences), (3) Class 4A sequences (519
sequences): fragments from disordered (as annotated by MobiDB36) sequences from
LLPSDB37 that were annotated either as phase separating or not phase separating with
maximum amino acid composition and dipeptide correlation of 0.8 to each other and to
all base sequences and all Class 3 sequences; and (4) Class 4B sequences (798
sequences): disordered regions (as annotated by MobiDB) from Disprot38 (release
2022_03) with maximum amino acid composition and dipeptide correlation of 0.6 to each
other and to all base sequences, Class 3 sequences, and Class 4A sequences.