In [1]:
import pandas as pd
import re

# ------------------------
# helpers
# ------------------------

def parse_score_vector(s):
    """
    Turn the per-residue score cell (third input column) into a list of floats.

    Values may be separated by commas, whitespace, or a mix of both, e.g.
      "0.12, 0.34, 0.56"
      "0.12 0.34 0.56"

    A missing value (NaN/None) or a blank string yields an empty list.
    A token that float() cannot parse raises ValueError.
    """
    if pd.isna(s):
        return []

    cleaned = str(s).strip()
    values = []
    # Splitting can leave empty tokens at the edges (or a single empty token
    # for blank input); those are skipped rather than parsed.
    for token in re.split(r"[,\s]+", cleaned):
        if token:
            values.append(float(token))
    return values


def parse_hotspot_string(s, L):
    """
    Map the hotspot-region cell (fourth input column) onto sequence positions.

    Recognised notations:
      R-style pairs   "c(28, 33), c(79, 86)"
      generic pairs   "28-33; 79-86" or "28,33 79,86"
    The generic form is only tried when no R-style pair is found.

    Region bounds are 1-based and inclusive. The result is a list of length L
    whose entry for each covered position is the label "start-end" and None
    elsewhere. Parts of a region outside 1..L are ignored, a region whose
    start lies after its end covers nothing, and where regions overlap the
    one listed last wins.
    """
    labels = [None] * L

    # Missing or blank cell: no hotspots at all.
    if pd.isna(s) or not str(s).strip():
        return labels

    raw = str(s)
    pairs = re.findall(r"c\(\s*(\d+)\s*,\s*(\d+)\s*\)", raw) or re.findall(
        r"(\d+)\s*[-,]\s*(\d+)", raw
    )

    for first_txt, last_txt in pairs:
        start, end = int(first_txt), int(last_txt)
        tag = f"{start}-{end}"
        # 1-based inclusive [start, end] -> 0-based half-open [start - 1, end),
        # clamped to the valid index range of the sequence.
        for i in range(max(start - 1, 0), min(end, L)):
            labels[i] = tag

    return labels

# ------------------------
# main transform
# ------------------------

def make_long_table(input_path="output.csv", output_path="output_long.csv"):
    """
    Expand a per-protein CSV into one row per residue and write it as CSV.

    Input columns are taken by position, not by name:
      col 0 = sequence
      col 1 = overall protein amyloid score
      col 2 = per-residue scores   (parsed with parse_score_vector)
      col 3 = hotspot regions      (parsed with parse_hotspot_string)

    Output columns: protein_id (1-based input row number), sequence,
    overall_score, position (1-based), aa, aa_score, region.

    Rows with a missing sequence contribute no residues. When the sequence
    and the score vector differ in length, only the first
    min(len(sequence), len(scores)) residues are kept and a warning is issued.

    Parameters:
      input_path  -- path of the CSV to read
      output_path -- path of the CSV to write

    Returns the long-format pandas DataFrame that was written to output_path.
    Raises IndexError if the input has fewer than four columns.
    """
    import warnings

    df = pd.read_csv(input_path)

    seq_col = df.columns[0]
    overall_col = df.columns[1]
    scores_col = df.columns[2]
    regions_col = df.columns[3]

    out_columns = [
        "protein_id",
        "sequence",
        "overall_score",
        "position",
        "aa",
        "aa_score",
        "region",
    ]
    long_rows = []

    per_protein = zip(df[seq_col], df[overall_col], df[scores_col], df[regions_col])
    # enumerate() keeps protein_id equal to the 1-based input row number
    # without depending on the DataFrame's index labels.
    for protein_id, (seq_raw, overall, scores_raw, regions_raw) in enumerate(
        per_protein, start=1
    ):
        # str(NaN) is "nan", which would otherwise be emitted as the three
        # residues 'n', 'a', 'n'; treat a missing sequence as empty instead.
        seq = "" if pd.isna(seq_raw) else str(seq_raw)
        scores = parse_score_vector(scores_raw)

        n_residues = min(len(seq), len(scores))
        if len(seq) != len(scores):
            warnings.warn(
                f"input row {protein_id}: sequence has {len(seq)} residues but "
                f"{len(scores)} scores were parsed; keeping the first {n_residues}",
                stacklevel=2,
            )

        hotspot_labels = parse_hotspot_string(regions_raw, n_residues)

        # hotspot_labels has exactly n_residues entries, so zip() stops there.
        for position, (aa, aa_score, region) in enumerate(
            zip(seq, scores, hotspot_labels), start=1
        ):
            long_rows.append(
                {
                    "protein_id": protein_id,
                    "sequence": seq,           # full sequence, repeated per residue
                    "overall_score": overall,  # per-protein score, repeated per residue
                    "position": position,      # 1-based position
                    "aa": aa,                  # amino acid at this position
                    "aa_score": aa_score,      # single residue score
                    "region": region,          # region label or None
                }
            )

    # Passing the column list keeps the header even when no rows were
    # produced, so the output file can always be read back with read_csv.
    out = pd.DataFrame(long_rows, columns=out_columns)
    out.to_csv(output_path, index=False)
    return out


# Entry point: build the long table with explicit input and output file names.
if __name__ == "__main__":
    make_long_table(input_path="output.csv", output_path="output_long.csv")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the per-residue (long-format) table written by make_long_table.
df = pd.read_csv("output_long.csv")

In [None]:
# Display the table as loaded from output_long.csv.
df

In [None]:
# Display names for the proteins; list position i holds the name used for
# protein_id i + 1.
protein_names = [
    "RPS2_human",
    "RPS6_human",
    "RPL27_human",
    "RPL36_human",
    "GAPDH_human_iso1",
    "GAPDH_human_iso2",
    "APP_human",
]

In [None]:
# Swap each numeric protein_id (1, 2, ...) for the name at the matching
# position in protein_names; ids without a name are left as they are.
# The column is reassigned instead of calling replace(..., inplace=True) on
# df["protein_id"]: the in-place form is chained assignment on a temporary
# Series, which recent pandas warns about and which leaves df unchanged when
# Copy-on-Write is enabled.
df["protein_id"] = df["protein_id"].replace(dict(enumerate(protein_names, start=1)))

In [None]:
# Display df again to check the protein_id column after the relabeling step.
df

In [None]:
# Build a slimmer per-residue frame without the "sequence" and "overall_score"
# columns; the result is bound to a new name and df is not reassigned.
new_df = df.drop(columns=["sequence", "overall_score"])

In [None]:
# Display the reduced table.
new_df

In [None]:
# Collapse the region column into a binary flag: 1 where a value other than 0
# is present, 0 where it is missing (or already 0, so a re-run is harmless).
new_df['region'] = new_df['region'].fillna(0).ne(0).astype(int)

In [None]:
# Display new_df to check the binary region column.
new_df

In [None]:
# Write the final per-residue table to disk; index=False leaves the DataFrame
# index out of the file.
new_df.to_csv('output_all.csv', index=False)