In [1]:
import json
import logging
import os
import re
from typing import Dict, List, Optional

import pandas as pd

# Logging setup: basicConfig points the root logger at "script.log" with INFO
# level, and an additional stream handler with the same record layout is
# attached to the root logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="script.log",
)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console = logging.StreamHandler()
console.setFormatter(formatter)
console.setLevel(logging.INFO)
# getLogger() with no name returns the root logger, same as getLogger("").
logging.getLogger().addHandler(console)


# Sequon anchored at the asparagine: N, any residue except P, then S or T,
# and not followed by P. The trailing check is a negative lookahead rather
# than a consuming `[^P]` so that a sequon whose S/T is the last residue of
# the chain still matches.
_SEQUON_PATTERN = re.compile(r"N[^P][ST](?!P)")


def is_canonical_glycosylation(sequence: str, position: int) -> bool:
    """Return True if an N-X-S/T sequon starts at the given residue position.

    Args:
        sequence: Amino-acid sequence in one-letter code.
        position: 1-based index of the candidate asparagine, i.e. the residue
            examined is ``sequence[position - 1]``.

    Returns:
        True when the residue at ``position`` is ``N``, the next residue is
        not ``P``, the one after that is ``S`` or ``T``, and the residue
        following it (if the sequence has one) is not ``P``. False otherwise,
        including when ``position`` is below 1 or beyond the sequence.
    """
    if position < 1:
        # Positions are 1-based; nothing below 1 can name a residue.
        return False
    # Pattern.match(string, pos) only succeeds if the match begins exactly at
    # `pos`, so neighbouring sequons cannot produce a false positive.
    return _SEQUON_PATTERN.match(sequence, position - 1) is not None


def validate_csv(
    file_path: str, expected_fields: List[str]
) -> Optional[pd.DataFrame]:
    """Load a CSV file and confirm that it contains every expected column.

    Failures are logged at ERROR level and signalled by returning ``None``
    rather than by raising, so callers can simply check the result.

    Args:
        file_path: Path of the CSV file to read.
        expected_fields: Column names that must all be present in the file.

    Returns:
        The loaded DataFrame, or ``None`` if the file does not exist, cannot
        be parsed as CSV (including a completely empty file), or lacks one
        or more of the expected columns.
    """
    # Keep the try body down to the one call that can raise.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return None
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # An empty or malformed file is reported the same way as a missing one
        # instead of escaping as an unhandled pandas exception.
        logging.error(f"Could not parse CSV file {file_path}: {err}")
        return None

    missing_fields = set(expected_fields) - set(df.columns)
    if missing_fields:
        logging.error(f"Missing fields in {file_path}: {missing_fields}")
        return None
    return df


def _parse_glycan_positions(raw_positions: object) -> List[int]:
    """Turn one non-missing glycan-position cell into a list of integers."""
    if isinstance(raw_positions, str):
        # Skip empty tokens so a trailing or doubled comma does not raise.
        tokens = (token.strip() for token in raw_positions.split(","))
        return [int(token) for token in tokens if token]
    # pandas infers a numeric dtype when every cell of the column holds a
    # single number, so the cell can arrive as an int/float instead of a str.
    return [int(raw_positions)]


def _collect_glycans(
    virus_name: object,
    sequence: str,
    raw_positions: object,
    glycan_residues: str,
) -> List[Dict[str, object]]:
    """Build the glycan entries for one viral sequence, warning on rejects."""
    if pd.isna(raw_positions):
        return []
    glycans: List[Dict[str, object]] = []
    for position in _parse_glycan_positions(raw_positions):
        if is_canonical_glycosylation(sequence, position):
            glycans.append({"residues": glycan_residues, "position": position})
        else:
            logging.warning(
                f"Non-canonical glycosylation pattern found for {virus_name} at position {position}"
            )
    return glycans


def generate_json_files(
    virus_file: str,
    antibody_file: str,
    glycan_residues: str,
    seeds: Optional[List[str]] = None,
    viral_sequence_count: int = 3,
    antibody_sequence_vh_count: int = 3,
    antibody_sequence_vl_count: int = 3,
    output_dir: str = "output",
) -> None:
    """Write one JSON job file per (virus, sequence type, antibody) combination.

    Both CSV files are validated first; if either is missing, unreadable or
    lacks a required column, the problem is logged by ``validate_csv`` and the
    function returns without writing anything.

    For every virus row and for each of its two sequence columns
    ("cropped sequence with foldon" and "Full Sequence") that is not empty,
    one file is written per antibody row. Each file holds the viral chain
    (with its glycans) plus the antibody VH and VL chains. A listed glycan
    position is only kept when ``is_canonical_glycosylation`` accepts it;
    rejected positions are logged as warnings, once per virus and sequence
    type.

    Args:
        virus_file: Path of the virus CSV file.
        antibody_file: Path of the antibody CSV file.
        glycan_residues: Value stored under "residues" for every glycan.
        seeds: Values stored under "modelSeeds". Defaults to the strings
            "1" through "12" when omitted.
        viral_sequence_count: "count" written for the viral chain.
        antibody_sequence_vh_count: "count" written for the VH chain.
        antibody_sequence_vl_count: "count" written for the VL chain.
        output_dir: Root directory for the generated files; the
            "cropped_sequence_with_foldon" and "full_sequence"
            sub-directories are created beneath it.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    model_seeds = (
        [str(i) for i in range(1, 13)] if seeds is None else list(seeds)
    )

    # Validate virus CSV file
    virus_fields = [
        "Virus GPC",
        "Full Sequence",
        "cropped sequence with foldon",
        "SSP",
        "GP1",
        "GP2",
        "TM and C Term",
        "glycoslated positons full",
        "glycosylated positions cropped",
        "genebank",
        "PDB",
        "PDB (partial)",
    ]
    viruses = validate_csv(virus_file, virus_fields)
    if viruses is None:
        return

    # Validate antibody CSV file
    antibody_fields = ["name", "VH", "VL", "Target", "Epitope", "Publication", "PDB"]
    antibodies = validate_csv(antibody_file, antibody_fields)
    if antibodies is None:
        return

    # (sequence column, glycan-position column, output directory). The column
    # names, typos included, must match the CSV headers exactly.
    variants = [
        (
            "cropped sequence with foldon",
            "glycosylated positions cropped",
            f"{output_dir}/cropped_sequence_with_foldon",
        ),
        (
            "Full Sequence",
            "glycoslated positons full",
            f"{output_dir}/full_sequence",
        ),
    ]
    for _, _, directory in variants:
        os.makedirs(directory, exist_ok=True)

    for _, virus in viruses.iterrows():
        virus_name = virus["Virus GPC"]
        for sequence_column, positions_column, directory in variants:
            sequence = virus[sequence_column]
            if pd.isna(sequence):
                continue

            # Glycans depend only on the virus and the sequence type, so they
            # are resolved once here instead of once per antibody.
            glycans = _collect_glycans(
                virus_name, sequence, virus[positions_column], glycan_residues
            )

            for _, antibody in antibodies.iterrows():
                antibody_name = str(antibody["name"])
                json_data = {
                    "name": f"{virus_name}_{antibody_name}".replace(
                        " ", ""
                    ).replace(".", "_"),
                    "modelSeeds": model_seeds,
                    "sequences": [
                        {
                            "proteinChain": {
                                "sequence": sequence,
                                "glycans": list(glycans),
                                "count": viral_sequence_count,
                            }
                        },
                        {
                            "proteinChain": {
                                "sequence": antibody["VH"],
                                "count": antibody_sequence_vh_count,
                            }
                        },
                        {
                            "proteinChain": {
                                "sequence": antibody["VL"],
                                "count": antibody_sequence_vl_count,
                            }
                        },
                    ],
                }

                # Save JSON file in the directory for this sequence type
                filename = (
                    f"{directory}/{virus_name}_{antibody_name.replace('.', '_')}.json"
                )
                with open(filename, "w", encoding="utf-8") as file:
                    json.dump(json_data, file, indent=2)
                logging.info(f"Generated JSON file: {filename}")


# Example run: input CSV files, the glycan residue code to attach, and the
# model seeds "1" through "12".
virus_file = "Arenavirus_GPC_Antibody_List_virus.csv"
antibody_file = "Arenavirus_GPC_Antibody_List_AB.csv"
glycan_residues = "MAN"
seeds = list(map(str, range(1, 13)))

# Record the run parameters before generating anything.
logging.info("Input files: %s, %s", virus_file, antibody_file)
logging.info("Glycan residues: %s", glycan_residues)
logging.info("Seeds: %s", seeds)

generate_json_files(
    virus_file,
    antibody_file,
    glycan_residues,
    seeds,
)

2024-06-03 21:34:01,515 - INFO - Input files: Arenavirus_GPC_Antibody_List_virus.csv, Arenavirus_GPC_Antibody_List_AB.csv
2024-06-03 21:34:01,516 - INFO - Glycan residues: MAN
2024-06-03 21:34:01,516 - INFO - Seeds: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
2024-06-03 21:34:01,519 - INFO - Generated JSON file: output/cropped_sequence_with_foldon/LASV_Josiah_CR1-07.json
2024-06-03 21:34:01,520 - INFO - Generated JSON file: output/cropped_sequence_with_foldon/LASV_Josiah_CR1-28.json
2024-06-03 21:34:01,521 - INFO - Generated JSON file: output/cropped_sequence_with_foldon/LASV_Josiah_12_1F.json
2024-06-03 21:34:01,521 - INFO - Generated JSON file: output/cropped_sequence_with_foldon/LASV_Josiah_18_5C.json
2024-06-03 21:34:01,522 - INFO - Generated JSON file: output/cropped_sequence_with_foldon/LASV_Josiah_25_10C.json
2024-06-03 21:34:01,522 - INFO - Generated JSON file: output/cropped_sequence_with_foldon/LASV_Josiah_2_9D.json
2024-06-03 21:34:01,522 - INFO - Generat