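## Summary

Prepare the inputs for the CAGI6 HMBS challenge: sync the challenge files from Synapse, download the UniProt sequence and the AlphaFold structure for `P08397`, build an MMseqs2 alignment, check that the sequence, structure, alignment and listed mutations agree with each other, and write the amino acid substitutions to score (stop and synonymous variants excluded) to `results-to-fill.parquet`.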

---

## Imports

In [1]:
import contextlib
import os
import tempfile
import urllib.request
from datetime import datetime
from pathlib import Path

import dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
from kmbio import PDB
from kmtools import structure_tools
from tqdm.auto import tqdm



## Parameters

In [2]:
# Directory holding every input and output of this notebook. The relative path is
# resolved against the current working directory of the kernel.
NOTEBOOK_DIR = Path("30_cagi6_hmbs").resolve()
# Create the directory on first run; an existing directory is left as is.
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/30_cagi6_hmbs')

In [3]:
# UniProt accession of the protein analysed here; it names the cached sequence,
# structure and alignment files and is interpolated into the download URLs.
UNIPROT_ID = "P08397"

UNIPROT_ID

'P08397'

## Download data

In [None]:
def download(url, filename):
    """Download ``url`` to ``filename`` without leaving a partial file at ``filename``.

    The payload is first written to a sibling ``<filename>.part`` file and only
    renamed to ``filename`` once the transfer has finished. Callers in this
    notebook use ``filename.is_file()`` as their cache check, so a truncated file
    left behind by an interrupted download would otherwise be mistaken for a
    complete one.

    Args:
        url: Address to fetch; anything accepted by ``urllib.request.urlretrieve``.
        filename: Destination path (``str`` or ``pathlib.Path``).

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (for example
        ``urllib.error.URLError``). In that case ``filename`` is left untouched.
    """
    destination = Path(filename)
    partial = destination.with_name(f"{destination.name}.part")
    try:
        urllib.request.urlretrieve(url, partial)
        # Same directory, so the rename replaces the destination in one step.
        os.replace(partial, destination)
    finally:
        # No-op after a successful rename; removes the leftover after a failure.
        with contextlib.suppress(FileNotFoundError):
            os.remove(partial)

In [None]:
def load_sequence(sequence_file):
    """Return the residues stored in a FASTA file as a single string.

    Lines that start with ``>`` are treated as headers and skipped; every other
    line is stripped of surrounding whitespace and the pieces are concatenated
    in file order.

    Args:
        sequence_file: Object with an ``open`` method, such as a ``pathlib.Path``.

    Returns:
        The concatenated sequence; an empty string if there are no sequence lines.
    """
    with sequence_file.open("rt") as handle:
        fasta_text = handle.read()
    return "".join(
        row.strip() for row in fasta_text.split("\n") if not row.startswith(">")
    )

In [None]:
# Sync the Synapse folder only when `validation.py` is not already in NOTEBOOK_DIR,
# so that re-running the notebook does not log in or download again.
# NOTE(review): `validation.py` is presumably one of the synced files — confirm.
if not NOTEBOOK_DIR.joinpath("validation.py").is_file():
    # Imported inside the branch, so these packages are only needed when a sync runs.
    import synapseclient
    import synapseutils

    # Credentials come from environment variables (optionally populated from
    # ../.env); a missing variable raises KeyError.
    dotenv.load_dotenv("../.env")
    syn = synapseclient.Synapse()
    syn.login(os.environ["SYNAPSE_USERNAME"], os.environ["SYNAPSE_PASSWORD"])
    _ = synapseutils.syncFromSynapse(syn, "syn25823964", path=NOTEBOOK_DIR)

In [None]:
# Cached FASTA record for the target protein; fetched from UniProt on first run only.
sequence_file = NOTEBOOK_DIR / f"{UNIPROT_ID}.fasta"

if not sequence_file.is_file():
    download(
        url=f"https://www.uniprot.org/uniprot/{UNIPROT_ID}.fasta",
        filename=sequence_file,
    )

# Header lines are dropped by `load_sequence`; `sequence` is the bare residue string.
sequence = load_sequence(sequence_file)

In [None]:
# Cached AlphaFold model for the target protein; fetched on first run only.
structure_file = NOTEBOOK_DIR / f"{UNIPROT_ID}.pdb"

if not structure_file.is_file():
    download(
        url=f"https://alphafold.ebi.ac.uk/files/AF-{UNIPROT_ID}-F1-model_v1.pdb",
        filename=structure_file,
    )

# Keep the whole PDB file in memory as text.
structure_blob = structure_file.read_text()

In [None]:
# NOTE: despite the ".a3m.gz" suffix, this file is written and read with pyarrow's
# Parquet functions below.
alignment_file = NOTEBOOK_DIR.joinpath(f"{UNIPROT_ID}.a3m.gz")

# Build the alignment only when no cached copy exists.
if not alignment_file.is_file():
    # Imported inside the branch, so elaspic2 is only needed when the alignment is rebuilt.
    from elaspic2.plugins.alphafold2 import mmseqs2

    dotenv.load_dotenv("../.env")
    with mmseqs2.api_gateway(mmseqs2.MMSEQS2_HOST_URL) as gateway:
        alignment = mmseqs2.run_mmseqs2(sequence, gateway=gateway)
        # Refuse to cache a result whose second entry is not the query sequence.
        # NOTE(review): presumably entry 0 is the query's header line — confirm.
        assert alignment[1] == f"{sequence}\n"
        # Stored as a single-column table, one alignment entry per row.
        alignment_df = pd.DataFrame({"alignment": alignment})
        pq.write_table(pa.Table.from_pandas(alignment_df, preserve_index=False), alignment_file)

# Always reload from disk so cached and freshly built runs go through the same path.
alignment = pq.read_table(alignment_file).to_pandas()["alignment"].values.tolist()

## Load data

In [None]:
# Residue-code lookup used by `format_mutation`: kmtools' AAA_DICT extended with
# "TER" -> "*".
# NOTE(review): AAA_DICT presumably maps three-letter codes to one-letter codes — confirm.
aaa_dict = {**structure_tools.constants.AAA_DICT, "TER": "*"}


def format_mutation(mutation):
    """Convert a parsed ``(wt, pos, mut)`` triple into a compact mutation string.

    Both residue codes are upper-cased and translated through ``aaa_dict``. A
    mutant code of ``"="`` means "same as wild type" and yields the translated
    wild-type residue again.

    Args:
        mutation: Iterable of exactly three items: wild-type residue code,
            position (anything ``int`` accepts) and mutant residue code or ``"="``.

    Returns:
        ``f"{wt}{pos}{mut}"`` built from the translated residues and the integer
        position.

    Raises:
        KeyError: If a residue code is not a key of ``aaa_dict``.
        ValueError: If the position cannot be converted to ``int``.
    """
    wt_code, position, mut_code = mutation

    wt_letter = aaa_dict[wt_code.upper()]
    if mut_code == "=":
        mut_letter = wt_letter
    else:
        mut_letter = aaa_dict[mut_code.upper()]

    return f"{wt_letter}{int(position)}{mut_letter}"

In [None]:
def mutation_matches_sequence(mutation, sequence):
    """Check that the wild-type residue of ``mutation`` occurs in ``sequence``.

    Args:
        mutation: String of the form ``<wt><pos><mut>`` where ``<wt>`` and
            ``<mut>`` are single characters and ``<pos>`` is a 1-based position.
        sequence: Sequence to check the mutation against.

    Returns:
        ``True`` if ``<pos>`` lies inside ``sequence`` and the residue at that
        position equals ``<wt>``; ``False`` otherwise.

    Raises:
        ValueError: If the position part of ``mutation`` is not an integer.
    """
    wt, pos = mutation[0], int(mutation[1:-1])
    # Check the bounds explicitly: a position of 0 or below would otherwise wrap
    # round through negative indexing and could report a false match, while a
    # position past the end would raise IndexError instead of answering "no".
    if not 1 <= pos <= len(sequence):
        return False
    return sequence[pos - 1] == wt

In [None]:
def sequence_matches_structure(sequence, structure_blob):
    """Check that a structure given as PDB text has exactly the sequence ``sequence``.

    The text is written to a temporary ``.pdb`` file so that it can be parsed
    with ``PDB.load``. The sequence is then read from ``structure[0]["A"]``
    (presumably model 0, chain A — confirm against kmbio) with unknown residues
    replaced by an empty marker.

    Args:
        sequence: Expected sequence.
        structure_blob: Contents of a PDB file as text.

    Returns:
        ``True`` if the sequence extracted from the structure equals ``sequence``.
    """
    # The temporary file only has to live until the structure has been parsed.
    with tempfile.NamedTemporaryFile(suffix=".pdb") as pdb_file:
        Path(pdb_file.name).write_text(structure_blob)
        parsed_structure = PDB.load(pdb_file.name)

    chain_a = parsed_structure[0]["A"]
    chain_sequence = structure_tools.get_chain_sequence(
        chain_a, if_unknown="replace", unknown_residue_marker=""
    )
    return sequence == chain_sequence

In [None]:
# Tab-separated submission template that lists the variants to score.
result_template_df = pd.read_csv(
    NOTEBOOK_DIR / "HMBS_variant_template.tsv",
    sep="\t",
)

# Preview the first two rows, then show the number of rows as the cell output.
display(result_template_df.head(2))
result_template_df.shape[0]

In [None]:
# Work on a copy so that re-running this cell always starts from the untouched template.
result_df = result_template_df.copy()

# Split each `aa_substitution` into (wild type, position, mutant). The pattern is a
# raw string with the dot escaped, so only a literal "p." prefix is accepted
# (an unescaped "." would match any character).
parsed_substitutions = (
    result_df["aa_substitution"].str.findall(r"p\.([a-zA-Z]+)([1-9]+[0-9]*)([a-zA-Z=]+)").str[0]
)

# Fail early and name the offending values; otherwise an unparsable entry shows up
# as an opaque unpacking error inside `format_mutation`.
unparsed_substitutions = result_df.loc[parsed_substitutions.isnull(), "aa_substitution"]
assert unparsed_substitutions.empty, (
    f"{len(unparsed_substitutions)} substitution(s) could not be parsed, "
    f"e.g. {unparsed_substitutions.tolist()[:10]}"
)

result_df["mut"] = [format_mutation(mut) for mut in parsed_substitutions]

# Drop mutations that involve "*" (the code `aaa_dict` assigns to "TER") and
# mutations whose first and last characters are equal (mutant same as wild type).
involves_stop = result_df["mut"].str.contains("*", regex=False)
is_unchanged = result_df["mut"].str[0] == result_df["mut"].str[-1]
result_df = result_df[~involves_stop & ~is_unchanged]

display(result_df.head(2))
len(result_df)

In [None]:
# Every mutation's wild-type residue must agree with the downloaded sequence.
# Collect the offenders first so that a failure reports which mutations are wrong.
mismatched_mutations = [
    mut for mut in result_df["mut"] if not mutation_matches_sequence(mut, sequence)
]
assert not mismatched_mutations, (
    f"{len(mismatched_mutations)} mutation(s) do not match the sequence, "
    f"e.g. {mismatched_mutations[:10]}"
)

In [None]:
# The sequence read from the downloaded structure must equal the downloaded FASTA sequence.
assert sequence_matches_structure(sequence, structure_blob)

In [None]:
# Re-check the alignment after loading it from disk: its second entry must be the
# query sequence followed by a newline.
assert alignment[1] == f"{sequence}\n"

## Write results template

In [None]:
# Persist the filtered mutation table (without the pandas index) as Parquet.
results_to_fill_file = NOTEBOOK_DIR / "results-to-fill.parquet"

pq.write_table(
    pa.Table.from_pandas(result_df, preserve_index=False),
    where=results_to_fill_file,
)