# Predicting the simultaneous effect of multiple mutations

## Summary

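Predict the effect of MTHFR variants, in both the wild-type (WT) and the A222V background, for the CAGI6 MTHFR challenge using ELASPIC2.

Challenge description: <https://genomeinterpretation.org/cagi6-mthfr.html>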

## Google Colab

In [None]:
# Show the GPU (if any) attached to this runtime.
!nvidia-smi

Sat Jun 19 19:29:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Flag whether the `google.colab` package can be imported; the Colab-only
# setup cells and the Google Drive copy steps are gated on this flag.
try:
    import google.colab
except ImportError:
    GOOGLE_COLAB = False
else:
    GOOGLE_COLAB = True

GOOGLE_COLAB

True

In [None]:
# On Colab, mount Google Drive at /gdrive; prediction files are copied to
# /gdrive/MyDrive/CAGI6/MTHFR at the end of each prediction section.
if GOOGLE_COLAB:
    from google.colab import drive
    drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Colab only: install pinned builds of PyTorch (CUDA 11.1 wheels), the PyTorch
# Geometric stack, transformers, and the kmbio / kmtools / proteinsolver packages.
# NOTE(review): elaspic2 is installed from the repository's default branch with no
# tag or commit pinned, unlike every other package here — consider pinning it so
# that reruns use the same model code.
if GOOGLE_COLAB:
    !pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
    !pip install -f https://pytorch-geometric.com/whl/torch-1.8.0+cu111.html --default-timeout=600 \
        "transformers==3.3.1" \
        "torch-scatter==2.0.6" \
        "torch-sparse==0.6.9" \
        "torch-cluster==1.5.9" \
        "torch-spline-conv==1.2.1" \
        "torch-geometric==1.6.1" \
        "https://gitlab.com/kimlab/kmbio/-/archive/v2.1.0/kmbio-v2.1.0.zip" \
        "https://gitlab.com/kimlab/kmtools/-/archive/v0.2.8/kmtools-v0.2.8.zip" \
        "https://gitlab.com/ostrokach/proteinsolver/-/archive/v0.1.25/proteinsolver-v0.1.25.zip" \
        "git+https://gitlab.com/elaspic/elaspic2.git"

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.8.0+cu111.html
Collecting https://gitlab.com/kimlab/kmbio/-/archive/v2.1.0/kmbio-v2.1.0.zip
  Using cached https://gitlab.com/kimlab/kmbio/-/archive/v2.1.0/kmbio-v2.1.0.zip
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting https://gitlab.com/kimlab/kmtools/-/archive/v0.2.8/kmtools-v0.2.8.zip
  Using cached https://gitlab.com/kimlab/kmtools/-/archive/v0.2.8/kmtools-v0.2.8.zip
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting https://gitlab.com/ostrokach/proteinsolver/-/archive/v0.1.25/proteinsolver-v0.1.25.zip
  Using cached https://gitlab.com/ostrokach/proteinsolver/-/archive/v0.1.25/proteinsolver-v0.1.25.zip
  Instal

## Imports

In [None]:
import io
import shutil
import tempfile
import urllib
import urllib.request
import zipfile
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import elaspic2 as el2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import proteinsolver
import requests
import torch
from kmbio import PDB
from kmtools import structure_tools
from scipy import stats
from tqdm.notebook import tqdm

## Parameters

In [None]:
# The variant tables are split into TASK_COUNT chunks and this run processes
# chunk number TASK_ID (1-based; see "Select variants for a particular chunk").
TASK_ID = 6
TASK_COUNT = 6

In [None]:
# Working directory for downloaded inputs and generated prediction files.
# It is resolved against the current working directory and created if missing.
NOTEBOOK_DIR = Path("40_cagi6_mthfr_submission").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/content/20_cagi6_mthfr')

In [None]:
# Run the model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [None]:
# Run label: the current local time truncated to the hour. It is embedded in
# the names of the output files, so runs started in different hours write to
# different files.
version = datetime.now().isoformat(timespec="hours")

version

'2021-06-19T19'

## Helper functions

In [None]:
def download(url, filename):
    """Fetch the resource at `url` and store it in the local file `filename`."""
    urllib.request.urlretrieve(url, filename=filename)


def download_and_unzip(url, output_dir):
    """Download the zip archive at `url` and extract all of its members into `output_dir`.

    The archive is downloaded into a temporary directory that is removed once
    extraction has finished, so no temporary file is left behind (calling
    `urlretrieve` without a target file name leaves its temporary file on disk).

    Args:
        url: Location of the zip archive.
        output_dir: Directory into which the archive members are extracted.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        zip_path = Path(tmp_dir).joinpath("archive.zip")
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, "r") as f:
            f.extractall(output_dir)


def download_sequence(uniparc_id):
    """Download a FASTA record from uniprot.org and return its sequence as one string.

    Lines starting with ">" (FASTA headers) are dropped; every other line is
    stripped of surrounding whitespace and the pieces are concatenated.
    """
    url = f'https://www.uniprot.org/uniprot/{uniparc_id}.fasta'
    with urllib.request.urlopen(url) as response:
        fasta_text = response.read().decode("utf-8")
    return "".join(
        line.strip() for line in fasta_text.split("\n") if not line.startswith(">")
    )

In [None]:
def structure_to_blob(structure):
    """Return `structure` serialized as text.

    The structure is written with `PDB.save` to a temporary file that has a
    ``.pdb`` suffix; the file is read back as text and removed on return.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdb") as tmp_file:
        pdb_path = tmp_file.name
        PDB.save(structure, pdb_path)
        with open(pdb_path, "rt") as handle:
            return handle.read()

In [None]:
def sequence_matches_structure(sequence, structure_blob):
    """Return whether `sequence` equals the sequence of chain "A" in `structure_blob`.

    `structure_blob` is the text of a structure file. It is written to a
    temporary ``.pdb`` file, loaded with `PDB.load`, and the sequence of chain
    "A" of the first model is extracted with an empty marker for unknown residues.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdb") as tmp_file:
        Path(tmp_file.name).write_text(structure_blob)
        structure = PDB.load(tmp_file.name)

    chain_a = structure[0]["A"]
    chain_sequence = structure_tools.get_chain_sequence(
        chain_a, if_unknown="replace", unknown_residue_marker=""
    )
    return sequence == chain_sequence

In [None]:
def apply_mutations(protein_features, mutation):
    """Return a copy of `protein_features` with one or more mutations applied.

    Args:
        protein_features: Features to mutate. The object is deep-copied first,
            so the caller's instance is left unmodified.
        mutation: Either a single mutation string (e.g. "A222V") or an iterable
            of such strings, which are applied in order.

    Returns:
        The mutated copy of `protein_features`.
    """
    # A bare string is one mutation, not an iterable of one-character mutations.
    mutations = [mutation] if isinstance(mutation, str) else list(mutation)

    # `apply_mutation` modifies its argument in place, so work on a private copy.
    protein_features = deepcopy(protein_features)
    for single_mutation in mutations:
        protein_features = apply_mutation(protein_features, single_mutation)

    return protein_features


def apply_mutation(protein_features, mutation):
    """Apply a single point mutation such as "A222V" to `protein_features`.

    `mutation` is parsed as <wild-type residue><1-based position><mutant residue>.
    The ProtBert sequence is swapped in through `_replace`, whereas the
    ProteinSolver residue array `proteinsolver_data.x` is edited in place. Both
    representations are asserted to hold the wild-type residue at that position.

    Warning: This function mutates `protein_features`!
    """
    wt_residue, mut_residue = mutation[0], mutation[-1]
    idx = int(mutation[1:-1]) - 1

    # Mutate ProtBert features
    residues = list(protein_features.protbert_data.sequence)
    assert residues[idx] == wt_residue
    residues[idx] = mut_residue
    mutated_protbert = protein_features.protbert_data._replace(sequence="".join(residues))
    protein_features = protein_features._replace(protbert_data=mutated_protbert)

    # Mutate ProteinSolver features
    residue_codes = protein_features.proteinsolver_data.x
    current_code = residue_codes[idx]
    wt_code = proteinsolver.utils.AMINO_ACIDS.index(wt_residue)
    assert (current_code == wt_code).item()
    residue_codes[idx] = proteinsolver.utils.AMINO_ACIDS.index(mut_residue)

    return protein_features

## Workspace

### Download data

In [None]:
# Challenge data: variant lists, score distributions, submission template and
# the validation script, all placed directly inside NOTEBOOK_DIR.
download_and_unzip("https://genomeinterpretation.org/download/mthfrvariants.zip", NOTEBOOK_DIR)
download_and_unzip("https://genomeinterpretation.org/download/distributions.zip", NOTEBOOK_DIR)
download_and_unzip("https://genomeinterpretation.org/download/mthfrtemplate.zip", NOTEBOOK_DIR)
download("https://genomeinterpretation.org/download/mthfrvalidation.py", NOTEBOOK_DIR.joinpath("mthfrvalidation.py"))

# I-Tasser model for WT (the "-relaxed" file on the server is saved locally as structure-wt.pdb)
download("http://cagi6.data.proteinsolver.org/mthfr/structure-wt-relaxed.pdb", NOTEBOOK_DIR.joinpath("structure-wt.pdb"))
# Structural model for the A222V variant, saved locally as structure-a222v.pdb.
# NOTE(review): it is unclear whether the I-TASSER job below produced the WT model,
# the A222V model, or both — confirm.
# https://zhanglab.dcmb.med.umich.edu/I-TASSER/output/S624008/
download("http://cagi6.data.proteinsolver.org/mthfr/structure-a222v-relaxed.pdb", NOTEBOOK_DIR.joinpath("structure-a222v.pdb"))

In [None]:
# List the downloaded inputs (and any previously written outputs) in NOTEBOOK_DIR.
!ls {NOTEBOOK_DIR}

cataAV_variants.txt  reguAV_variants.txt
cataWT_variants.txt  reguWT_variants.txt
distributions.csv    structure-a222v.pdb
__MACOSX	     structure-wt.pdb
mthfrtemplate	     variants-a222v-2021-06-19T18-0001-of-0006.csv
mthfrvalidation.py   variants-wt-2021-06-19T18-0001-of-0006.csv


### Load protein sequence

In [None]:
# Fetch the protein sequence for UniProt accession P42898; it is used below as
# the wild-type (WT) sequence.
sequence_wt = download_sequence("P42898")

sequence_wt

'MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGADFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP'

In [None]:
# Derive the A222V sequence from the WT sequence: check that position 222
# (1-based) holds Ala, then replace it with Val.
lst = list(sequence_wt)
assert lst[222 - 1] == "A"
lst[222 - 1] = "V"

sequence_a222v = "".join(lst)

sequence_a222v

'MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGVDFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP'

### Load protein structure

In [None]:
# NOTE(review): these two indices are not referenced anywhere else in the visible
# notebook and their meaning is not evident from the code — confirm or remove.
structure_start_idx = 39
structure_end_idx = 644

In [None]:
# Load the downloaded WT structural model as-is.
structure_wt_ref = PDB.load(NOTEBOOK_DIR.joinpath("structure-wt.pdb"))

In [None]:
# Keep only the rows of the first model and first chain, then rebuild a
# structure from that subset.
df = structure_wt_ref.to_dataframe()
df = df[(df["model_idx"] == 0) & (df["chain_idx"] == 0)]
structure_wt = PDB.Structure.from_dataframe(df)

In [None]:
# Sanity check: chain "A" of the WT model must have exactly the WT sequence.
assert sequence_wt == structure_tools.get_chain_sequence(
    structure_wt[0]["A"], if_unknown="replace", unknown_residue_marker=""
)

In [None]:
# Load the downloaded A222V structural model as-is.
structure_a222v_ref = PDB.load(NOTEBOOK_DIR.joinpath("structure-a222v.pdb"))

In [None]:
# Keep only the rows of the first model and first chain, then rebuild a
# structure from that subset (same filtering as for the WT model).
df = structure_a222v_ref.to_dataframe()
df = df[(df["model_idx"] == 0) & (df["chain_idx"] == 0)]
structure_a222v = PDB.Structure.from_dataframe(df)

In [None]:
# Sanity check: chain "A" of the A222V model must have exactly the A222V sequence.
assert sequence_a222v == structure_tools.get_chain_sequence(
    structure_a222v[0]["A"], if_unknown="replace", unknown_residue_marker=""
)

### Load challenge data

In [None]:
# Read the two WT-background variant lists (the "cata" and "regu" files) as
# header-less, single-column tables and stack them into one frame with a fresh
# 0..n-1 index. The only column, `mut_ref`, holds the variant as given in the file.
dfs = []
for filename in ["cataWT_variants.txt", "reguWT_variants.txt"]:
    df = pd.read_csv(NOTEBOOK_DIR.joinpath(filename), names=["mut_ref"])
    dfs.append(df)
variants_wt_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Split variants such as "p.Val2Asp" into wild-type residue, position and
# mutant residue. A raw string is used so that `\.` reaches the regex engine
# without relying on Python passing an invalid string escape through unchanged.
mut_parts = variants_wt_df["mut_ref"].str.extract(r"^p\.([a-zA-Z]+)([0-9]+)([a-zA-Z]+)")
# Fail early, with a readable message, if any variant does not follow the pattern.
assert mut_parts.notna().all().all(), "Some `mut_ref` values could not be parsed"
variants_wt_df["aa_wt"] = mut_parts[0]
variants_wt_df["aa_pos"] = mut_parts[1].astype(int)
variants_wt_df["aa_mut"] = mut_parts[2]

# Mutation with three-letter residue codes, e.g. "Val2Asp".
variants_wt_df["mut_3char"] = (
    variants_wt_df["aa_wt"]
    + variants_wt_df["aa_pos"].astype(str)
    + variants_wt_df["aa_mut"]
)

# Mutation with one-letter residue codes, e.g. "V2D"; three-letter codes are
# upper-cased and translated through `structure_tools.AAA_DICT`.
variants_wt_df["mut"] = (
    variants_wt_df["aa_wt"].str.upper().map(structure_tools.AAA_DICT)
    + variants_wt_df["aa_pos"].astype(str)
    + variants_wt_df["aa_mut"].str.upper().map(structure_tools.AAA_DICT)
)

variants_wt_df = variants_wt_df.sort_values(["aa_pos", "mut"])
variants_wt_df.head()

Unnamed: 0,mut_ref,aa_wt,aa_pos,aa_mut,mut_3char,mut
2131,p.Val2Asp,Val,2,Asp,Val2Asp,V2D
2137,p.Val2Phe,Val,2,Phe,Val2Phe,V2F
2132,p.Val2His,Val,2,His,Val2His,V2H
2133,p.Val2Ile,Val,2,Ile,Val2Ile,V2I
2135,p.Val2Lys,Val,2,Lys,Val2Lys,V2K


In [None]:
# Read the two A222V-background variant lists (the "cata" and "regu" files) as
# header-less, single-column tables and stack them into one frame with a fresh
# 0..n-1 index. The only column, `mut_ref`, holds the variant as given in the file.
dfs = []
for filename in ["cataAV_variants.txt", "reguAV_variants.txt"]:
    df = pd.read_csv(NOTEBOOK_DIR.joinpath(filename), names=["mut_ref"])
    dfs.append(df)
variants_a222v_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Split variants such as "p.[Val2Asp;Ala222Val]" into wild-type residue,
# position and mutant residue of the FIRST listed substitution only; the
# remainder of the bracket is not captured by the pattern. A raw string is used
# so that `\.` and `\[` reach the regex engine without relying on Python passing
# invalid string escapes through unchanged.
mut_parts = variants_a222v_df["mut_ref"].str.extract(r"^p\.\[([a-zA-Z]+)([0-9]+)([a-zA-Z]+)")
# Fail early, with a readable message, if any variant does not follow the pattern.
assert mut_parts.notna().all().all(), "Some `mut_ref` values could not be parsed"
variants_a222v_df["aa_wt"] = mut_parts[0]
variants_a222v_df["aa_pos"] = mut_parts[1].astype(int)
variants_a222v_df["aa_mut"] = mut_parts[2]

# Mutation with three-letter residue codes, e.g. "Val2Asp".
variants_a222v_df["mut_3char"] = (
    variants_a222v_df["aa_wt"]
    + variants_a222v_df["aa_pos"].astype(str)
    + variants_a222v_df["aa_mut"]
)

# Mutation with one-letter residue codes, e.g. "V2D"; three-letter codes are
# upper-cased and translated through `structure_tools.AAA_DICT`.
variants_a222v_df["mut"] = (
    variants_a222v_df["aa_wt"].str.upper().map(structure_tools.AAA_DICT)
    + variants_a222v_df["aa_pos"].astype(str)
    + variants_a222v_df["aa_mut"].str.upper().map(structure_tools.AAA_DICT)
)

variants_a222v_df = variants_a222v_df.sort_values(["aa_pos", "mut"])
variants_a222v_df.head()

Unnamed: 0,mut_ref,aa_wt,aa_pos,aa_mut,mut_3char,mut
2131,p.[Val2Asp;Ala222Val],Val,2,Asp,Val2Asp,V2D
2137,p.[Val2Phe;Ala222Val],Val,2,Phe,Val2Phe,V2F
2132,p.[Val2His;Ala222Val],Val,2,His,Val2His,V2H
2133,p.[Val2Ile;Ala222Val],Val,2,Ile,Val2Ile,V2I
2135,p.[Val2Lys;Ala222Val],Val,2,Lys,Val2Lys,V2K


### Select variants for a particular chunk

In [None]:
# Keep only the TASK_ID-th (1-based) of TASK_COUNT consecutive chunks of the
# sorted WT variant table.
# NOTE(review): the 2000-row threshold is a magic number; presumably it keeps a
# re-run of this cell from chunking an already chunked table — confirm. Tables
# with 2000 rows or fewer are not chunked at all, so every task processes them fully.
if len(variants_wt_df) > 2000:
    chunk_size = len(variants_wt_df) // TASK_COUNT + 1
    # `.copy()` makes the chunk an independent frame, so that adding the score
    # columns to it later does not trigger pandas' SettingWithCopyWarning.
    variants_wt_df = variants_wt_df.iloc[chunk_size * (TASK_ID - 1):chunk_size * TASK_ID].copy()

len(variants_wt_df)

1445

In [None]:
# Keep only the TASK_ID-th (1-based) of TASK_COUNT consecutive chunks of the
# sorted A222V variant table.
# NOTE(review): the 2000-row threshold is a magic number; presumably it keeps a
# re-run of this cell from chunking an already chunked table — confirm. Tables
# with 2000 rows or fewer are not chunked at all, so every task processes them fully.
if len(variants_a222v_df) > 2000:
    chunk_size = len(variants_a222v_df) // TASK_COUNT + 1
    # `.copy()` makes the chunk an independent frame, so that adding the score
    # columns to it later does not trigger pandas' SettingWithCopyWarning.
    variants_a222v_df = variants_a222v_df.iloc[chunk_size * (TASK_ID - 1):chunk_size * TASK_ID].copy()

len(variants_a222v_df)

1445

### Instantiate the `ELASPIC2` model and featurize protein



In [None]:
# Create the ELASPIC2 model on the selected device (GPU when available).
model = el2.ELASPIC2(device=device)



### Make predictions for WT protein

In [None]:
# Write the WT structure to a temporary .pdb file whose path is passed to
# `model.build` below. The file object is kept in a variable because a
# NamedTemporaryFile is deleted as soon as it is closed.
structure_file_obj = tempfile.NamedTemporaryFile(suffix=".pdb")
PDB.save(structure_wt, structure_file_obj.name)

In [None]:
# Featurize the WT protein from its structure file and sequence. No ligand
# sequence is given, and HETATM records are removed from the structure.
protein_features_wt = model.build(
    structure_file=structure_file_obj.name,
    protein_sequence=sequence_wt,
    ligand_sequence=None,
    remove_hetatms=True,
)

protein_features_wt

ELASPIC2Data(is_interface=False, protbert_data=ProtBertData(sequence='MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGADFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP'), proteinsolver_data=Data(edge_attr=[36616, 2], edge_index=[2, 36616], x=[656]))

In [None]:
# Analyze every selected WT-background variant against the WT features,
# showing a progress bar; results are collected in the order of the table.
mutation_features_wt = [
    model.analyze_mutation(mut, protein_features_wt)
    for mut in tqdm(variants_wt_df["mut"], total=len(variants_wt_df))
]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1445.0), HTML(value='')))




In [None]:
# Per-variant score differences (mutant minus wild-type) taken from the
# ProtBert and ProteinSolver entries of each mutation's feature dict.
variants_wt_df["protbert_score"] = [d["protbert_core_score_mut"] - d["protbert_core_score_wt"] for d in mutation_features_wt]
variants_wt_df["proteinsolver_score"] = [d["proteinsolver_core_score_mut"] - d["proteinsolver_core_score_wt"] for d in mutation_features_wt]
# Higher values mean more stable (opposite of ΔΔG)
variants_wt_df["el2_score"] = model.predict_mutation_effect(mutation_features_wt).tolist()

In [None]:
# Save this task's WT predictions in NOTEBOOK_DIR; the file name carries the run
# version and the chunk number.
# NOTE(review): the file has a .csv extension but is written tab-separated.
output_file = NOTEBOOK_DIR / f"variants-wt-{version}-{TASK_ID:04d}-of-{TASK_COUNT:04d}.csv"
variants_wt_df.to_csv(output_file, sep="\t", index=False)

# On Colab, also keep a copy on Google Drive so that it outlives the runtime.
if GOOGLE_COLAB:
    output_gdrive_file = Path(f"/gdrive/MyDrive/CAGI6/MTHFR/{output_file.name}")
    output_gdrive_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(output_file, output_gdrive_file)

### Make predictions for A222V protein

In [None]:
# Write the A222V structure to a temporary .pdb file whose path is passed to
# `model.build` below. The file object is kept in a variable because a
# NamedTemporaryFile is deleted as soon as it is closed.
structure_file_obj = tempfile.NamedTemporaryFile(suffix=".pdb")
PDB.save(structure_a222v, structure_file_obj.name)

In [None]:
# Featurize the A222V protein from its structure file and sequence. No ligand
# sequence is given, and HETATM records are removed from the structure.
protein_features_a222v = model.build(
    structure_file=structure_file_obj.name,
    protein_sequence=sequence_a222v,
    ligand_sequence=None,
    remove_hetatms=True,
)

protein_features_a222v

ELASPIC2Data(is_interface=False, protbert_data=ProtBertData(sequence='MVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLREKMRRRLESGDKWFSLEFFPPRTAEGAVNLISRFDRMAAGGPLYIDVTWHPAGDPGSDKETSSMMIASTAVNYCGLETILHMTCCRQRLEEITGHLHKAKQLGLKNIMALRGDPIGDQWEEEEGGFNYAVDLVKHIRSEFGDYFDICVAGYPKGHPEAGSFEADLKHLKEKVSAGVDFIITQLFFEADTFFRFVKACTDMGITCPIVPGIFPIQGYHSLRQLVKLSKLEVPQEIKDVIEPIKDNDAAIRNYGIELAVSLCQELLASGLVPGLHFYTLNREMATTEVLKRLGMWTEDPRRPLPWALSAHPKRREEDVRPIFWASRPKSYIYRTQEWDEFPNGRWGNSSSPAFGELKDYYLFYLKSKSPKEELLKMWGEELTSEESVFEVFVLYLSGEPNRNGHKVTCLPWNDEPLAAETSLLKEELLRVNRQGILTINSQPNINGKPSSDPIVGWGPSGGYVFQKAYLEFFTSRETAEALLQVLKKYELRVNYHLVNVKGENITNAPELQPNAVTWGIFPGREIIQPTVVDPVSFMFWKDEAFALWIERWGKLYEEESPSRTIIQYIHDNYFLVNLVDNDFPLDNCLWQVVEDTLELLNRPTQNARETEAP'), proteinsolver_data=Data(edge_attr=[36620, 2], edge_index=[2, 36620], x=[656]))

In [None]:
# Analyze every selected A222V-background variant against the A222V features,
# showing a progress bar; results are collected in the order of the table.
mutation_features_a222v = [
    model.analyze_mutation(mut, protein_features_a222v)
    for mut in tqdm(variants_a222v_df["mut"], total=len(variants_a222v_df))
]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1445.0), HTML(value='')))




In [None]:
# Per-variant score differences (mutant minus wild-type) taken from the
# ProtBert and ProteinSolver entries of each mutation's feature dict.
variants_a222v_df["protbert_score"] = [d["protbert_core_score_mut"] - d["protbert_core_score_wt"] for d in mutation_features_a222v]
variants_a222v_df["proteinsolver_score"] = [d["proteinsolver_core_score_mut"] - d["proteinsolver_core_score_wt"] for d in mutation_features_a222v]
# Higher values mean more stable (opposite of ΔΔG)
variants_a222v_df["el2_score"] = model.predict_mutation_effect(mutation_features_a222v).tolist()

In [None]:
# Save this task's A222V predictions in NOTEBOOK_DIR; the file name carries the
# run version and the chunk number.
# NOTE(review): the file has a .csv extension but is written tab-separated.
output_file = NOTEBOOK_DIR / f"variants-a222v-{version}-{TASK_ID:04d}-of-{TASK_COUNT:04d}.csv"
variants_a222v_df.to_csv(output_file, sep="\t", index=False)

# On Colab, also keep a copy on Google Drive so that it outlives the runtime.
if GOOGLE_COLAB:
    output_gdrive_file = Path(f"/gdrive/MyDrive/CAGI6/MTHFR/{output_file.name}")
    output_gdrive_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(output_file, output_gdrive_file)