## Summary

### HGVS

```bash
docker run -d --name uta_20180821 -p 15032:5432 --rm biocommons/uta:uta_20180821
```

### Note

In row `1098`, the mutations do not match the protein sequence. **TODO:** investigate the cause and fix it.

### Other resources

- https://mutalyzer.nl/ — Used to map HGVS.g to HGVS.c.

---

## Imports

In [1]:
%env HGVS_SEQREPO_DIR=/home/kimlab5/strokach/data/seqrepo/2021-01-29/
%env UTA_DB_URL=postgresql://anonymous@localhost:15032/uta/uta_20180821
%env GOOGLE_APPLICATION_CREDENTIALS=/home/kimlab5/strokach/workspace/ostrokach-data-da3b89497213.json

env: HGVS_SEQREPO_DIR=/home/kimlab5/strokach/data/seqrepo/2021-01-29/
env: UTA_DB_URL=postgresql://anonymous@localhost:15032/uta/uta_20180821
env: GOOGLE_APPLICATION_CREDENTIALS=/home/kimlab5/strokach/workspace/ostrokach-data-da3b89497213.json


In [2]:
import concurrent.futures
import contextlib
import gzip
import itertools
import logging
import os
import pickle
import re
import socket
import subprocess
import tempfile
import time
import urllib.request
from pathlib import Path

import dotenv
import more_itertools
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import synapseclient
import synapseutils
from kmbio import PDB
from kmtools import structure_tools
from tqdm.auto import tqdm



## Parameters

In [3]:
NOTEBOOK_DIR = Path("30_cagi6_sherloc").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/30_cagi6_sherloc')

In [4]:
# SciNet hosts get a fixed worker count of 40; any other host uses the number of
# CPUs in this process's affinity mask, but never fewer than one.
_on_scinet = "scinet" in socket.gethostname()
CPU_COUNT = 40 if _on_scinet else max(1, len(os.sched_getaffinity(0)))

CPU_COUNT

32

## Download data

In [5]:
# Download the challenge files from Synapse only when the submission template is not
# already present in NOTEBOOK_DIR. Credentials are read from the environment.
_template_present = NOTEBOOK_DIR.joinpath("submission_template.tsv").is_file()
if not _template_present:
    syn = synapseclient.Synapse()
    syn.login(os.environ["SYNAPSE_USERNAME"], os.environ["SYNAPSE_PASSWORD"])
    _ = synapseutils.syncFromSynapse(syn, "syn25958776", path=NOTEBOOK_DIR)
    # `-o` overwrites existing files without prompting; `check=True` raises on failure.
    unzip_command = ["unzip", "-o", "CAGI6-Sherloc-clinical-classification.zip"]
    _ = subprocess.run(unzip_command, cwd=NOTEBOOK_DIR, check=True)

## Load data

In [6]:
submission_template_file = NOTEBOOK_DIR.joinpath("submission_template.tsv")

In [7]:
submission_template_df = pd.read_csv(submission_template_file, sep="\t")

display(submission_template_df.head(2))
len(submission_template_df)

Unnamed: 0,hgvs,score,class,comment
0,NM_152486.2:c.39C>T,*,*,*
1,NM_152486.2:c.62C>T,*,*,*


122124

### `training_df`

In [8]:
training_file = NOTEBOOK_DIR.joinpath(
    "CAGI6-Sherloc-clinical-classification", "final_train_070821.txt"
)

In [9]:
training_all_df = pd.read_csv(training_file, sep="\t")

display(training_all_df.head(2))
len(training_all_df)

Unnamed: 0,chr,pos,ref,alt,hgvs,interpretation,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion
0,1,861332,G,A,NM_152486.2:c.11G>A,Uncertain significance,True,False,False
1,1,865568,G,A,NM_152486.2:c.106G>A,Uncertain significance,True,False,False


418354

In [10]:
assert not set(submission_template_df["hgvs"]) & set(training_all_df["hgvs"])

In [11]:
training_df = training_all_df[training_all_df["Subcategory Missense"]]

display(training_df.head(2))
len(training_df)

Unnamed: 0,chr,pos,ref,alt,hgvs,interpretation,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion
0,1,861332,G,A,NM_152486.2:c.11G>A,Uncertain significance,True,False,False
1,1,865568,G,A,NM_152486.2:c.106G>A,Uncertain significance,True,False,False


217153

### `testing_df`

In [12]:
testing_file = NOTEBOOK_DIR.joinpath(
    "CAGI6-Sherloc-clinical-classification", "final_test_070821.txt"
)

In [13]:
testing_all_df = pd.read_csv(testing_file, sep="\t")

display(testing_all_df.head(2))
len(testing_all_df)

Unnamed: 0,chr,pos,ref,alt,hgvs,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion
0,1,861360,C,T,NM_152486.2:c.39C>T,False,False,False
1,1,861383,C,T,NM_152486.2:c.62C>T,True,False,False


122124

In [14]:
assert not set(submission_template_df["hgvs"]) ^ set(testing_all_df["hgvs"])

In [15]:
testing_df = testing_all_df[testing_all_df["Subcategory Missense"]]

display(testing_df.head(2))
len(testing_df)

Unnamed: 0,chr,pos,ref,alt,hgvs,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion
1,1,861383,C,T,NM_152486.2:c.62C>T,True,False,False
7,1,865640,G,A,NM_152486.2:c.178G>A,True,False,False


47103

### `validation_df`

In [16]:
validation_file = NOTEBOOK_DIR.joinpath("validation_variants.tsv")

In [17]:
validation_all_df = pd.read_csv(validation_file, sep="\t").rename(columns={"HGVS.c": "hgvs_g"})

display(validation_all_df.tail(2))
len(validation_all_df)

Unnamed: 0,hgvs_g
17812,NC_000023.10:g.32519861C>T
17813,NC_000015.9:g.85406864_85406865insT


17814

In [18]:
def map_g_to_c(str_g, mapper, parser=None):
    """Yield the coding (HGVS.c) variants that a genomic (HGVS.g) variant maps to.

    Args:
        str_g: Genomic variant as an HGVS string.
        mapper: Object providing `relevant_transcripts(var_g)` and
            `g_to_c(var_g, tx_ac)` (an `hgvs` assembly mapper in this notebook).
        parser: Optional object providing `parse_hgvs_variant(str)`. When omitted,
            an `hgvs.parser.Parser` is created on first use and reused by later calls,
            so the function no longer depends on a global `hp` being defined.

    Yields:
        One mapped variant per relevant transcript. Transcripts for which the mapper
        raises `HGVSUsageError` or `HGVSInvalidIntervalError` are skipped.
    """
    from hgvs.exceptions import HGVSInvalidIntervalError, HGVSUsageError

    if parser is None:
        # Cache the default parser on the function object so it is built only once.
        parser = getattr(map_g_to_c, "_default_parser", None)
        if parser is None:
            import hgvs.parser

            parser = map_g_to_c._default_parser = hgvs.parser.Parser()

    var_g = parser.parse_hgvs_variant(str_g)
    for tx_ac in mapper.relevant_transcripts(var_g):
        try:
            var_c = mapper.g_to_c(var_g, tx_ac)
        except (HGVSUsageError, HGVSInvalidIntervalError):
            continue
        yield var_c

In [19]:
validation_mapping_file = NOTEBOOK_DIR.joinpath("validation-mapping-grch37.parquet")

if validation_mapping_file.is_file():
    validation_mapping_df = pq.read_table(validation_mapping_file).to_pandas()
else:
    # Cache miss: map every genomic variant to its coding variants through UTA.
    import hgvs.assemblymapper
    import hgvs.dataproviders.uta
    import hgvs.parser

    # `hp` is kept as a notebook-level name because `map_g_to_c` may resolve the
    # parser as a global.
    hp = hgvs.parser.Parser()
    hdp = hgvs.dataproviders.uta.connect()
    mapper = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name="GRCh37", normalize=False)

    results = []
    for tup in tqdm(validation_all_df.itertuples(), total=len(validation_all_df)):
        for var_c in map_g_to_c(tup.hgvs_g, mapper):
            # The column is named `hgvs_g`; `tup.str_g` does not exist.
            results.append((tup.hgvs_g, str(var_c)))
    validation_mapping_df = pd.DataFrame(results, columns=["hgvs_g", "hgvs"])
    pq.write_table(
        pa.Table.from_pandas(validation_mapping_df, preserve_index=False), validation_mapping_file
    )

In [20]:
display(validation_mapping_df.head(2))
print(len(validation_mapping_df))

Unnamed: 0,hgvs_g,hgvs
0,NC_000002.11:g.152537358A>G,NM_001164508.1:c.2944-16T>C
1,NC_000002.11:g.152537358A>G,NM_004543.4:c.2944-16T>C


67819


In [21]:
validation_df = (
    validation_all_df
    #
    .merge(validation_mapping_df, on=["hgvs_g"])
)

In [22]:
display(validation_df.tail(2))
print(len(validation_df))
print(len(validation_df["hgvs_g"].unique()))

Unnamed: 0,hgvs_g,hgvs
67817,NC_000023.10:g.32519861C>T,NM_004010.3:c.2011+11G>A
67818,NC_000015.9:g.85406864_85406865insT,NM_020778.4:c.5098_5099insT


67819
17801


## Map HGVS to proteins and mutations (`mutation_mapping_df`)

In [23]:
class DisableLogger:
    """Context manager that suppresses all log records up to CRITICAL inside the block.

    On exit the process-wide `logging.disable` level that was active on entry is
    restored, rather than being unconditionally reset to `NOTSET`.
    """

    def __enter__(self):
        # `logging.disable(level)` stores the level on the root logger's manager.
        self._previous_level = logging.root.manager.disable
        logging.disable(logging.CRITICAL)
        return self

    def __exit__(self, exit_type, exit_value, exit_traceback):
        # Returns None, so any exception raised inside the block propagates.
        logging.disable(self._previous_level)

In [24]:
def worker(str_c, hp, mapper):
    """Translate one HGVS.c string into its protein-level consequence.

    Args:
        str_c: Coding variant as an HGVS string.
        hp: Parser exposing `parse_hgvs_variant`.
        mapper: Mapper exposing `c_to_p`.

    Returns:
        Tuple `(str_c, protein accession, protein posedit)`.
    """
    parsed_variant = hp.parse_hgvs_variant(str_c)
    # Logging is disabled for the duration of the c_to_p call.
    with DisableLogger():
        protein_variant = mapper.c_to_p(parsed_variant)
    accession, posedit = protein_variant.ac, protein_variant.posedit
    return (str_c, accession, posedit)

In [25]:
def load_cache(cache_file):
    """Return the pickled results stored at `cache_file`, or `[]` if it does not exist.

    Args:
        cache_file: `pathlib.Path`-like object supporting `is_file()` and `open()`.

    Note: the file is read with `pickle.load`, so it must come from a trusted source.
    """
    if not cache_file.is_file():
        return []
    with cache_file.open("rb") as fin:
        return pickle.load(fin)

In [26]:
def pe_to_df(pe):
    """Format a protein posedit as `<start aa><start position><alt>`, e.g. `R3L`.

    Returns `None` when `pe` is `None`.
    """
    if pe is None:
        return None
    start = pe.pos.start
    return f"{start.aa}{start.base}{pe.edit.alt}"

In [27]:
def map_transcript_to_protein(
    hgvs_ids, cache_file=NOTEBOOK_DIR.joinpath("c-to-p-mapping-grch37.pickle")
):
    """Map HGVS.c variants to protein-level variants, checkpointing to a pickle cache.

    Variants already present in `cache_file` are not recomputed. Variants whose
    mapping raises are reported on stdout and skipped; they are not stored in the
    cache, so they are retried on the next call.

    Args:
        hgvs_ids: Iterable of HGVS.c strings.
        cache_file: Path of the pickle cache (read at start, rewritten while running).

    Returns:
        List of `(hgvs_c, protein_accession, posedit)` tuples, cached results first.
    """
    import hgvs.assemblymapper
    import hgvs.dataproviders.uta
    import hgvs.parser

    def _save(results):
        with cache_file.open("wb") as fout:
            pickle.dump(results, fout, pickle.HIGHEST_PROTOCOL)

    hp = hgvs.parser.Parser()
    hdp = hgvs.dataproviders.uta.connect()
    mapper = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name="GRCh37")

    results = load_cache(cache_file)
    already_mapped = {r[0] for r in results}
    str_c_list = sorted(set(hgvs_ids) - already_mapped)
    for str_c in tqdm(str_c_list):
        try:
            result = worker(str_c, hp, mapper)
        except Exception as e:
            print(f"An error occured: {e!r}.")
            continue

        results.append(result)
        # Checkpoint only after a successful append, so a run of failures does not
        # rewrite an unchanged cache on every iteration.
        if len(results) % 1000 == 0:
            _save(results)

    _save(results)

    return results

In [28]:
mutation_mapping_file = NOTEBOOK_DIR.joinpath("mutation-mapping-grch37.parquet")

if not mutation_mapping_file.is_file():
    # Cache miss: map every coding variant from the three datasets to a protein change.
    hgvs_ids = list(set(training_df["hgvs"]) | set(testing_df["hgvs"]) | set(validation_df["hgvs"]))
    mutation_map = map_transcript_to_protein(hgvs_ids)
    # Only posedits that carry a `pos` attribute are kept.
    mapping_rows = [
        (hgvs_c, protein_ac, pe_to_df(posedit))
        for hgvs_c, protein_ac, posedit in mutation_map
        if hasattr(posedit, "pos")
    ]
    mutation_mapping_df = pd.DataFrame(mapping_rows, columns=["hgvs", "protein_id", "mut"])
    mutation_mapping_table = pa.Table.from_pandas(mutation_mapping_df, preserve_index=False)
    pq.write_table(mutation_mapping_table, mutation_mapping_file)
else:
    mutation_mapping_df = pq.read_table(mutation_mapping_file).to_pandas()

In [29]:
# Drop the version suffix from the protein accession (e.g. "NP_998885.1" -> "NP_998885").
# A raw string is used because "\." is an invalid escape sequence in a normal string.
mutation_mapping_df["refseq_id"] = mutation_mapping_df["protein_id"].str.split(r"\.").str[0].values

In [30]:
display(mutation_mapping_df.head(2))
print(len(mutation_mapping_df))

Unnamed: 0,hgvs,protein_id,mut,refseq_id
0,NM_213720.2:c.8G>T,NP_998885.1,R3L,NP_998885
1,NM_213720.2:c.8G>C,NP_998885.1,R3P,NP_998885


297364


In [31]:
mutation_mapping_df = mutation_mapping_df[mutation_mapping_df["mut"].str.contains("^[A-Z][0-9]+[A-Z]$")]

display(mutation_mapping_df.head(2))
print(len(mutation_mapping_df))

Unnamed: 0,hgvs,protein_id,mut,refseq_id
0,NM_213720.2:c.8G>T,NP_998885.1,R3L,NP_998885
1,NM_213720.2:c.8G>C,NP_998885.1,R3P,NP_998885


270201


## Map RefSeq to UniProt

### Canonical (`uniprot_mapping_df`)

In [32]:
def map_refseq_to_uniprot(refseq_protein_ids):
    """Query BigQuery for UniProtKB/Swiss-Prot ids that share a UniParc entry with RefSeq ids.

    The query joins RefSeq cross-references to Swiss-Prot cross-references with
    NCBI taxonomy id 9606 on `uniparc_id`, restricted to the given RefSeq ids.

    Args:
        refseq_protein_ids: List of RefSeq protein ids, passed as a query parameter.

    Returns:
        DataFrame with columns `refseq_id` and `uniprot_id`.
    """
    from google.cloud import bigquery

    sql_query = """\
SELECT
  refseq.db_id refseq_id,
  human_uniprot.db_id uniprot_id
FROM (
  SELECT
    uniparc_id,
    db_id
  FROM
    `ostrokach-data.uniparc.xref`
  WHERE
    db_type = 'RefSeq') refseq
JOIN (
  SELECT
    uniparc_id,
    db_id
  FROM
    `ostrokach-data.uniparc.xref` xref
  JOIN
    `ostrokach-data.uniparc.ncbi_taxonomy_id` ncbi_taxonomy_id
  USING
    (uniparc_id,
      xref_id)
  WHERE
    xref.db_type = 'UniProtKB/Swiss-Prot'
    AND ncbi_taxonomy_id.value = 9606) human_uniprot
USING
  (uniparc_id)
WHERE
  refseq.db_id IN UNNEST(@refseq_ids);
"""
    # The ids are bound as an array parameter rather than interpolated into the SQL.
    refseq_ids_param = bigquery.ArrayQueryParameter("refseq_ids", "STRING", refseq_protein_ids)
    job_config = bigquery.QueryJobConfig(query_parameters=[refseq_ids_param])

    client = bigquery.Client()
    return client.query(sql_query, job_config=job_config).to_dataframe()

In [33]:
uniprot_mapping_file = NOTEBOOK_DIR.joinpath("uniprot-mapping.parquet")

if not uniprot_mapping_file.is_file():
    # Cache miss: run the BigQuery lookup for every distinct RefSeq id and store it.
    refseq_protein_ids = mutation_mapping_df["refseq_id"].unique().tolist()
    uniprot_mapping_df = map_refseq_to_uniprot(refseq_protein_ids)
    uniprot_mapping_table = pa.Table.from_pandas(uniprot_mapping_df, preserve_index=False)
    pq.write_table(uniprot_mapping_table, uniprot_mapping_file)
else:
    uniprot_mapping_df = pq.read_table(uniprot_mapping_file).to_pandas()

In [34]:
len(set(mutation_mapping_df["refseq_id"]) - set(uniprot_mapping_df["refseq_id"]))

1923

### Non-canonical (`uniprot_mapping_noncanon_df`)

In [35]:
def map_refseq_to_uniprot_noncanon(refseq_protein_ids):
    """Query BigQuery for Swiss-Prot ids linked to RefSeq ids through a shared gene name.

    Unlike `map_refseq_to_uniprot`, the RefSeq and Swiss-Prot cross-references are
    matched on gene name instead of `uniparc_id`, and the UniParc id and sequence of
    the RefSeq entry are returned alongside the match.

    Args:
        refseq_protein_ids: List of RefSeq protein ids, passed as a query parameter.

    Returns:
        DataFrame with columns `refseq_id`, `uniprot_id`, `uniparc_id` and
        `uniparc_sequence`.
    """
    from google.cloud import bigquery

    sql_query = """\
SELECT
  refseq.db_id refseq_id,
  human_uniprot.db_id uniprot_id,
  uniparc.uniparc_id uniparc_id,
  uniparc.sequence uniparc_sequence
FROM (
  SELECT
    uniparc_id,
    xref_id,
    db_id
  FROM
    `ostrokach-data.uniparc.xref`
  WHERE
    db_type = 'RefSeq') refseq
JOIN
  `ostrokach-data.uniparc.uniparc` uniparc
USING
  (uniparc_id)
JOIN
  `ostrokach-data.uniparc.gene_name` gene_name_refseq
USING
  (uniparc_id,
    xref_id)
JOIN
  `ostrokach-data.uniparc.gene_name` gene_name_uniprot
ON
  (gene_name_refseq.value = gene_name_uniprot.value)
JOIN (
  SELECT
    uniparc_id,
    xref_id,
    db_id
  FROM
    `ostrokach-data.uniparc.xref` xref
  JOIN
    `ostrokach-data.uniparc.ncbi_taxonomy_id` ncbi_taxonomy_id
  USING
    (uniparc_id,
      xref_id)
  WHERE
    xref.db_type = 'UniProtKB/Swiss-Prot'
    AND ncbi_taxonomy_id.value = 9606) human_uniprot
ON
  (gene_name_uniprot.uniparc_id = human_uniprot.uniparc_id
    AND gene_name_uniprot.xref_id = human_uniprot.xref_id)
WHERE
  refseq.db_id IN UNNEST(@refseq_ids);
"""
    # The ids are bound as an array parameter rather than interpolated into the SQL.
    refseq_ids_param = bigquery.ArrayQueryParameter("refseq_ids", "STRING", refseq_protein_ids)
    job_config = bigquery.QueryJobConfig(query_parameters=[refseq_ids_param])

    client = bigquery.Client()
    return client.query(sql_query, job_config=job_config).to_dataframe()

In [36]:
uniprot_mapping_noncanon_file = NOTEBOOK_DIR.joinpath("uniprot-mapping-noncanon.parquet")

if not uniprot_mapping_noncanon_file.is_file():
    # Cache miss: only RefSeq ids without a direct UniParc-based match are queried.
    unmapped_refseq_ids = set(mutation_mapping_df["refseq_id"]) - set(
        uniprot_mapping_df["refseq_id"]
    )
    refseq_protein_ids = list(unmapped_refseq_ids)
    uniprot_mapping_noncanon_df = map_refseq_to_uniprot_noncanon(refseq_protein_ids)
    noncanon_table = pa.Table.from_pandas(uniprot_mapping_noncanon_df, preserve_index=False)
    pq.write_table(noncanon_table, uniprot_mapping_noncanon_file)
else:
    uniprot_mapping_noncanon_df = pq.read_table(uniprot_mapping_noncanon_file).to_pandas()

In [37]:
len(
    set(mutation_mapping_df["refseq_id"])
    - set(uniprot_mapping_df["refseq_id"])
    - set(uniprot_mapping_noncanon_df["refseq_id"])
)

36

In [38]:
display(uniprot_mapping_noncanon_df.head(2))
len(uniprot_mapping_noncanon_df)

Unnamed: 0,refseq_id,uniprot_id,uniparc_id,uniparc_sequence
0,NP_002867,O43502,UPI000006DB00,MRGKTFRFEMQRDLVSFPLSPAVRVKLVSAGFQTAEELLEVKPSEL...
1,NP_056512,Q96H96,UPI0000D6158E,MTPISQVRMRKGSAHTAAQPGRLGLHPAGATAHACRGMTSIRARPG...


1892

In [39]:
uniparc_sequence_lookup = (
    uniprot_mapping_noncanon_df[["uniparc_id", "uniparc_sequence"]]
    .drop_duplicates()
    .set_index("uniparc_id")["uniparc_sequence"]
    .to_dict()
)

list(uniparc_sequence_lookup)[:3]

['UPI000006DB00', 'UPI0000D6158E', 'UPI000002A8AB']

## Combine datasets (`combined_mapped_df`)

In [40]:
# Build one table of all variants with their protein change and UniProt id.
# Both merges use pandas' default join type, so variants without a protein mapping
# or without any UniProt mapping are dropped here.
combined_mapped_df = (
    # Stack training, testing and validation variants into a single frame.
    pd.concat([training_df, testing_df, validation_df], ignore_index=True)
    # Attach protein_id / mut / refseq_id for each HGVS.c string.
    .merge(mutation_mapping_df, on=["hgvs"])
    # Attach uniprot_id; rows coming from the non-canonical mapping also carry a
    # uniparc_id, while rows from the direct mapping have it missing.
    .merge(
        pd.concat(
            [uniprot_mapping_df, uniprot_mapping_noncanon_df.drop("uniparc_sequence", axis=1)],
            ignore_index=True,
        ),
        on=["refseq_id"],
    )
    # Keep only rows that have both a UniProt id and a mutation.
    .dropna(subset=["uniprot_id", "mut"])
)

In [41]:
display(combined_mapped_df.head(2))
print(len(combined_mapped_df))

Unnamed: 0,chr,pos,ref,alt,hgvs,interpretation,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion,hgvs_g,protein_id,mut,refseq_id,uniprot_id,uniparc_id
0,1,861332.0,G,A,NM_152486.2:c.11G>A,Uncertain significance,True,False,False,,NP_689699.2,G4E,NP_689699,Q96NU1,
1,1,865568.0,G,A,NM_152486.2:c.106G>A,Uncertain significance,True,False,False,,NP_689699.2,A36T,NP_689699,Q96NU1,


279662


In [42]:
len(set(training_df["hgvs"]) - set(combined_mapped_df["hgvs"])) / len(set(training_df["hgvs"]))

0.0017913636928801352

In [43]:
len(set(testing_df["hgvs"]) - set(combined_mapped_df["hgvs"])) / len(set(testing_df["hgvs"]))

0.003333121032630618

## Download sequences (`uniprot_sequences_df`)

In [44]:
def download_sequence(uniprot_id):
    """Download the FASTA record for a UniProt id and return its sequence as one string.

    Two ids are special-cased: `Q9Y2G2` is requested as `Q9Y2G2-1`, and for `P11586`
    the residue at index 133 (asserted to be `K`) is replaced by `R`.
    """
    if uniprot_id == "Q9Y2G2":
        uniprot_id = f"{uniprot_id}-1"

    fasta_url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    with urllib.request.urlopen(fasta_url) as response:
        fasta_text = response.read().decode("utf-8")

    # Concatenate every non-header line, stripped of surrounding whitespace.
    sequence = "".join(
        line.strip() for line in fasta_text.split("\n") if not line.startswith(">")
    )

    if uniprot_id == "P11586":
        # NOTE(review): presumably aligns the sequence with the variant data — confirm.
        assert sequence[133] == "K"
        sequence = f"{sequence[:133]}R{sequence[134:]}"

    return sequence


download_sequence("Q96NU1")

'MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKERTPSFSASDGDSDGSGPTCGRRPGLKQEDGPHIRIMKRRVHTHWDVNISFREASCSQDGNLPTLISSVHRSRHLVMPEHQSRCEFQRGSLEIGLRPAGDLLGKRLGRSPRISSDCFSEKRARSESPQEALLLPRELGPSMAPEDHYRRLVSALSEASTFEDPQRLYHLGLPSHGEDPPWHDPPHHLPSHDLLRVRQEVAAAALRGPSGLEAHLPSSTAGQRRKQGLAQHREGAAPAAAPSFSERELPQPPPLLSPQNAPHVALGPHLRPPFLGVPSALCQTPGYGFLPPAQAEMFAWQQELLRKQNLARLELPADLLRQKELESARPQLLAPETALRPNDGAEELQRRGALLVLNHGAAPLLALPPQGPPGSGPPTPSRDSARRAPRKGGPGPASARPSESKEMTGARLWAQDGSEDEPPKDSDGEDPETAAVGCRGPTPGQAPAGGAGAEGKGLFPGSTLPLGFPYAVSPYFHTGAVGGLSMDGEEAPAPEDVTKWTVDDVCSFVGGLSGCGEYTRVFREQGIDGETLPLLTEEHLLTNMGLKLGPALKIRAQVARRLGRVFYVASFPVALPLQPPTLRAPERELGTGEQPLSPTTATSPYGGGHALAGQTSPKQENGTLALLPGAPDPSQPLC'

In [45]:
uniprot_sequences_file = NOTEBOOK_DIR.joinpath("uniprot-sequences.parquet")

if not uniprot_sequences_file.is_file():
    # Cache miss: download one sequence per distinct UniProt id using 20 threads.
    uniprot_ids = combined_mapped_df["uniprot_id"].unique()
    with concurrent.futures.ThreadPoolExecutor(20) as pool:
        # `pool.map` yields results in input order, so they line up with `uniprot_ids`.
        sequences = list(tqdm(pool.map(download_sequence, uniprot_ids), total=len(uniprot_ids)))
    uniprot_sequences_df = pd.DataFrame({"uniprot_id": uniprot_ids, "sequence": sequences})
    uniprot_sequences_table = pa.Table.from_pandas(uniprot_sequences_df, preserve_index=False)
    pq.write_table(uniprot_sequences_table, uniprot_sequences_file)
else:
    uniprot_sequences_df = pq.read_table(uniprot_sequences_file).to_pandas()

In [46]:
assert uniprot_sequences_df["sequence"].notnull().all()
assert (uniprot_sequences_df["sequence"] != "").all()

In [47]:
uniprot_sequence_lookup = (
    uniprot_sequences_df[["uniprot_id", "sequence"]]
    .drop_duplicates()
    .set_index("uniprot_id")["sequence"]
    .to_dict()
)

list(uniprot_sequence_lookup)[:3]

['Q96NU1', 'P05161', 'O00468']

## Download structures (`uniprot_structures_df`)

In [48]:
def download_structure(uniprot_id):
    """Download the AlphaFold `F1-model_v1` PDB file for a UniProt id as text.

    Returns:
        The PDB file contents, or `None` when the server answers HTTP 404.

    Raises:
        urllib.error.HTTPError: for any HTTP error other than 404.
    """
    structure_url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v1.pdb"
    try:
        with urllib.request.urlopen(structure_url) as response:
            return response.read().decode("utf-8")
    except urllib.error.HTTPError as error:
        if error.code != 404:
            raise
    return None


download_structure("Q96NU1")[:20]

'HEADER              '

In [49]:
download_structure("asdf")

In [50]:
uniprot_structures_file = NOTEBOOK_DIR.joinpath("uniprot-structures.parquet")

if not uniprot_structures_file.is_file():
    # Cache miss: download one structure per distinct UniProt id using 20 threads.
    uniprot_ids = combined_mapped_df["uniprot_id"].unique()
    with concurrent.futures.ThreadPoolExecutor(20) as pool:
        # `pool.map` yields results in input order, so they line up with `uniprot_ids`.
        structures = list(tqdm(pool.map(download_structure, uniprot_ids), total=len(uniprot_ids)))
    uniprot_structures_df = pd.DataFrame({"uniprot_id": uniprot_ids, "structure": structures})
    uniprot_structures_table = pa.Table.from_pandas(uniprot_structures_df, preserve_index=False)
    pq.write_table(uniprot_structures_table, uniprot_structures_file)
else:
    uniprot_structures_df = pq.read_table(uniprot_structures_file).to_pandas()

In [51]:
uniprot_structures_df["structure"].notnull().sum() / len(uniprot_structures_df)

0.9014127764127764

In [52]:
uniprot_structures_df = uniprot_structures_df[uniprot_structures_df["structure"].notnull()]

In [53]:
uniprot_structure_lookup = (
    uniprot_structures_df[["uniprot_id", "structure"]]
    .drop_duplicates()
    .set_index("uniprot_id")["structure"]
    .to_dict()
)

list(uniprot_structure_lookup)[:3]

['Q96NU1', 'P05161', 'O00468']

## Download alignments (`uniprot_alignments_df`)

In [54]:
mmseqs2_output_path = NOTEBOOK_DIR.joinpath("mmseqs2")

mmseqs2_output_path

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/30_cagi6_sherloc/mmseqs2')

In [55]:
def get_alignment(uniprot_id, sequence, gateway):
    """Compute an MMseqs2 alignment for one sequence and store it as a gzipped pickle.

    The remote call is retried indefinitely, sleeping 10 seconds after each failure.

    Args:
        uniprot_id: Identifier used in the result and in the output file name.
        sequence: Query protein sequence.
        gateway: Gateway object forwarded to `mmseqs2.run_mmseqs2`.

    Returns:
        Dict with keys `uniprot_id` and `alignment`; the same dict is written to
        `<mmseqs2_output_path>/<uniprot_id>.pickle.gz`.
    """
    # Imported locally: elsewhere in the notebook `mmseqs2` is only imported inside a
    # conditional cell, so the name may not exist when this function is called.
    from elaspic2.plugins.alphafold import mmseqs2

    while True:
        try:
            alignment = mmseqs2.run_mmseqs2([sequence], gateway=gateway)[0]
            break
        except Exception as e:
            print(e)
            time.sleep(10)
    # The second alignment entry is expected to be the query sequence itself.
    assert alignment[1] == f"{sequence}\n"
    result = {"uniprot_id": uniprot_id, "alignment": alignment}

    # Create the output directory on demand so a finished alignment is never lost.
    mmseqs2_output_path.mkdir(parents=True, exist_ok=True)
    with gzip.open(mmseqs2_output_path.joinpath(f"{uniprot_id}.pickle.gz"), "wb") as fout:
        pickle.dump(result, fout, pickle.HIGHEST_PROTOCOL)

    return result

In [56]:
def iter_alignments(mmseqs2_output_path):
    """Yield the alignment dicts stored as gzipped pickles in `mmseqs2_output_path`.

    Files that cannot be unpickled are reported on stdout and skipped.

    Args:
        mmseqs2_output_path: Directory (`pathlib.Path`) containing
            `<uniprot_id>.pickle.gz` files.

    Yields:
        Dicts with keys `uniprot_id` and `alignment`.

    Note: files are read with `pickle.load`, so the directory must be trusted.
    """
    for file in tqdm(os.listdir(mmseqs2_output_path)):
        uniprot_id = file.split(".")[0]
        with gzip.open(mmseqs2_output_path.joinpath(file)) as fin:
            try:
                data = pickle.load(fin)
            except Exception as error:
                # Print the exception that was actually caught, then skip the file.
                print(error)
                continue
        assert data["uniprot_id"] == uniprot_id
        assert len(data["alignment"]) > 2
        yield data

In [57]:
uniprot_alignments_file = NOTEBOOK_DIR.joinpath("uniprot-alignments.parquet")

if not uniprot_alignments_file.is_file():
    # Cache miss: stream the per-protein pickles into one parquet file in batches.
    # The data is written to a temporary name and renamed only after a complete run,
    # so an interrupted write is not mistaken for a valid cache later.
    partial_alignments_file = uniprot_alignments_file.with_name(
        uniprot_alignments_file.name + ".partial"
    )
    writer = None
    batch_size = 100
    try:
        for alignment_batch in more_itertools.chunked(
            iter_alignments(mmseqs2_output_path), batch_size
        ):
            alignment_batch_df = pd.DataFrame(alignment_batch)
            table = pa.Table.from_pandas(alignment_batch_df, preserve_index=False)
            if writer is None:
                writer = pq.ParquetWriter(
                    partial_alignments_file, table.schema, compression="zstd"
                )
            writer.write_table(table)
    finally:
        # `writer` stays None when there was nothing to write.
        if writer is not None:
            writer.close()
    if writer is not None:
        partial_alignments_file.replace(uniprot_alignments_file)

if uniprot_alignments_file.is_file():
    uniprot_alignments_df = pq.read_table(uniprot_alignments_file).to_pandas()
else:
    # No alignments on disk yet: use an empty frame with the expected columns.
    uniprot_alignments_df = pd.DataFrame(columns=["uniprot_id", "alignment"])

uniprot_alignments_df["have_alignment"] = True

In [58]:
# Sequences that still lack an alignment, shortest first. Built as a single chain so
# the new column is added to a fresh frame rather than to a slice of
# `uniprot_sequences_df`, which avoids SettingWithCopyWarning.
uniprot_sequences_for_aln_df = (
    uniprot_sequences_df[
        ~uniprot_sequences_df["uniprot_id"].isin(uniprot_alignments_df["uniprot_id"].values)
    ]
    .assign(sequence_length=lambda df: df["sequence"].str.len())
    .sort_values("sequence_length", ascending=True)
)

display(uniprot_sequences_for_aln_df)
print(len(uniprot_sequences_for_aln_df))

Unnamed: 0,uniprot_id,sequence,sequence_length
436,P50461,MPNWGGGAKCGACEKTVYHAEEIQCNGRSFHKTCFHCMACRKALDS...,194
1556,Q8WZ42,MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVI...,34350


2


In [59]:
# Request alignments for every sequence that does not have one yet.
# NOTE(review): the `> 2` threshold looks hand-tuned to skip the two sequences shown
# as still missing in the output above — confirm, and consider a named constant.
if len(uniprot_sequences_for_aln_df) > 2:
    # `mmseqs2` is imported only inside this branch.
    from elaspic2.plugins.alphafold import mmseqs2

    dotenv.load_dotenv("../.env")
    with mmseqs2.api_gateway(mmseqs2.MMSEQS2_HOST_URL) as gateway:
        concurrency = 64
        with concurrent.futures.ThreadPoolExecutor(concurrency) as pool:
            # Each call writes its own result file; the returned list is kept in
            # `alignments`, in input order.
            alignments = list(
                tqdm(
                    pool.map(
                        get_alignment,
                        uniprot_sequences_for_aln_df["uniprot_id"],
                        uniprot_sequences_for_aln_df["sequence"],
                        itertools.repeat(gateway),
                    ),
                    total=len(uniprot_sequences_for_aln_df),
                )
            )

## Map mutations to canonical uniprot (`combined_mapped_validated_df`)

In [60]:
def mutation_matches_sequence(mutation, sequence):
    """Check that the wild-type residue of a mutation agrees with a sequence.

    Args:
        mutation: Mutation string `<wt><1-based position><mut>`, e.g. `"A5C"`.
        sequence: Protein sequence.

    Returns:
        True if `sequence` has residue `<wt>` at the given position. False if the
        mutation is malformed, the position is not an integer, or the position lies
        outside `1..len(sequence)`.
    """
    if len(mutation) < 3:
        return False
    wt, pos = mutation[0], mutation[1:-1]
    try:
        pos = int(pos)
    except ValueError:
        return False
    # Positions below 1 must be rejected explicitly; otherwise `sequence[pos - 1]`
    # would wrap around through negative indexing.
    if pos < 1 or pos > len(sequence):
        return False
    return sequence[pos - 1] == wt


mutation_matches_sequence("A5C", "XXXXA")

True

In [61]:
combined_mapped_canonical_all_df = combined_mapped_df[combined_mapped_df["uniparc_id"].isnull()]

combined_mapped_noncanonical_all_df = combined_mapped_df[combined_mapped_df["uniparc_id"].notnull()]

assert len(combined_mapped_df) == len(combined_mapped_canonical_all_df) + len(
    combined_mapped_noncanonical_all_df
)

In [62]:
# Keep only canonical rows whose wild-type residue agrees with the UniProt sequence.
matches = np.array(
    [
        mutation_matches_sequence(mutation, uniprot_sequence_lookup[accession])
        for mutation, accession in zip(
            combined_mapped_canonical_all_df["mut"], combined_mapped_canonical_all_df["uniprot_id"]
        )
    ]
)

combined_mapped_canonical_df = combined_mapped_canonical_all_df[matches]

len(combined_mapped_canonical_df) / len(combined_mapped_canonical_all_df)

0.9983728749185359

In [63]:
# Keep only non-canonical rows whose wild-type residue agrees with the UniParc sequence.
matches = np.array(
    [
        mutation_matches_sequence(mutation, uniparc_sequence_lookup[accession])
        for mutation, accession in zip(
            combined_mapped_noncanonical_all_df["mut"],
            combined_mapped_noncanonical_all_df["uniparc_id"],
        )
    ]
)

combined_mapped_noncanonical_df = combined_mapped_noncanonical_all_df[matches]

len(combined_mapped_noncanonical_df) / len(combined_mapped_noncanonical_all_df)

0.9610340873553633

In [64]:
# Pairwise alignments between each UniParc sequence and its UniProt sequence,
# cached in a parquet file.
uniparc_to_uniprot_alignment_file = NOTEBOOK_DIR.joinpath("uniparc-to_uniprot-alignment.parquet")

if uniparc_to_uniprot_alignment_file.is_file():
    uniparc_to_uniprot_alignment_df = pq.read_table(uniparc_to_uniprot_alignment_file).to_pandas()
else:
    from kmtools import sequence_tools

    # One alignment per distinct (uniparc_id, uniprot_id) pair.
    uniparc_and_uniprot_pairs = combined_mapped_noncanonical_df[
        ["uniparc_id", "uniprot_id"]
    ].drop_duplicates()

    # NOTE(review): a thread pool only speeds this up if `align_pairwise` releases the
    # GIL or runs an external process — confirm.
    with concurrent.futures.ThreadPoolExecutor(CPU_COUNT) as pool:
        alignments = list(
            tqdm(
                pool.map(
                    sequence_tools.align_pairwise,
                    (
                        uniparc_sequence_lookup[id_]
                        for id_ in uniparc_and_uniprot_pairs["uniparc_id"]
                    ),
                    (
                        uniprot_sequence_lookup[id_]
                        for id_ in uniparc_and_uniprot_pairs["uniprot_id"]
                    ),
                ),
                total=len(uniparc_and_uniprot_pairs),
            )
        )

    # Each alignment is unpacked into two columns: the aligned UniParc sequence and
    # the aligned UniProt sequence.
    uniparc_to_uniprot_alignment_df = pd.DataFrame(
        [
            (uniparc_id, uniprot_id, *alignment)
            for (uniparc_id, uniprot_id), alignment in zip(
                uniparc_and_uniprot_pairs.values, alignments
            )
        ],
        columns=["uniparc_id", "uniprot_id", "uniparc_sequence_aln", "uniprot_sequence_aln"],
    )
    pq.write_table(
        pa.Table.from_pandas(uniparc_to_uniprot_alignment_df, preserve_index=False),
        uniparc_to_uniprot_alignment_file,
    )

In [65]:
uniparc_to_uniprot_alignment_lookup = (
    uniparc_to_uniprot_alignment_df.set_index(["uniparc_id", "uniprot_id"])
    .apply(tuple, axis=1)
    .to_dict()
)

assert len(uniparc_to_uniprot_alignment_df) == len(uniparc_to_uniprot_alignment_lookup)

list(uniparc_to_uniprot_alignment_lookup)[:3]

[('UPI00001D7C8B', 'O00468'),
 ('UPI000013D455', 'O14640'),
 ('UPI000002B0DD', 'O60683')]

In [66]:
def map_mutation(mutation, alignment):
    """Translate a mutation from the first aligned sequence to the second.

    Args:
        mutation: Mutation string `<wt><1-based position><mut>` numbered on the
            ungapped first sequence.
        alignment: Pair `(seq1, seq2)` of equal-length aligned sequences that use
            `-` for gaps.

    Returns:
        The mutation renumbered for the ungapped second sequence, or `None` when the
        residue aligned to the mutated position in `seq2` is a gap or differs from
        `<wt>`, or when the position is never reached.
    """
    seq1, seq2 = alignment
    assert len(seq1) == len(seq2)

    wt, pos, mut = mutation[0], int(mutation[1:-1]), mutation[-1]
    assert len(seq1) >= pos

    pos1 = 0
    pos2 = 0
    for aa1, aa2 in zip(seq1, seq2):
        if aa2 != "-":
            pos2 += 1
        if aa1 == "-":
            # Gap columns in seq1 never correspond to a seq1 residue. Skipping them
            # prevents a later seq2 residue that merely equals `wt` from matching.
            continue
        pos1 += 1
        if pos1 == pos:
            # Decide in the one column that holds the mutated residue.
            return f"{wt}{pos2}{mut}" if aa2 == wt else None
    return None

In [67]:
# Renumber each mutation from its UniParc sequence onto the UniProt sequence.
# `.assign` returns a new frame, so no column is written into a slice of
# `combined_mapped_noncanonical_all_df` (avoids SettingWithCopyWarning).
combined_mapped_noncanonical_df = combined_mapped_noncanonical_df.assign(
    mut_uniprot=[
        map_mutation(mutation, uniparc_to_uniprot_alignment_lookup[(uniparc_id, uniprot_id)])
        for mutation, uniparc_id, uniprot_id in combined_mapped_noncanonical_df[
            ["mut", "uniparc_id", "uniprot_id"]
        ].values
    ]
)

In [68]:
combined_mapped_noncanonical_df["mut_uniprot"].notnull().sum() / len(combined_mapped_noncanonical_df)

0.9699106213120444

In [69]:
combined_mapped_validated_df = pd.concat(
    [
        combined_mapped_canonical_df,
        combined_mapped_noncanonical_df.rename(
            columns={"mut": "mut_uniparc", "mut_uniprot": "mut"}
        ).dropna(subset=["mut"]),
    ],
    ignore_index=True,
)

In [70]:
print(len(combined_mapped_validated_df) / len(combined_mapped_df))
print(len(combined_mapped_validated_df["hgvs"].unique()) / len(combined_mapped_df["hgvs"].unique()))
print(
    len(combined_mapped_validated_df["hgvs_g"].unique())
    / len(combined_mapped_df["hgvs_g"].unique())
)
# 0.9870093183914869
# 0.9876785224602933
# 0.9723953695458593

0.9870093183914869
0.9876785224602933
0.9723953695458593


In [71]:
assert len(combined_mapped_validated_df) == len(
    combined_mapped_validated_df.drop_duplicates(subset=["hgvs", "hgvs_g", "uniprot_id"])
)

## Finalize data

In [72]:
def load_structure_blob(structure_blob):
    """Parse PDB text into a structure object by going through a temporary `.pdb` file.

    Args:
        structure_blob: Contents of a PDB file as a string.

    Returns:
        Whatever `PDB.load` returns for the temporary file.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdb") as tmp_file:
        # Write through the file's path, then load while the temp file still exists.
        Path(tmp_file.name).write_text(structure_blob)
        return PDB.load(tmp_file.name)

def sequence_matches_structure(sequence, structure_blob):
    """Return whether `sequence` equals the sequence of chain `A` in the first model.

    Args:
        sequence: Expected protein sequence.
        structure_blob: Contents of a PDB file as a string.
    """
    chain_a = load_structure_blob(structure_blob)[0]["A"]
    return sequence == structure_tools.get_chain_sequence(chain_a)
        
        
sequence_matches_structure("", uniprot_structure_lookup["Q96NU1"])

False

In [73]:
combined_mapped_validated_df.head(2)

Unnamed: 0,chr,pos,ref,alt,hgvs,interpretation,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion,hgvs_g,protein_id,mut,refseq_id,uniprot_id,uniparc_id,mut_uniparc
0,1,861332.0,G,A,NM_152486.2:c.11G>A,Uncertain significance,True,False,False,,NP_689699.2,G4E,NP_689699,Q96NU1,,
1,1,865568.0,G,A,NM_152486.2:c.106G>A,Uncertain significance,True,False,False,,NP_689699.2,A36T,NP_689699,Q96NU1,,


In [74]:
combined_mapped_validated_df[
    combined_mapped_validated_df.duplicated(subset=["hgvs", "uniprot_id", "mut"], keep=False)
].sort_values(["uniprot_id", "hgvs"]).head(100)

Unnamed: 0,chr,pos,ref,alt,hgvs,interpretation,Subcategory Missense,Subcategory Small intronic deletion or insertion,Subcategory Small exonic in-frame deletion or insertion,hgvs_g,protein_id,mut,refseq_id,uniprot_id,uniparc_id,mut_uniparc
58731,14,61115535.0,C,T,NM_005982.3:c.373G>A,,True,False,False,,NP_005973.1,E125K,NP_005973,A1YER0,,
58733,,,,,NM_005982.3:c.373G>A,,,,,NC_000014.8:g.61115535C>T,NP_005973.1,E125K,NP_005973,A1YER0,,
101430,17,79478256.0,G,A,NM_001614.3:c.760C>T,,True,False,False,,NP_001605.1,R254W,NP_001605,A2BDB0,,
101437,,,,,NM_001614.3:c.760C>T,,,,,NC_000017.10:g.79478256G>A,NP_001605.1,R254W,NP_001605,A2BDB0,,
115353,2,15679451.0,G,A,NM_015909.3:c.409C>T,,True,False,False,,NP_056993.2,R137W,NP_056993,A2RRP1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103020,,,,,NM_000271.4:c.3160G>A,,,,,NC_000018.9:g.21116722C>T,NP_000262.2,A1054T,NP_000262,O15118,,
103058,18,21148900.0,T,C,NM_000271.4:c.350A>G,,True,False,False,,NP_000262.2,Q117R,NP_000262,O15118,,
103059,,,,,NM_000271.4:c.350A>G,,,,,NC_000018.9:g.21148900T>C,NP_000262.2,Q117R,NP_000262,O15118,,
238475,12,115112299.0,G,A,NM_005996.3:c.1381C>T,,True,False,False,,NP_005987.3,P481S,NP_005987,O15119,UPI000002B4EA,P461S


In [75]:
set(uniprot_structure_lookup) - set(uniprot_sequence_lookup)

set()

In [76]:
# Group the validated variants by protein: one row per uniprot_id with parallel
# lists of hgvs / mut / interpretation.
final_df = (
    combined_mapped_validated_df
    # Attach the protein sequence so that ids sharing a sequence can be merged
    # (duplicates caused by bad mapping from RefSeq to UniProt).
    .merge(uniprot_sequences_df, on=["uniprot_id"])
    # Replace uniprot_id with one representative id per distinct sequence. The second
    # dict overrides the first, so an id that has a structure wins.
    .assign(
        uniprot_id=lambda df: df["sequence"].map(
            {uniprot_sequence_lookup[k]: k for k in uniprot_sequence_lookup}
            | {uniprot_sequence_lookup[k]: k for k in uniprot_structure_lookup}
        )
    )
    # Remove repeated variants per protein, then per sequence.
    .drop_duplicates(subset=["uniprot_id", "hgvs", "mut", "interpretation"])
    .sort_values("uniprot_id")
    .drop_duplicates(["sequence", "hgvs", "mut", "interpretation"])
    # `dropna=False` keeps rows with a missing id as their own group, so the
    # assertion that follows this statement can detect them.
    .groupby(["uniprot_id"], dropna=False)
    .agg({"hgvs": list, "mut": list, "interpretation": list})
    .reset_index()
)

assert final_df["uniprot_id"].notnull().all()

display(final_df.head(2))
print(len(final_df))  # 2901 | 3245 | 2926

Unnamed: 0,uniprot_id,hgvs,mut,interpretation
0,A1X283,"[NM_001017995.2:c.1475A>C, NM_001017995.2:c.10...","[K492T, K341E, R356Q, E396K, A431T, T445A, A49...","[Uncertain significance, Uncertain significanc..."
1,A2IDD5,"[NM_001031737.2:c.902A>G, NM_001031737.2:c.910...","[K301R, V304L, R308H, D333E, N322K, A325P, R29...","[Uncertain significance, Uncertain significanc..."


2926


In [77]:
# Attach sequence, structure and alignment to each protein. The merges use pandas'
# default join type, so proteins missing any of the three are dropped.
# NOTE(review): this reassigns `final_df` from itself, so re-running the cell without
# re-running the one that builds `final_df` merges the same columns a second time.
final_df = (
    final_df
    # Protein sequence.
    .merge(uniprot_sequences_df, on=["uniprot_id"])
    # Structure text (PDB).
    .merge(uniprot_structures_df, on=["uniprot_id"])
    # Alignment.
    .merge(uniprot_alignments_df[["uniprot_id", "alignment"]], on=["uniprot_id"])
    # Drop rows where any of the three is null.
    .dropna(subset=["sequence", "structure", "alignment"])
)

display(final_df.head(2))
print(len(final_df))  # 2610 | 2923 | 2833

Unnamed: 0,uniprot_id,hgvs,mut,interpretation,sequence,structure,alignment
0,A1X283,"[NM_001017995.2:c.1475A>C, NM_001017995.2:c.10...","[K492T, K341E, R356Q, E396K, A431T, T445A, A49...","[Uncertain significance, Uncertain significanc...",MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,HEADER ...,"[>101\n, MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGS..."
1,A2IDD5,"[NM_001031737.2:c.902A>G, NM_001031737.2:c.910...","[K301R, V304L, R308H, D333E, N322K, A325P, R29...","[Uncertain significance, Uncertain significanc...",MEHAATTGPRPGPPSRRVENVVLRAKDWLPGAPGGTAVWATSLEAE...,HEADER ...,"[>101\n, MEHAATTGPRPGPPSRRVENVVLRAKDWLPGAPGGTA..."


2833


In [78]:
for uniprot_id, sequence, structure_blob in tqdm(
    final_df[["uniprot_id", "sequence", "structure"]].values
):
    assert sequence_matches_structure(sequence, structure_blob)

  0%|          | 0/2833 [00:00<?, ?it/s]

In [79]:
for uniprot_id, sequence, alignment in tqdm(
    final_df[["uniprot_id", "sequence", "alignment"]].values
):
    assert sequence == alignment[1].strip()

  0%|          | 0/2833 [00:00<?, ?it/s]

## Write output

In [80]:
# If any `chr` value is a Python int, convert the whole column to strings and then
# restore the originally missing entries to None (astype(str) would otherwise turn
# them into text).
# NOTE(review): presumably needed so the column has one type when written to
# parquet — confirm.
if (combined_mapped_validated_df["chr"].apply(type) == int).any():
    # Record the missing entries before the conversion.
    mask = combined_mapped_validated_df["chr"].isnull()
    combined_mapped_validated_df["chr"] = combined_mapped_validated_df["chr"].astype(str)
    combined_mapped_validated_df.loc[mask, "chr"] = None

In [81]:
final_output_file = NOTEBOOK_DIR.joinpath("processed-input-data.parquet")

pq.write_table(
    pa.Table.from_pandas(combined_mapped_validated_df, preserve_index=False), final_output_file
)

In [82]:
def split_rows(df, fields_to_split=("hgvs", "mut", "interpretation"), max_size=100):
    """Split rows whose list-valued fields are longer than `max_size` into several rows.

    Every row whose list fields hold at most `max_size` items is kept as is. Longer
    rows are replaced by consecutive rows in which each field in `fields_to_split`
    holds one slice of at most `max_size` items; all other fields are repeated
    unchanged. Row order and item order are preserved.

    Args:
        df: DataFrame in which the columns named in `fields_to_split` hold
            sliceable sequences (e.g. lists).
        fields_to_split: Names of the columns to slice. The length of the first
            named column decides whether and how a row is split; the remaining
            columns are assumed to be parallel to it (same length per row).
        max_size: Maximum number of items per output row. Must be at least 1.

    Returns:
        A new DataFrame with a fresh default index and the same columns, in the
        same order, as `df` (also when `df` has no rows).

    Raises:
        ValueError: If `max_size` is smaller than 1.
        KeyError: If a name in `fields_to_split` is not a column of `df`.
    """
    if max_size < 1:
        # A negative step would make `range` empty and silently drop long rows.
        raise ValueError(f"max_size must be at least 1, got {max_size!r}")

    columns = list(df.columns)
    split_positions = []
    for field in fields_to_split:
        if field not in columns:
            raise KeyError(f"Column {field!r} is not present in the DataFrame")
        split_positions.append(columns.index(field))

    results = []
    # Plain tuples (name=None) are addressed by position, which keeps column
    # names intact even when they are not valid Python identifiers.
    for row in tqdm(df.itertuples(index=False, name=None), total=len(df)):
        num_items = len(row[split_positions[0]]) if split_positions else 0
        if num_items <= max_size:
            results.append(row)
            continue
        for start in range(0, num_items, max_size):
            chunk = list(row)
            for position in split_positions:
                chunk[position] = row[position][start : start + max_size]
            results.append(tuple(chunk))
    # Passing the columns explicitly keeps the schema even when there are no rows.
    return pd.DataFrame(results, columns=columns)

In [85]:
# Order proteins from shortest to longest sequence. The helper `seqlen` column
# exists only for the duration of the sort.
final_df = (
    final_df
    .assign(seqlen=lambda df: df["sequence"].str.len())
    .sort_values("seqlen", ascending=True)
    .drop(columns="seqlen")
)

# Break proteins with long mutation lists into several rows.
final_chunked_df = split_rows(final_df)

  0%|          | 0/2833 [00:00<?, ?it/s]

In [86]:
# Chunking must keep every per-mutation list intact: same total number of
# items, and the same items in the same order once the lists are flattened.
for list_field in ("hgvs", "mut"):
    assert final_df[list_field].str.len().sum() == final_chunked_df[list_field].str.len().sum()
    flat_before = np.array(list(more_itertools.flatten(final_df[list_field])))
    flat_after = np.array(list(more_itertools.flatten(final_chunked_df[list_field])))
    assert (flat_before == flat_after).all()

# The per-protein fields must appear in the chunked table in the same order and
# collapse back to the original values once duplicates are removed.
for protein_field in ("uniprot_id", "sequence", "structure"):
    assert (final_df[protein_field].values == final_chunked_df[protein_field].unique()).all()

# Alignments are converted to tuples before `unique()` is called on them.
alignments_before = final_df["alignment"].apply(tuple).values
alignments_after = final_chunked_df["alignment"].apply(tuple).unique()
assert (alignments_before == alignments_after).all()

In [89]:
# Write the chunked per-protein table under generic column names, with a
# row group size of one.
final_output_file = NOTEBOOK_DIR.joinpath("input-data-gby-protein.parquet")

output_column_names = {
    "uniprot_id": "protein_id",
    "hgvs": "mutation_id",
    "mut": "mutation",
    "interpretation": "effect",
}
output_table = pa.Table.from_pandas(
    final_chunked_df.rename(columns=output_column_names),
    preserve_index=False,
)
pq.write_table(output_table, final_output_file, row_group_size=1)

In [90]:
# Re-open the file at `final_output_file` and display how many row groups it holds.
pfile = pq.ParquetFile(final_output_file)

# NOTE(review): the file was written with row_group_size=1, so this is presumably
# expected to equal len(final_chunked_df) — confirm.
pfile.num_row_groups

4182