# Variant embeddings

In this notebook, we explore strategies for deriving simple vector embeddings from tabular variant annotation data.

Relevant reading:
- https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00544/115239/Transformers-for-Tabular-Data-Representation-A

```python
import csv

# Write a codebook that maps each column's positional index to its name.
# newline="" hands line-ending control to csv.writer, as the csv module
# requires; without it, extra blank rows can appear on some platforms.
with open("codebook.tsv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["index", "column_name"])
    # Each (index, name) pair produced by enumerate becomes one row.
    writer.writerows(enumerate(df.columns))
```

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the tab-separated variant annotation table into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../data/variants/1000_Genomes/10000_subset_snvs_coding_1kGPhg38.tsv",
    sep="\t",
)

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from tqdm import tqdm


class BioGPTEmbedder(object):
    """Embed texts as the mean of a causal language model's last hidden states.

    The tokenizer and the model are loaded once, at construction time.
    """

    def __init__(self, model_name="microsoft/biogpt"):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
                Defaults to the BioGPT checkpoint.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def calculate(self, text_inputs):
        """Return one embedding row per input text.

        Args:
            text_inputs: Sized sequence of strings (``len()`` is taken of it).

        Returns:
            ``np.float32`` array of shape ``(len(text_inputs), hidden_size)``,
            where ``hidden_size`` is read from the loaded model's config.
            Row ``i`` is the mean over tokens of the base model's last hidden
            state for ``text_inputs[i]``.
        """
        # Take the embedding width from the model instead of hard-coding 1024,
        # so the output buffer always matches the checkpoint that was loaded.
        embedding_dim = self.model.config.hidden_size
        X = np.zeros((len(text_inputs), embedding_dim), dtype=np.float32)
        # Wrap the sequence itself (not the enumerate object) so tqdm can
        # read its length and display a total.
        for i, text in enumerate(tqdm(text_inputs)):
            encoded_input = self.tokenizer(text, return_tensors="pt")
            with torch.no_grad():
                hidden_states = self.model.base_model(**encoded_input).last_hidden_state
                # Average over dim=1 (the token axis for a single encoded text).
                mean_encoding = torch.mean(hidden_states, dim=1)
            X[i, :] = mean_encoding.numpy()
        return X


# Build the embedder once up front: the constructor loads the tokenizer and the model.
embedder = BioGPTEmbedder()

  from .autonotebook import tqdm as notebook_tqdm


## Basic information and annotations

In [5]:
# Flat lookup from biotype identifier to a one-sentence description.
# Entries are grouped by category (comment headers) in their original order.
gene_biotypes = {
    # Protein-coding related
    "protein_coding": "Genes that code for proteins.",
    "nonsense_mediated_decay": "Genes with transcripts that are subject to Nonsense-Mediated Decay.",
    "non_stop_decay": "Transcripts that have an open reading frame but lack a stop codon.",
    "ambiguous_orf": "Transcripts with an uncertain coding potential.",
    "disrupted_domain": "Transcripts that contain a frameshift within their coding sequence.",
    # Non-coding RNA genes
    "lncRNA": "Long non-coding RNA genes.",
    "miRNA": "MicroRNA genes, which produce small RNAs involved in gene silencing.",
    "snRNA": "Small nuclear RNA genes, usually involved in splicing.",
    "snoRNA": "Small nucleolar RNA genes, mainly involved in rRNA modification.",
    "rRNA": "Ribosomal RNA genes.",
    "tRNA": "Transfer RNA genes.",
    "pseudogene": "Non-functional sequences that resemble functional genes.",
    "Mt_tRNA": "Mitochondrial tRNA genes.",
    "Mt_rRNA": "Mitochondrial rRNA genes.",
    # Immunoglobulin/T-cell receptor genes
    "IG_C_gene": "Immunoglobulin Constant gene.",
    "IG_D_gene": "Immunoglobulin Diversity gene.",
    "IG_J_gene": "Immunoglobulin Joining gene.",
    "IG_V_gene": "Immunoglobulin Variable gene.",
    "TR_C_gene": "T-cell receptor Constant gene.",
    "TR_D_gene": "T-cell receptor Diversity gene.",
    "TR_J_gene": "T-cell receptor Joining gene.",
    "TR_V_gene": "T-cell receptor Variable gene.",
    # Other types
    "misc_RNA": "Miscellaneous RNA genes not classified elsewhere.",
    "scaRNA": "Small Cajal body-specific RNA genes.",
    "vaultRNA": "Vault RNA genes.",
    "bidirectional_promoter_lncRNA": "Non-coding RNA genes that originate from bidirectional promoters.",
    "retained_intron": "Transcripts that retain an intron.",
    # Pseudogenes
    "processed_pseudogene": "Pseudogenes derived from retrotransposition.",
    "unprocessed_pseudogene": "Direct duplications of genes.",
    "polymorphic_pseudogene": "Pseudogenes with some functional alleles in the population.",
    "transcribed_processed_pseudogene": "Transcribed processed pseudogenes.",
    "transcribed_unprocessed_pseudogene": "Transcribed unprocessed pseudogenes.",
    "transcribed_unitary_pseudogene": "Transcribed pseudogenes with no functional counterparts.",
    "unitary_pseudogene": "Pseudogenes with no functional counterparts in the genome.",
    # Others
    "TEC": "To be Experimentally Confirmed.",
    "macro_lncRNA": "Very long non-coding RNA genes.",
}

columns = list(df.columns)

# Basic annotation block: every column up to and including "ANN[*].ERRORS".
basic_columns = columns[: columns.index("ANN[*].ERRORS") + 1]

# Resolve each field's position once, instead of calling list.index for
# every field of every row inside the loop below.
field_position = {
    name: basic_columns.index(name)
    for name in (
        "CHROM",
        "REF",
        "ALT",
        "ANN[*].GENE",
        "ANN[*].FEATURE",
        "ANN[*].BIOTYPE",
        "ANN[*].EFFECT",
        "ANN[*].IMPACT",
        "ANN[*].RANK",
        "ANN[*].HGVS_P",
    )
}

# One natural-language description per variant row.
texts = []

for r in df[basic_columns].values:
    parts = []
    c = r[field_position["CHROM"]]
    # "." is treated as the missing-value marker.
    if c == ".":
        parts.append("Variant with no known chromosome. ")
    else:
        parts.append("Variant on chromosome {0}. ".format(c))
    parts.append(
        "Reference allele is {0} and variant allele is {1}. ".format(
            r[field_position["REF"]], r[field_position["ALT"]]
        )
    )
    parts.append("Annotated gene is {0}. ".format(r[field_position["ANN[*].GENE"]]))
    feat = r[field_position["ANN[*].FEATURE"]]
    parts.append("It is reported on a {0}. ".format(feat))
    biot = r[field_position["ANN[*].BIOTYPE"]]
    # Only biotypes with a known description contribute a sentence; the
    # description's own trailing period is dropped before re-punctuating.
    if biot in gene_biotypes:
        parts.append("The biotype is {0}. ".format(gene_biotypes[biot].rstrip(".")))
    eff = r[field_position["ANN[*].EFFECT"]].replace("_", " ")
    imp = r[field_position["ANN[*].IMPACT"]].lower()
    rnk = r[field_position["ANN[*].RANK"]]
    parts.append(
        "The effect is {0} with a {1} impact ranked {2}. ".format(eff, imp, rnk)
    )
    hgvs = r[field_position["ANN[*].HGVS_P"]]
    if hgvs != ".":
        parts.append("The point mutation is {0}. ".format(hgvs))
    # Every sentence ends with a separator space; strip the final one.
    texts.append("".join(parts).rstrip(" "))


# NOTE(review): only the first 10 descriptions are embedded, so X_basic has
# 10 rows -- confirm this subset is intentional before combining it with
# per-variant matrices built from the full table.
X_basic = embedder.calculate(texts[:10])

10it [00:01,  7.68it/s]


## Frequency

In [8]:
# Population allele-frequency block: the contiguous run of columns from
# "AF" through "dbNSFP_ExAC_SAS_AF", inclusive.
population_frequencies_columns = columns[
    columns.index("AF") : columns.index("dbNSFP_ExAC_SAS_AF") + 1
]

# Independent frame holding just those columns.
dfreq = df.loc[:, population_frequencies_columns].copy()

# Plain array view of the frequency table for downstream use.
X_freq = np.array(dfreq)

## Predictions

In [9]:
# Prediction-score block: the contiguous run of columns from
# "dbNSFP_CADD_phred" through "dbNSFP_VEST4_score", inclusive.
prediction_scores_columns = columns[
    columns.index("dbNSFP_CADD_phred") : columns.index("dbNSFP_VEST4_score") + 1
]

# Integer codes for the single-letter categorical predictor calls.
# Cells that do not exactly match a key are left untouched.
prediction_codes = {
    "dbNSFP_Polyphen2_HDIV_pred": {"B": 0, "P": 1, "D": 2},
    "dbNSFP_SIFT_pred": {"D": 1, "T": 0},
    "dbNSFP_MutationAssessor_pred": {"H": 3, "M": 2, "L": 1, "N": 0},
    "dbNSFP_MutationTaster_pred": {"A": 2, "D": 1, "N": 0, "P": 0},
}

for column_name, codes in prediction_codes.items():
    # Assign the result back to the column. Calling
    # df[col].replace(..., inplace=True) mutates a temporary Series, which
    # does not update df under pandas Copy-on-Write.
    df[column_name] = df[column_name].replace(codes)


def calculate_value(s):
    """Return the fraction of "D" calls in a comma-separated prediction string.

    Args:
        s: Comma-separated calls (e.g. ``"D,N,."``), where ``"."`` marks a
            missing call and is ignored. Non-string values (NaN for an empty
            cell, or a number when the column was already converted) are
            returned unchanged, so the function can be re-applied safely.

    Returns:
        The share of non-missing calls equal to ``"D"`` as a float, or NaN
        when every call is missing.
    """
    # Already numeric or missing: nothing to parse.
    if not isinstance(s, str):
        return s
    values = [v for v in s.split(",") if v != "."]
    if not values:
        return float("NaN")
    return sum(v == "D" for v in values) / len(values)


# Collapse each PROVEAN prediction cell to a single number via calculate_value.
df["dbNSFP_PROVEAN_pred"] = df["dbNSFP_PROVEAN_pred"].apply(calculate_value)

# Take the prediction-score columns and turn the "." placeholder into NaN.
dpred = df[prediction_scores_columns].replace(".", np.nan)

# Plain array of the prediction table for downstream use.
X_pred = np.array(dpred)

## Other columns

In [10]:
# Remaining annotation block: the contiguous run of columns from
# "dbNSFP_clinvar_id" through "dbNSFP_Interpro_domain", inclusive.
other_columns = columns[
    columns.index("dbNSFP_clinvar_id") : columns.index("dbNSFP_Interpro_domain") + 1
]

# One three-sentence description per variant; "." marks a missing value.
texts = []
clinical_fields = ["dbNSFP_clinvar_clnsig", "dbNSFP_clinvar_trait", "COSV_ID"]
for clnsig, trait, cosv in df[clinical_fields].values:
    if clnsig == ".":
        significance = "The clinical relevance of the variant is not annotated. "
    else:
        significance = "The clinical relevance of the variant is {0}. ".format(clnsig)

    if trait == ".":
        traits = "No traits are known. "
    else:
        # Underscores in the trait text are shown as spaces.
        traits = "The traits are {0}. ".format(trait.replace("_", " "))

    if cosv == ".":
        domains = "No domains in COSMIC."
    else:
        domains = "The affected domains of the variant in COSMIC are {0}.".format(cosv)

    texts.append(significance + traits + domains)

# Embed the first 10 descriptions only.
X_other = embedder.calculate(texts[:10])

0it [00:00, ?it/s]

10it [00:01,  8.06it/s]


In [14]:
# Inspect every column from position 10 of `columns` onward.
df[columns[10:]]

Unnamed: 0,ANN[*].EFFECT,ANN[*].IMPACT,ANN[*].RANK,ANN[*].HGVS_C,ANN[*].HGVS_P,ANN[*].CDNA_POS,ANN[*].CDNA_LEN,ANN[*].CDS_POS,ANN[*].CDS_LEN,ANN[*].AA_POS,...,dbNSFP_VEST4_score,dbNSFP_clinvar_id,dbNSFP_clinvar_clnsig,dbNSFP_clinvar_trait,dbNSFP_clinvar_review,dbNSFP_clinvar_MedGen_id,dbNSFP_clinvar_OMIM_id,dbNSFP_clinvar_Orphanet_id,COSV_ID,dbNSFP_Interpro_domain
0,missense_variant,MODERATE,1,c.119T>C,p.Ile40Thr,367,3144,119,1230,40,...,.,.,.,.,.,.,.,.,.,
1,missense_variant,MODERATE,2,c.124C>A,p.Gln42Lys,339,5548,124,231,42,...,.,.,.,.,.,.,.,.,.,
2,missense_variant,MODERATE,10,c.3283G>A,p.Ala1095Thr,3590,6105,3283,5691,1095,...,.,282995,Benign,Three_M_syndrome_2_not_specified_not_provided,"criteria_provided,_multiple_submitters,_no_con...",C2752041_CN169374_CN517202,612921,.,Immunoglobulin-like_domain_Immunoglobulin_subt...,
3,missense_variant,MODERATE,6,c.581G>A,p.Arg194Lys,691,2513,581,795,194,...,.,.,.,.,.,.,.,.,.,
4,missense_variant,MODERATE,22,c.2182T>C,p.Trp728Arg,2212,2708,2182,2520,728,...,.,.,.,.,.,.,.,.,Vps16__C-terminal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,missense_variant,MODERATE,2,c.185A>G,p.Asn62Ser,220,8432,185,2481,62,...,.,.,.,.,.,.,.,.,WD40-repeat-containing_domain,
9996,missense_variant,MODERATE,16,c.1742C>T,p.Pro581Leu,1772,2494,1742,2271,581,...,.,41635,Likely_benign,Lynch_syndrome_Lynch_syndrome_II_Hereditary_ca...,reviewed_by_expert_panel,C4552100_C1333991_C0027672_C0009405_CN169374_C...,PS120435_609310,ORPHA144_ORPHA140162,DNA_mismatch_repair_protein_Mlh1__C-terminal,
9997,missense_variant,MODERATE,2,c.197T>C,p.Ile66Thr,328,2067,197,1260,66,...,.,.,.,.,.,.,.,.,Immunoglobulin_V-set_domain_Immunoglobulin_sub...,
9998,missense_variant,MODERATE,8,c.1310C>G,p.Ala437Gly,1556,2294,1310,1419,437,...,.,.,.,.,.,.,.,.,.,


In [16]:
# Inspect the annotated gene column.
df["ANN[*].GENE"]

0         POU4F2
1       C18orf32
2          OBSL1
3         COPS7B
4          VPS16
          ...   
9995        ELP2
9996        MLH1
9997        PSG7
9998      MBOAT7
9999      RNF213
Name: ANN[*].GENE, Length: 10000, dtype: object