# Variant embeddings

In this notebook, we explore strategies to obtain simple vector embeddings from the variant annotation tabular data.

These articles are interesting:
- https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00544/115239/Transformers-for-Tabular-Data-Representation-A

```python
with open("codebook.tsv", "w") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["index", "column_name"])
    for i, c in enumerate(list(df.columns)):
        writer.writerow([i, c])
``````

In [112]:
import pandas as pd
import csv

In [113]:
df = pd.read_csv("../data/variants/1000_Genomes/10000_subset_snvs_coding_1kGPhg38.tsv", sep="\t")

In [114]:
columns = list(df.columns)

population_frequencies_columns = columns[slice(columns.index("AF"), columns.index("dbNSFP_ExAC_SAS_AF")+1)]
dfreq = pd.DataFrame(df[population_frequencies_columns])

In [115]:
prediction_scores_columns = columns[slice(columns.index("dbNSFP_CADD_phred"), columns.index("dbNSFP_VEST4_score")+1)]

df["dbNSFP_Polyphen2_HDIV_pred"].replace({'B': 0, 'P': 1, 'D': 2}, inplace=True)
df["dbNSFP_SIFT_pred"].replace({'D': 1, 'T': 0}, inplace=True)
df["dbNSFP_MutationAssessor_pred"].replace({'H': 3, 'M': 2, "L": 1, "N": 0}, inplace=True)
df["dbNSFP_MutationTaster_pred"].replace({"A": 2, "D": 1, "N": 0, "P": 0}, inplace=True)
def calculate_value(s):
    values = s.split(',')
    values = [v for v in values if v != '.']
    if not values:
        return float('NaN')
    mean_value = sum([1 if v == 'D' else 0 for v in values]) / len(values)
    return mean_value
df['dbNSFP_PROVEAN_pred'] = df['dbNSFP_PROVEAN_pred'].apply(calculate_value)
dpred = pd.DataFrame(df[prediction_scores_columns])
dpred.replace(".", np.nan, inplace=True)

In [127]:
other_columns = columns[slice(columns.index("dbNSFP_clinvar_id"), columns.index("dbNSFP_Interpro_domain")+1)]
texts = []
for v in df[["dbNSFP_clinvar_clnsig", "dbNSFP_clinvar_trait", "COSV_ID"]].values:
    t = ""
    if v[0] != ".":
        t += "The clinical relevance of the variant is {0}. ".format(v[0])
    else:
        t += "The clinical relevance of the variant is not annotated. "
    if v[1] != ".":
        t += "The traits are {0}. ".format(v[1].replace("_", " "))
    else:
        t += "No traits are known. "
    if v[2] != ".":
        t += "The affected domains of the variant in COSMIC are {0}.".format(v[2])
    else:
        t += "No domains in COSMIC."
    texts += [t]

In [128]:
texts

['The clinical relevance of the variant is not annotated. No traits are known. No domains in COSMIC.',
 'The clinical relevance of the variant is not annotated. No traits are known. No domains in COSMIC.',
 'The clinical relevance of the variant is Benign. The traits are Three M syndrome 2 not specified not provided. The affected domains of the variant in COSMIC are Immunoglobulin-like_domain_Immunoglobulin_subtype_2_Immunoglobulin_subtype.',
 'The clinical relevance of the variant is not annotated. No traits are known. No domains in COSMIC.',
 'The clinical relevance of the variant is not annotated. No traits are known. The affected domains of the variant in COSMIC are Vps16__C-terminal.',
 'The clinical relevance of the variant is not annotated. No traits are known. No domains in COSMIC.',
 'The clinical relevance of the variant is not annotated. No traits are known. The affected domains of the variant in COSMIC are Peptidase_family_A1_domain_Peptidase_family_A1_domain.',
 'The clini

In [122]:
set(df["dbNSFP_clinvar_clnsig"])

{'.',
 'Benign',
 'Benign/Likely_benign',
 'Conflicting_interpretations_of_pathogenicity',
 'Likely_benign',
 'Likely_pathogenic',
 'Pathogenic',
 'Pathogenic/Likely_pathogenic',
 'Uncertain_significance',
 'association',
 'drug_response',
 'drug_response,_risk_factor',
 'not_provided',
 'protective'}

In [46]:
from transformers import TapasForQuestionAnswering, TapasTokenizer
model_name = "google/tapas-large-finetuned-wtq"
model = TapasForQuestionAnswering.from_pretrained(model_name, output_hidden_states=True)
tokenizer = TapasTokenizer.from_pretrained(model_name)


In [48]:
inputs = tokenizer(table=table[table.columns[:10]].head(100), queries=["What is this?"], return_tensors="pt", padding="max_length")

Token indices sequence length is longer than the specified maximum sequence length for this model (3750 > 512). Running this sequence through the model will result in indexing errors.


In [40]:
import torch
with torch.no_grad():
    outputs = model(**inputs)

In [41]:
last_hidden_state = outputs.hidden_states[-1]
embeddings_numpy = last_hidden_state.cpu().numpy()

In [42]:
embeddings_numpy.shape

(1, 784, 1024)

In [50]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np


class BioGPTEmbedder(object):
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        self.model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

    def calculate(self, text_inputs):
        X = np.zeros((len(text_inputs), 1024), dtype=np.float32)
        for i, text in enumerate(text_inputs):
            encoded_input = self.tokenizer(text, return_tensors="pt")
            with torch.no_grad():
                hidden_states = self.model.base_model(**encoded_input).last_hidden_state
            mean_encoding = torch.mean(hidden_states, dim=1)
            mean_encoding_np = mean_encoding.numpy()
            X[i, :] = mean_encoding_np
        return X

embedder = BioGPTEmbedder()

In [55]:
embedder.calculate(["the gene is AURKA"])

array([[-0.9741001 ,  0.13654037,  0.23815162, ..., -0.01901363,
        -0.05405765,  0.06725373]], dtype=float32)

In [58]:
with open()

SAMPLE
CHROM
POS
REF
ALT
ANN[*].GENE
ANN[*].GENEID
ANN[*].FEATURE
ANN[*].FEATUREID
ANN[*].BIOTYPE
ANN[*].EFFECT
ANN[*].IMPACT
ANN[*].RANK
ANN[*].HGVS_C
ANN[*].HGVS_P
ANN[*].CDNA_POS
ANN[*].CDNA_LEN
ANN[*].CDS_POS
ANN[*].CDS_LEN
ANN[*].AA_POS
ANN[*].AA_LEN
ANN[*].DISTANCE
ANN[*].ALLELE
ANN[*].ERRORS
ID
AF
AC
NS
AN
EAS_AF
EUR_AF
AFR_AF
AMR_AF
SAS_AF
dbNSFP_gnomAD_exomes_AC,dbNSFP_gnomAD_exomes_AN
dbNSFP_gnomAD_exomes_AF
dbNSFP_gnomAD_exomes_POPMAX_AC
dbNSFP_gnomAD_exomes_POPMAX_AN
dbNSFP_gnomAD_exomes_POPMAX_AF
dbNSFP_gnomAD_exomes_AFR_AC
dbNSFP_gnomAD_exomes_AFR_AN
dbNSFP_gnomAD_exomes_AFR_AF
dbNSFP_gnomAD_exomes_NFE_AC
dbNSFP_gnomAD_exomes_NFE_AN
dbNSFP_gnomAD_exomes_NFE_AF
dbNSFP_gnomAD_exomes_AMR_AC
dbNSFP_gnomAD_exomes_AMR_AN
dbNSFP_gnomAD_exomes_AMR_AF
dbNSFP_gnomAD_exomes_ASJ_AC
dbNSFP_gnomAD_exomes_ASJ_AN
dbNSFP_gnomAD_exomes_ASJ_AF
dbNSFP_gnomAD_exomes_EAS_AC
dbNSFP_gnomAD_exomes_EAS_AN
dbNSFP_gnomAD_exomes_EAS_AF
dbNSFP_gnomAD_exomes_FIN_AC
dbNSFP_gnomAD_exomes_FIN_AN
dbNSFP_gn