In [None]:


# Load config file with static parameters.
# NOTE: relies on `os` and `json` being imported by the notebook's import cell.
try:
    # abspath guards against a bare relative __file__, whose dirname is '' and
    # would otherwise make the joined path resolve from the filesystem root.
    _config_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined inside a Jupyter kernel; resolve relative to the
    # current working directory instead.
    _config_dir = os.getcwd()

with open(os.path.join(_config_dir, '..', '..', 'config.json')) as config_file:
    config = json.load(config_file)

# Model locations read from the config; both keys are required.
biobert_path = config["BIOBERT_PATH"]
long_biobert_path = config["LONG_BIOBERT_PATH"]


# Loaded (tokenizer, model) pairs, keyed by variant and source path, so the
# weights are read once per kernel session instead of once per call.
_BIOBERT_CACHE = {}


def _load_biobert(use_long):
    """Return the (tokenizer, model) pair for the requested variant, loading it on first use."""
    key = ("long", long_biobert_path) if use_long else ("short", biobert_path)
    if key not in _BIOBERT_CACHE:
        if use_long:
            tokenizer = AutoTokenizer.from_pretrained(long_biobert_path + "tokenizer/")
            model = AutoModelForMaskedLM.from_pretrained(
                long_biobert_path + 'model', output_hidden_states=True
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(biobert_path)
            model = AutoModel.from_pretrained(biobert_path)
        _BIOBERT_CACHE[key] = (tokenizer, model)
    return _BIOBERT_CACHE[key]


def get_biobert_embeddings(text, long_input=True):
    """
    Embed a text with one of the two configured models.

    Parameters::
        text: String with input text
        long_input: Boolean indicating whether to use the model stored under
            long_biobert_path (True) or the one stored under biobert_path (False).

    Returns::
        embeddings: Pooled embedding as a numpy array of shape (1, hidden_size).
            For long_input=True this is the mean of the last hidden layer over the
            token axis; otherwise it is the model's pooler output.
        hidden_embeddings: Last hidden layer as a numpy array; the second axis
            indexes tokens and the last axis is hidden_size.
    """
    use_long = bool(long_input)

    # Only the variant that is actually needed gets loaded (and then reused).
    biobert_tokenizer, biobert_model = _load_biobert(use_long)
    tokens_pt = biobert_tokenizer(text, return_tensors="pt")

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = biobert_model(**tokens_pt)

    if use_long:
        last_hidden = outputs.hidden_states[-1]
        hidden_embeddings = last_hidden.detach().numpy()
        # Average over the token axis; same result as pooling with a
        # (token_count, 1) window, but independent of the hidden size.
        embeddings = last_hidden.mean(dim=1).detach().numpy()
    else:
        hidden_embeddings = outputs.last_hidden_state.detach().numpy()
        embeddings = outputs.pooler_output.detach().numpy()

    return embeddings, hidden_embeddings

def create_embeddings(df):
    """
    Embed every entry of the "text" column and attach the vectors to the frame.

    Parameters::
        df: DataFrame with a column named "text"

    Returns::
        merged_df: df without its "text" column, extended by one integer-labelled
            column per embedding dimension; rows keep the index of df.
    """
    # One flattened embedding vector per row, in row order.
    row_vectors = [
        get_biobert_embeddings(sentence)[0].reshape(-1)
        for sentence in df["text"]
    ]

    # Re-use df's index so the column-wise concat lines rows up one to one.
    emb_df = pd.DataFrame(np.array(row_vectors)).set_index(df.index)
    merged_df = pd.concat([df, emb_df], axis=1)

    return merged_df.drop(columns="text", axis=1)

In [1]:
import pandas as pd
from math import isnan
from transformers import AutoTokenizer, AutoModel, logging, LongformerModel, LongformerTokenizer, AutoModelForMaskedLM
import torch
import numpy as np
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sentence-encoded census income rows; the CSV path is relative to the working directory.
census_sentences_csv = 'data/census_income/census_income_sentences.csv'
df = pd.read_csv(census_sentences_csv)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text
0,Census Income Dataset: The person's Age is nor...
1,Census Income Dataset: The person's Age is nor...
2,Census Income Dataset: The person's Age is nor...
3,Census Income Dataset: The person's Age is hig...
4,Census Income Dataset: The person's Age is nor...


In [4]:
# Report, one per line: whether CUDA is available, then whether cuDNN is enabled.
for backend_flag in (torch.cuda.is_available(), torch.backends.cudnn.enabled):
    print(backend_flag)

False
True


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# The device to load the model onto: use the GPU when one is present, otherwise
# stay on the CPU so the cell still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint used by both model and tokenizer.
MISTRAL_CHECKPOINT = "mistralai/Mistral-7B-v0.1"

model = AutoModelForCausalLM.from_pretrained(MISTRAL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MISTRAL_CHECKPOINT)

prompt = "My favourite condiment is"

# Inputs and model must live on the same device before generation.
model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
model.to(device)

# Sampling is enabled, so the continuation differs from run to run.
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [None]:
def mean_pooling(model_output, attention_mask):
    """
    Average token vectors along axis 1, counting only positions the mask keeps.

    Parameters::
        model_output: Indexable whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            last axis of the token embeddings, so zero entries drop a token.

    Returns::
        Tensor of masked averages with axis 1 (the token axis) reduced away.
    """
    hidden = model_output[0]
    # Stretch the mask across the embedding axis so it can be multiplied in.
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * mask).sum(dim=1)
    # Clamp the count so a fully masked row divides by 1e-9 rather than zero.
    kept_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / kept_count


# Put the inputs on the device that holds the model's weights; otherwise the
# forward pass fails with a device mismatch once the model has been moved.
encoded_input = tokenizer("hello", return_tensors='pt').to(model.device)

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    model_output = model(**encoded_input)

# NOTE(review): if `model` is still the causal-LM loaded for text generation,
# element 0 of its output is presumably logits rather than hidden states, so
# this would pool over vocabulary scores -- confirm which model is intended.
mean_pooling(model_output, encoded_input['attention_mask'])