GET DEPENDENCIES

In [15]:
# Install the third-party packages that the import cell below relies on.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh run may pull different
# releases than the ones this notebook was developed against — consider pinning.
%pip install transformers
%pip install torch
%pip install pandas
%pip install gradio


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;4

In [16]:
import transformers
from transformers import AutoTokenizer
import torch
import pandas as pd
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
import gradio as gr

In [17]:
# Pick the compute backend: CUDA when available, otherwise Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Sequence length handed to the tokenizer as `max_length` for every input text.
MAX_LEN = 256

# Checkpoint identifier used when loading the tokenizer.
model_name = 'roberta-base'

In [18]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row per item.

    Args:
        dataframe: pandas DataFrame with a 'text' column (the raw strings)
            and a 'label' column (per-row targets convertible to a float
            tensor).
        tokenizer: object exposing ``encode_plus`` that returns a mapping
            with 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['text']
        self.targets = dataframe['label']
        self.max_len = max_len

    def __len__(self):
        """Return the number of text rows."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' and a
            float tensor 'targets'.
        """
        # Positional lookup (.iloc): samplers hand out positions 0..len-1,
        # which only coincide with index labels for a default RangeIndex.
        comment_text = str(self.comment_text.iloc[index])
        # split and rejoin sentence to standardize whitespace
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,  # text data in dataframe
            None,  # no second input
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit truncation: without it the tokenizer warns and falls
            # back to an implicit truncation strategy.
            truncation=True,
            # Replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            return_token_type_ids=True
        )

        # `inputs` provides:
        #   input_ids      - token ids that represent the input text
        #   attention_mask - marks which positions are input and which are padding
        #   token_type_ids - segment ids that differentiate segments of text
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # return dictionary with tensors containing ids, mask, token types, and targets
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [19]:
from transformers import RobertaModel
import torch

class RoBERTaClass(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a linear classification head.

    Submodule names (`roberta`, `drop`, `linear`) are part of the saved
    state_dict layout and must stay unchanged for existing checkpoints to load.

    Args:
        pretrained_name: checkpoint passed to ``RobertaModel.from_pretrained``.
        num_labels: number of logits produced by the linear head.
        dropout: dropout probability applied to the pooled encoder output.
    """

    def __init__(self, pretrained_name="roberta-base", num_labels=11, dropout=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_name)
        # Active only in training mode; a no-op after model.eval().
        self.drop = torch.nn.Dropout(dropout)
        # Read the width from the encoder config instead of hard-coding 768,
        # so the head matches whichever checkpoint was loaded.
        self.linear = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the linear head's logits for a batch of encoded inputs."""
        encoder_outputs = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # With return_dict=False the encoder returns a tuple; element 1 is the
        # pooled output. Indexing (rather than 2-tuple unpacking) keeps this
        # working if the encoder returns additional tuple entries.
        pooled = encoder_outputs[1]
        return self.linear(self.drop(pooled))

# Build the classifier and move its parameters to the selected device.
model = RoBERTaClass()
# Left as the cell's final bare expression, so the notebook displays the
# returned module's repr.
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTaClass(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

INFERENCE CODE

In [45]:
# load model and run inference
model = RoBERTaClass().to(device)
# map_location='cpu' lets a checkpoint saved on a GPU load on any machine;
# load_state_dict then copies the weights into the parameters already on `device`.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('enhance_cancer.pth', map_location='cpu'))
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Display names for predicted class indices, keyed by the index as a string
# ('1' through '10', in the order listed).
# NOTE(review): the classifier head in this file emits 11 logits, but only
# indices 1-10 are named here — confirm what index 0 denotes.
# NOTE(review): the sixth entry starts with a space; preserved as-is — confirm
# whether that is intended.
keys = {
    str(position): label
    for position, label in enumerate(
        (
            'Sustaining proliferative signaling (PS)',
            'Evading growth suppressors (GS)',
            'Resisting cell death (CD)',
            'Enabling replicative immortality (RI)',
            'Inducing angiogenesis (A)',
            ' Activating invasion & metastasis (IM)',
            'Genome instability & mutation (GI)',
            'Tumor-promoting inflammation (TPI)',
            'Deregulating cellular energetics (CE)',
            'Avoiding immune destruction (ID)',
        ),
        start=1,
    )
}

def inference(symptom):
    """Classify one piece of text and return the matching hallmark name.

    Args:
        symptom: input text; it is stringified and whitespace-normalized by
            CustomDataset before tokenization.

    Returns:
        str: the display name from `keys` for the highest-scoring class. When
        `keys` has no entry for the predicted index, a placeholder naming the
        raw index is returned instead of raising KeyError.
    """
    # CustomDataset requires a 'label' column; an all-zero placeholder is
    # supplied because the targets are never read below.
    inference_dataset = pd.DataFrame({'text': [symptom], 'label': [[0] * 11]})
    inference_set = CustomDataset(inference_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
    inference_loader = DataLoader(inference_set, batch_size=1, shuffle=False)

    # run model on input data (the loader holds exactly one batch)
    model.eval()
    with torch.no_grad():
        for batch in inference_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            output = model(ids, mask, token_type_ids)

    # process output: softmax is monotonic, so argmax over the raw logits
    # selects the same class without the extra pass.
    predicted_classes = torch.argmax(output, dim=1)
    hallmarks = predicted_classes.tolist()

    # The model emits 11 logits (indices 0-10) while `keys` names only
    # '1'..'10'; fall back to a placeholder rather than raising KeyError.
    hallmarksDesc = ", ".join(
        keys.get(str(lID), f'Unrecognized class index {lID}') for lID in hallmarks
    )

    print(hallmarksDesc)

    return hallmarksDesc

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# input_text = input('Express your symptoms in a sentence:') #ex- 'There was no evidence of immunosuppression.'
# output = inference(input_text)
# print(f'hallmark: {output}')

# Expose `inference` through a text-in / text-out Gradio form and start serving it.
iface = gr.Interface(
    fn=inference,
    inputs='text',
    outputs='text',
    title='Hallmarks of Cancer',
)
iface.launch()

Running on local URL:  http://127.0.0.1:7873

To create a public link, set `share=True` in `launch()`.




Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Genome instability & mutation (GI)




Genome instability & mutation (GI)




Genome instability & mutation (GI)
