In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (540kB)
[K     |████████████████████████████████| 542kB 1.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 50.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 50.6MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████

In [0]:

import ast
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig
%matplotlib inline

Using TensorFlow backend.


In [0]:
# Report which CUDA devices PyTorch can see.
# Edit > Notebook Settings > Put on a GPU
print("GPU Available: {}".format(torch.cuda.is_available()))
n_gpu = torch.cuda.device_count()
print("Number of GPU Available: {}".format(n_gpu))
if n_gpu > 0:
    # get_device_name(0) raises when there is no CUDA device, so only query it
    # when at least one device was counted.
    print("GPU: {}".format(torch.cuda.get_device_name(0)))
else:
    print("GPU: none detected")

GPU Available: True
Number of GPU Available: 1
GPU: Tesla P100-PCIE-16GB


In [0]:
# Mount Google Drive at /content/drive; the checkpoint and CSV paths used later in
# this notebook are located under that mount point.
# This step is interactive: it asks for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
class XLNetForMultiLabelSequenceClassification(torch.nn.Module):
  """
  XLNet encoder with a single linear layer on top for multi-label classification.

  The encoder's final hidden states are averaged over the sequence dimension and
  projected to `num_labels` logits. Training uses BCEWithLogitsLoss, i.e. every
  label is scored independently; apply a sigmoid to the logits to get per-label
  probabilities.
  """

  def __init__(self, num_labels=2):
    """
    Args:
      num_labels: size of the logit vector produced by the classification head.
    """
    super().__init__()
    self.num_labels = num_labels
    self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
    # Take the head's input width from the loaded encoder's config rather than
    # hard-coding 768, so the head always matches the encoder's hidden size.
    self.classifier = torch.nn.Linear(self.xlnet.config.d_model, num_labels)

    torch.nn.init.xavier_normal_(self.classifier.weight)

  def forward(self, input_ids, token_type_ids=None,
              attention_mask=None, labels=None):
    """
    Args:
      input_ids: token ids, forwarded to the encoder.
      token_type_ids: optional, forwarded to the encoder unchanged.
      attention_mask: optional, forwarded to the encoder unchanged.
      labels: optional targets; reshaped to (-1, num_labels).

    Returns:
      The logits when `labels` is None, otherwise the BCE-with-logits loss.
    """
    encoder_outputs = self.xlnet(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)
    # pool the outputs into a mean vector
    pooled = self.pool_hidden_state(encoder_outputs)
    logits = self.classifier(pooled)

    if labels is None:
      return logits

    # BCEWithLogitsLoss needs floating-point targets; cast so that 0/1 integer
    # label tensors are accepted as well (float labels are left as they are).
    targets = labels.view(-1, self.num_labels).to(dtype=logits.dtype)
    loss_fct = BCEWithLogitsLoss()
    return loss_fct(logits.view(-1, self.num_labels), targets)

  def _set_xlnet_requires_grad(self, trainable):
    """Set `requires_grad` on every encoder parameter."""
    for param in self.xlnet.parameters():
      param.requires_grad = trainable

  def freeze_xlnet_decoder(self):
    """
    Freeze XLNet weight parameters. They will not be updated during training.
    """
    self._set_xlnet_requires_grad(False)

  def unfreeze_xlnet_decoder(self):
    """
    Unfreeze XLNet weight parameters. They will be updated during training.
    """
    self._set_xlnet_requires_grad(True)

  def pool_hidden_state(self, last_hidden_state):
    """
    Pool the output vectors into a single mean vector.

    Args:
      last_hidden_state: the value returned by `self.xlnet(...)`; its first
        element is used as the hidden-state tensor.

    Returns:
      The mean of that tensor over dim 1.

    NOTE(review): the mean is taken over every position, padding included (no
    attention mask is applied here). Presumably the saved checkpoint was
    trained with exactly this pooling, so confirm and retrain before switching
    to a masked mean.
    """
    return torch.mean(last_hidden_state[0], 1)

In [0]:
# Load the saved model
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location="cpu" lets the checkpoint be read on a runtime without a GPU even if it
# was saved from one; generate_predictions_dict moves the model to its target device.
checkpoint = torch.load(
    "/content/drive/My Drive/Vaccine Capstone/Labelled Dataset/Datasets_XLNet/xlnet_vaccine.bin",
    map_location="cpu",
)
model_state_dict = checkpoint['state_dict']
# The number of labels is recovered from the first dimension of the saved classifier weight.
model = XLNetForMultiLabelSequenceClassification(num_labels=model_state_dict["classifier.weight"].size()[0])
model.load_state_dict(model_state_dict)


HBox(children=(IntProgress(value=0, description='Downloading', max=690, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=467042463, style=ProgressStyle(description_…




<All keys matched successfully>

In [0]:
# get example text
# The 'features' and 'masks' columns are parsed from their string form. Use
# ast.literal_eval instead of eval so file contents can only ever be read as Python
# literals and can never execute code.
# NOTE(review): assumes both columns hold plain list literals (as test_df does later in
# this notebook) - confirm against the CSV.
X_train_full = pd.read_csv('/content/drive/My Drive/Vaccine Capstone/Labelled Dataset/Datasets_XLNet/X_train_full_XLNET_March22.csv',
                           converters={'features': ast.literal_eval, 'masks': ast.literal_eval})

In [0]:
# Take the 'processed' string from the row labelled 0 as a sample input.
# This is only an example; the string to classify belongs in this variable.
example_text = X_train_full.at[0, 'processed']
example_text

'early age vaccinate black child sure milkis vaccination ingredient 69.05 something brain cell'

In [0]:
# One-row DataFrame holding the sample string; later cells add the 'features' and
# 'masks' columns that generate_predictions_dict reads.
test_df = pd.DataFrame({'hash_processed': [example_text]})

In [0]:
test_df

Unnamed: 0,hash_processed
0,early age vaccinate black child sure milkis va...


In [0]:
def tokenize_inputs(text_list, tokenizer, num_embeddings=512):
    """
    Convert raw strings into fixed-length rows of token ids.

    For every text: tokenize it, keep at most `num_embeddings - 2` tokens (leaving
    room for the 2 special tokens), map the tokens to ids, and let the tokenizer
    add its special tokens via `build_inputs_with_special_tokens` (in the output
    shown in this notebook each sequence ends with the ids 4, 3). All rows are then
    padded / truncated at the end ("post") to exactly `num_embeddings` ids.

    Returns the array produced by `pad_sequences`, one row per input text.
    """
    token_budget = num_embeddings - 2
    encoded_rows = []
    for text in text_list:
        tokens = tokenizer.tokenize(text)[:token_budget]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        encoded_rows.append(tokenizer.build_inputs_with_special_tokens(ids))
    return pad_sequences(encoded_rows, maxlen=num_embeddings, dtype="long",
                         truncating="post", padding="post")

In [0]:
# Raw text of every row as a NumPy array; this is what tokenize_inputs is given below.
test_list = test_df['hash_processed'].values

In [0]:
# Tokenizer for the same 'xlnet-base-cased' checkpoint the model class loads its encoder from.
# NOTE(review): do_lower_case=True is requested although the checkpoint name says "cased";
# presumably this matches how the training data was tokenized - confirm before changing.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=798011, style=ProgressStyle(description_wid…




In [0]:
# Encode the text as rows of exactly 250 token ids.
# NOTE(review): 250 is presumably the sequence length used when the checkpoint was trained - confirm.
testinput_ids = tokenize_inputs(test_list, tokenizer, num_embeddings=250)

In [0]:
testinput_ids

array([[  319,   679,  2721, 19142,  8549,   710,   863,   512,  4330,
          590, 19507, 16413,  8382,     9,  3739,   359,  2346,  1987,
            4,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [0]:
def create_attn_masks(input_ids):
    """
    Build one attention mask per sequence of token ids.

    A position is marked 1.0 when its id is greater than zero and 0.0 otherwise,
    so positions holding 0 (the value the padded rows in this notebook are filled
    with) are excluded from attention.

    Returns a list with one list of floats per input sequence.
    """
    return [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [0]:
# One mask row per row of testinput_ids (1.0 where the id is > 0, 0.0 elsewhere).
test_attention_masks = create_attn_masks(testinput_ids)

In [0]:
# add input ids and attention masks to the dataframe
# (one list per row; "features" and "masks" are the column names generate_predictions_dict reads)
# Note: this modifies test_df in place.
test_df["features"] = testinput_ids.tolist()
test_df["masks"] = test_attention_masks

In [0]:
test_df #this is the final DF that will be fed into generate_predictions 

Unnamed: 0,hash_processed,features,masks
0,early age vaccinate black child sure milkis va...,"[319, 679, 2721, 19142, 8549, 710, 863, 512, 4...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [0]:
# Label names, one per model output. The position in this list is the index of the
# corresponding value in the model's prediction vector, so the order must not change.
label_cols = [
    'Conspiracy: Distrust of government, organizations, big pharma',                    # 0
    'Fear of Critical side-effects (Autism, Brain Damage, SIDS/Death)',                 # 1
    'Fear of Non-critical side-effects (Rash, Pain, Fever, GI problems, Bump on arm)',  # 2
    'Holistic or alternative medicine',                                                 # 3
    'Logistic Concerns',                                                                # 4
    'Pro-vax',                                                                          # 5
    'Religious Beliefs',                                                                # 6
    'Right to choose',                                                                  # 7
    'Toxic Ingredients, unclear origins of materials/manufacturer',                     # 8
    'Vaccines ineffective/unnecessary',                                                 # 9
]

num_labels = len(label_cols)

In [0]:
# write function that will spit out the labels given the model

def generate_predictions_dict(model, df, num_labels, device="cpu", batch_size=32):
  """
  Run `model` over `df` and return the rounded predictions of the FIRST row.

  Every row of `df` is sent through the model in batches of `batch_size`, but
  only row 0 is reported in the returned dictionary.

  Args:
    model: callable as `model(input_ids=..., attention_mask=...)` returning a
      logits tensor with `num_labels` columns; it is moved to `device` and put
      in eval mode (both persist after the call).
    df: DataFrame with a "features" column (list of token ids per row) and a
      "masks" column (list of mask values per row).
    num_labels: number of model outputs per row. The output mapping below reads
      indexes 0-9.
    device: device the model and the input tensors are moved to.
    batch_size: number of rows per forward pass.

  Returns:
    dict mapping each output field name to 0.0 or 1.0 (sigmoid probability
    rounded with np.round), plus 'Hesitancy_Classification__c', which is
    always the integer 0.

  Raises:
    ValueError: if `df` has no rows (there is no first row to report).
  """
  if df.shape[0] == 0:
    raise ValueError("generate_predictions_dict needs a DataFrame with at least one row")

  num_iter = math.ceil(df.shape[0] / batch_size)

  model.to(device)
  model.eval()

  # Collect per-batch probabilities and stack once at the end instead of
  # re-copying the growing array on every iteration.
  batch_probs = []
  with torch.no_grad():
    for i in range(num_iter):
      df_subset = df.iloc[i*batch_size:(i+1)*batch_size, :]
      X = torch.tensor(df_subset["features"].values.tolist()).to(device)
      masks = torch.tensor(df_subset["masks"].values.tolist(), dtype=torch.long).to(device)
      logits = model(input_ids=X, attention_mask=masks)
      batch_probs.append(logits.sigmoid().detach().cpu().numpy())

  # The empty (0, num_labels) float64 seed keeps the result float64 and makes
  # vstack fail loudly if the model does not return `num_labels` columns.
  pred_probs = np.vstack([np.empty((0, num_labels))] + batch_probs)
  first_row = np.round(pred_probs)[0]

  # (output field, index into the label order used when training the model).
  # NOTE(review): 'Fear_of_Toxic_Ingredients_c' and 'Vaccines_are_a_Conspiracy' do
  # not carry the '__c' suffix the other keys use - confirm against the consumer
  # of this dict before changing them.
  field_to_label_index = (
      ('Fear_of_Critical_Side_Effects__c', 1),            # Fear of Critical side-effects
      ('Fear_of_Delivery_Method__c', 4),                  # Logistic Concerns
      ('Fear_of_Non_Critical_Side_Effects__c', 2),        # Fear of Non-critical side-effects
      ('Fear_of_Toxic_Ingredients_c', 8),                 # Toxic Ingredients
      ('Holistic_or_Alternative_Medicine__c', 3),         # Holistic or alternative medicine
      ('Religious_Beliefs_Preclude_Vaccinations__c', 6),  # Religious Beliefs
      ('Right_to_Choose__c', 7),                          # Right to choose
      ('Vaccines_are_a_Conspiracy', 0),                   # Conspiracy
      ('Vaccines_are_Ineffective_or_Unnecessary__c', 9),  # Vaccines ineffective/unnecessary
      ('Patient_is_Pro_Vaccination__c', 5),               # Pro-vax
  )
  # Plain Python floats print and serialise the same way on every numpy version.
  output = {field: float(first_row[idx]) for field, idx in field_to_label_index}
  output['Hesitancy_Classification__c'] = 0

  return output


In [0]:
# Run on the GPU when one is available; otherwise fall back to the CPU so this cell
# still works on a runtime without CUDA.
prediction = generate_predictions_dict(
    model, test_df, num_labels,
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_size=1,
)

In [0]:
prediction

{'Fear_of_Critical_Side_Effects__c': 1.0,
 'Fear_of_Delivery_Method__c': 0.0,
 'Fear_of_Non_Critical_Side_Effects__c': 0.0,
 'Fear_of_Toxic_Ingredients_c': 0.0,
 'Hesitancy_Classification__c': 0,
 'Holistic_or_Alternative_Medicine__c': 0.0,
 'Patient_is_Pro_Vaccination__c': 0.0,
 'Religious_Beliefs_Preclude_Vaccinations__c': 0.0,
 'Right_to_Choose__c': 0.0,
 'Vaccines_are_Ineffective_or_Unnecessary__c': 0.0,
 'Vaccines_are_a_Conspiracy': 0.0}