# Fitting BERT Classifier to Twitter MBTI

In [1]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve

In [2]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 4 GPU(s) available.
We will use the GPU: TITAN X (Pascal)


In [4]:
def expand_frame(df, length):
    ndf = pd.DataFrame()

    for i in range(1,30):
        adf = df.copy(deep= True)
        adf['text'] = adf['text'].apply(lambda x: x[0 + length*5*i : length * 5 * (i+1)])
        ndf = ndf.append(adf)
        ndf.reset_index(drop=True, inplace= True)
    return ndf



# Loading the Twitter personality dataset:

# OPTION 1 : Load raw without tokenized:

In [41]:
ofile = open('Data/personality_likes_large.csv', encoding = 'cp1252', mode='r' )
raw_df = pd.read_csv(ofile ,index_col=0)
ofile.close()

In [160]:
wifile = open('train_tokenized_large_exp.csv', mode = 'w+')
wtfile = open('test_tokenized_large_exp.csv', mode = 'w+')

wifile.truncate(0)
wtfile.truncate(0)

0

In [42]:
train_df, test_df = train_test_split(raw_df, stratify = raw_df['type'], random_state= 1729, test_size= 0.12)

In [None]:
train_df = expand_frame(train_df, 256)

train_df.dropna(inplace = True)
test_df.to_csv(wtfile)

wtfile.close()


In [164]:
# Create sentence and label lists
sentences = train_df.text.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]

'[CLS] 7 @brad_polumbo @mattyglesias Not just can but can\'t create money out of anything else@MyPillowUSA what’s good?@leftwinggal Nobody is marginalizing Dutch-Americans what the fuckhttps://t.co/OQTASiMt5u@AndrewYang https://t.co/HfvynOpeHu@AndrewYang https://t.co/suSZXLoK2QSo basically, only rich while men can try to make a $, the rest of you are not worthy. OK.  https://t.co/CrQ6q9i40YWho’s this Fucking Guy... $GME $AMC to the  MOON  https://t.co/KUmZPWdO9l@reddittrading I have a romantic relationship with my $AMC stock.Baptist leaders: "Only men can be in leadership positions, so they can use their strength to protect women."\n\nAlso Baptist leaders: "I\'m sorry, I can\'t do anything substantial to help sexual abuse victims because other men might say mean things about me."\n\nA plague of cowards.Democratic leadership laying down the gauntlet on Marjorie Taylor Greene: Steny Hoyer is expected to tell Kevin McCarthy he has 72 hours to strip Marjorie Taylor Greene of her committee 

In [165]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

train_df['tokenized_texts'] = tokenized_texts

Tokenize the first sentence:
['[CLS]', '@', 'mig', '##non', '##10', '##44', '@', 'mig', '##non', '##10', '##44', '!', '!', '@', 'mig', '##non', '##10', '##44', '@', 'mig', '##non', '##10', '##44', '?', '@', 'mig', '##non', '##10', '##44', '@', 'mig', '##non', '##10', '##44', '.', '.', '.', '.', '@', 'mig', '##non', '##10', '##44', '@', 'mig', '##non', '##10', '##44', '~', '~', '~', '@', 'mig', '##non', '##10', '##44', '.', '.', '[SEP]']


In [167]:
train_df.to_csv(wifile)

In [168]:
wifile.close()
ofile = open('train_tokenized_large_exp.csv',  mode='r' )
edf = pd.read_csv(ofile, index_col = 0)

Unnamed: 0,liked_by,text,type,extravert,intuitive,thinking,judging,NT,SF,NF,ST,NJ,NP,SJ,SP,tokenized_texts
0,1281267858639486976,@mignon1044 @mignon1044 !!@mignon1044 @migno...,ISTJ,0,0,1,1,0,0,0,1,0,0,1,0,"['[CLS]', '@', 'mig', '##non', '##10', '##44',..."
1,17724827,an sebegitu menyebabkan anda cuba mengganggu d...,INFJ,0,1,0,1,0,0,1,0,1,0,0,0,"['[CLS]', 'an', 'se', '##be', '##git', '##u', ..."
2,84772941,that I can use strong language.the news: stay...,ENFP,1,1,0,0,0,0,1,0,0,1,0,0,"['[CLS]', 'that', 'i', 'can', 'use', 'strong',..."
3,3416461845,s://t.co/JZXYOUsCqoSelena Gomez's Makeup Artis...,ESFP,1,0,0,0,0,1,0,0,0,0,0,1,"['[CLS]', 's', ':', '/', '/', 't', '.', 'co', ..."
4,3075434919,"de papai, queria eu ser, único erro dele foi r...",ESFP,1,0,0,0,0,1,0,0,0,0,0,1,"['[CLS]', 'de', 'papa', '##i', ',', 'que', '##..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106802,2356621515,,ESFP,1,0,0,0,0,1,0,0,0,0,0,1,"['[CLS]', '[SEP]']"
106803,706889633670557697,,INFJ,0,1,0,1,0,0,1,0,1,0,0,0,"['[CLS]', '[SEP]']"
106804,91546751,@LiviaBellona @TheKimClub @PharmDame @PeterFMa...,ENFJ,1,1,0,1,0,0,1,0,1,0,0,0,"['[CLS]', '@', 'liv', '##ia', '##bell', '##ona..."
106805,24946566,questioned my non-linear liberal arts path. ht...,INTJ,0,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'questioned', 'my', 'non', '-', 'lin..."


# OPTION 2: Load Dataframe from disk

In [56]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [57]:
test_file = open('test_tokenized_large_exp.csv')
ofile = open('train_tokenized_large_exp.csv')

edf = pd.read_csv(ofile, index_col = 0)

edf.dropna(inplace= True)
#edf, test_df = train_test_split(edf, random_state=2020, test_size=0.10)
test_df = pd.read_csv(test_file, index_col=0)

In [8]:
test_df = test_df.reset_index(drop = True)

In [59]:
def fourType(x):
    if ((x=='INFJ') | (x=='INFP') | (x=='ENFJ') | (x=='ENFP')):
        return 1
    elif ((x=='INTJ') | (x=='INTP') | (x=='ENTJ') | (x=='ENTP')):
        return 2
    elif ((x=='ISFJ') | (x=='ISTJ') | (x=='ESFJ') | (x=='ESTJ')):
        return 3
    else:
        return 0

In [60]:
edf['role'] = edf['type'].map(fourType)
test_df['role'] = test_df['type'].map(fourType)

In [61]:
df = edf[edf['role']==1].sample(10808, random_state = 34)
df = df.append(edf[edf['role']==2].sample(10808, random_state = 35))
df = df.append(edf[edf['role']==3].sample(10808, random_state = 35))
df = df.append(edf[edf['role']==0].sample(10808, random_state = 35))

Unnamed: 0,liked_by,text,type,extravert,intuitive,thinking,judging,NT,SF,NF,ST,NJ,NP,SJ,SP,tokenized_texts,role
105039,15620292,rUryJJaqPnokay I know you guys like TTRPGs. h...,INFP,0,1,0,0,0,0,1,0,0,1,0,0,"['[CLS]', 'ru', '##ry', '##j', '##ja', '##q', ...",1
52013,3403218160,CiBSdFW2vSreal pokemon merchandise https://t.c...,INFP,0,1,0,0,0,0,1,0,0,1,0,0,"['[CLS]', 'ci', '##bs', '##df', '##w', '##2', ...",1
57263,1244901512700030976,it out for Free (link in bio)\n#brunette #curl...,INFJ,0,1,0,1,0,0,1,0,1,0,0,0,"['[CLS]', 'it', 'out', 'for', 'free', '(', 'li...",1
71092,26871926,l me anh(at)https://t.co/Leb3wLQo3x.We just wa...,ENFJ,1,1,0,1,0,0,1,0,1,0,0,0,"['[CLS]', 'l', 'me', 'an', '##h', '(', 'at', '...",1
96461,6046132,ng it. Anyway the commercially-successful-but-...,INFP,0,1,0,0,0,0,1,0,0,1,0,0,"['[CLS]', 'ng', 'it', '.', 'anyway', 'the', 'c...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22249,172118801,MUNICADO https://t.co/QTXWaFw6Oyhttps://t.co/A...,ESFP,1,0,0,0,0,1,0,0,0,0,0,1,"['[CLS]', 'mu', '##nica', '##do', 'https', ':'...",0
58010,491669361,me ambassador for @worldvisioncan. Now we know...,ESFP,1,0,0,0,0,1,0,0,0,0,0,1,"['[CLS]', 'me', 'ambassador', 'for', '@', 'wor...",0
78902,26541394,the developers specifically had CRT monitors i...,ISTP,0,0,1,0,0,0,0,1,0,0,0,1,"['[CLS]', 'the', 'developers', 'specifically',...",0
9532,790271395855003649,ed before my brains melt. #jiroukyouka #jiro ...,ISFP,0,0,0,0,0,1,0,0,0,0,0,1,"['[CLS]', 'ed', 'before', 'my', 'brains', 'mel...",0


In [62]:
from ast import literal_eval

df = df.sample(frac=1).reset_index(drop=True)
tokenized_texts = df['tokenized_texts'].map(literal_eval)

test_df = test_df.reset_index(drop = True)

In [63]:
tokenized_texts

0        [[CLS], er, ##y, despite, l, ##w, ##j, ', s, w...
1        [[CLS], ph, #, gu, ##hit, ##pina, ##s, https, ...
2        [[CLS], h, eli, ##mina, ##das, da, pro, ##va, ...
3        [[CLS], p, ;, se, ##m, o, bo, ##a, no, ##ite, ...
4        [[CLS], for, an, interview, with, @, maggie, _...
                               ...                        
43227    [[CLS], https, :, /, /, t, ., co, /, ct, ##ek,...
43228    [[CLS], yang, ku, ##rang, pen, ##ting, ., http...
43229    [[CLS], tis, i, ’, ve, never, had, a, pizza, w...
43230    [[CLS], r, lists, !, !, !, !, thank, you, all,...
43231    [[CLS], s, :, /, /, t, ., co, /, f, ##s, ##x, ...
Name: tokenized_texts, Length: 43232, dtype: object

In [76]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 256

In [77]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.preprocessing.sequence import pad_sequences

In [78]:
# Pad our input tokens
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [79]:
input_ids

array([[  101,  9413,  2100, ...,  2860,  3501,  5176],
       [  101,  6887,  1001, ..., 16770,  1024,  1013],
       [  101,  1044, 12005, ...,  4487,  1012,  1001],
       ...,
       [  101, 22320,  1045, ...,  1996, 11675,  1012],
       [  101,  1054,  7201, ...,  2000,  2079,  2007],
       [  101,  1055,  1024, ..., 18351,  5622,  2912]])

In [80]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [81]:
# Use train_test_split to split our data into train and validation sets for training

labels = df.role.values
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2020, test_size=0.01)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2020, test_size=0.01)

# BERT

In [83]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

  train_inputs = torch.tensor(train_inputs)
  validation_inputs = torch.tensor(validation_inputs)
  train_labels = torch.tensor(train_labels)
  validation_labels = torch.tensor(validation_labels)
  train_masks = torch.tensor(train_masks)
  validation_masks = torch.tensor(validation_masks)


In [85]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 8

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [86]:
from torch import nn
from transformers import XLNetTokenizer, XLNetForSequenceClassification


from torch import nn
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from torch.nn.parallel import DistributedDataParallel
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model, device_ids=[0, 1])
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [87]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]


In [88]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


In [89]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [90]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [91]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [92]:
import random
torch.cuda.empty_cache()
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 44

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        loss = loss.mean()
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of  5,350.    Elapsed: 0:00:14.
  Batch    80  of  5,350.    Elapsed: 0:00:26.
  Batch   120  of  5,350.    Elapsed: 0:00:38.
  Batch   160  of  5,350.    Elapsed: 0:00:52.
  Batch   200  of  5,350.    Elapsed: 0:01:04.
  Batch   240  of  5,350.    Elapsed: 0:01:16.
  Batch   280  of  5,350.    Elapsed: 0:01:28.
  Batch   320  of  5,350.    Elapsed: 0:01:40.
  Batch   360  of  5,350.    Elapsed: 0:01:52.
  Batch   400  of  5,350.    Elapsed: 0:02:04.
  Batch   440  of  5,350.    Elapsed: 0:02:16.
  Batch   480  of  5,350.    Elapsed: 0:02:29.
  Batch   520  of  5,350.    Elapsed: 0:02:41.
  Batch   560  of  5,350.    Elapsed: 0:02:53.
  Batch   600  of  5,350.    Elapsed: 0:03:07.
  Batch   640  of  5,350.    Elapsed: 0:03:19.
  Batch   680  of  5,350.    Elapsed: 0:03:31.
  Batch   720  of  5,350.    Elapsed: 0:03:43.
  Batch   760  of  5,350.    Elapsed: 0:03:55.
  Batch   800  of  5,350.    Elapsed: 0:04:07.
  Batch   840  of  5,350.    Elapsed: 0:04:21.




# Testing

## Helper function for predicting whether label matches the person's prediction, for one person:

In [93]:
import statistics

def predict_person(text, label, length):
  #  print(len(text))
    textLen = len(text)
    sentences = [text[0 + i * int(textLen/30) : (i + 1) * int(textLen/30)] for i in range(0, 30)]
  #  print(len(sentences))
    labels = [label] * len(sentences)
  #  print(len(labels))
   # print(len(sentences))
    input_ids = []
    for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )

        input_ids.append(encoded_sent)

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)
    # Set the batch size.
    batch_size = 32

    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  #  print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions , true_labels = [], []

    # Predict
    for batch in prediction_dataloader:
    # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
        with torch.no_grad():
           # Forward pass, calculate logit predictions
           outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    #for i in range(len(true_labels)):

    # The predictions for this batch are a 2-column ndarray (one column for "0"
    # and one column for "1"). Pick the label with the highest value and turn this
    # in to a list of 0s and 1s.
    correct = 0
    num = 0

    final_predict_list = []

    for i in range(len(true_labels)):

        pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
        [final_predict_list.append(i) for i in pred_labels_i]
       # if accuracy_score(pred_labels_i, [0] * len(pred_labels_i)) < 0.5 :
        #    correct += 1
       # num+=1
  #  print(final_predict_list)

    if (statistics.mode(final_predict_list)==label):
        return 1

    else:
        return 0






In [94]:
testing_df = test_df[test_df['role']==1].sample(71)
testing_df = testing_df.append(test_df[test_df['role']==2].sample(71))
testing_df = testing_df.append(test_df[test_df['role']==3].sample(71))
testing_df = testing_df.append(test_df[test_df['role']==0].sample(71))

testing_df = testing_df.sample(frac = 1, random_state= 543)
testing_df = testing_df.reset_index(drop = True)

testing_df

Unnamed: 0,liked_by,text,type,extravert,intuitive,thinking,judging,NT,SF,NF,ST,NJ,NP,SJ,SP,role
0,873952654392647681,Champ and Major have joined us in the White Ho...,ENFJ,1,1,0,1,0,0,1,0,1,0,0,0,1
1,32089349,We did this—together.I hope Mike Pence smiles ...,ENFJ,1,1,0,1,0,0,1,0,1,0,0,0,1
2,36685290,Little kids are obsessed with garbage trucks b...,INTJ,0,1,1,1,1,0,0,0,1,0,0,0,2
3,7885362,"Don’t worry, Mr. President. I’ll see you at yo...",INTJ,0,1,1,1,1,0,0,0,1,0,0,0,2
4,1065701994512424960,Happy Pancake day NOW fans! Best toppings go.....,ISFP,0,0,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,1142510423285145600,Took her phone and texted her gc “guess who I’...,ENFP,1,1,0,0,0,0,1,0,0,1,0,0,1
280,1039946712972976129,@VoyageoftheMind @VoyageoftheMind Yes definite...,ENTP,1,1,1,0,1,0,0,0,0,1,0,0,2
281,1155187406,NFL Black Lives Matter commercial showing play...,INFJ,0,1,0,1,0,0,1,0,1,0,0,0,1
282,89410598,@BOHE_BABE @hongokucho \nhttps://t.co/a2xd7R3h...,ENTP,1,1,1,0,1,0,0,0,0,1,0,0,2


## Final predictive accuracy score:

In [95]:
correct = 0
num = 0
for row in tqdm(range(0,len(testing_df.index)), "testing.."):
    num += 1
    pred_text = testing_df.at[row, 'text']
    pred_label = testing_df.at[row, 'role']
    pred = predict_person(pred_text, pred_label, 256)
   # print(pred)
    if pred == 1:
        correct+=1

print("ACC: ", correct/num)

testing..:   0%|          | 1/284 [00:00<01:57,  2.40it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
testing..: 100%|██████████| 284/284 [01:41<00:00,  2.80it/s]


ACC:  0.4119718309859155


## Saving model to disk

In [37]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = 'BERT_roles'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Saving model to BERT_roles


('BERT_roles/tokenizer_config.json',
 'BERT_roles/special_tokens_map.json',
 'BERT_roles/vocab.txt',
 'BERT_roles/added_tokens.json')


## Sandbox (the outputs from these code chunks are not up to date and were performed ad hoc)

In [123]:
raw_df = raw_df.drop_duplicates()

In [128]:
raw_df
raw_set = set(raw_df['liked_by'])
print(len(raw_set))

4186


In [125]:
train_df, test_df = train_test_split(raw_df, stratify = raw_df['type'], random_state= 1729, test_size= 0.12)

In [175]:
train_set = set(edf['liked_by'])
print(len(train_set))

3680


In [176]:
test_set = set(test_df['liked_by'])
print(len(test_set))

503


In [177]:
i = 0
for val in set(test_df['liked_by']):
    if val in train_set:
        i+=1
        print(val)
        print('dude wtf')
print(i)

0


In [104]:
df[df['liked_by']==856944637575090176]

Unnamed: 0,liked_by,text,type,extravert,intuitive,thinking,judging,NT,SF,NF,ST,NJ,NP,SJ,SP,tokenized_texts
1827,856944637575090176,doing this for Rue. I couldn’t type all I’m fe...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'doing', 'this', 'for', 'rue', '.', ..."
3396,856944637575090176,a medal of freedom tbh... your move Biden@sar...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'a', 'medal', 'of', 'freedom', 'tb',..."
4854,856944637575090176,rl who raises the dead for $$$. Out 8.24.2021!...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'r', '##l', 'who', 'raises', 'the', ..."
13563,856944637575090176,trying to tell someone I didn’t believe in me...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'trying', 'to', 'tell', 'someone', '..."
18242,856944637575090176,is FIVE DAYS AWAY!!! Since it’s almost the new...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'is', 'five', 'days', 'away', '!', '..."
19429,856944637575090176,ir wine with food\n\nA dark god with fine tast...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'ir', 'wine', 'with', 'food', 'a', '..."
32099,856944637575090176,"solid Norse number, I commissioned 9 character...",ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'solid', 'norse', 'number', ',', 'i'..."
32897,856944637575090176,\n\nwhat the hell did they expect me to eat th...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 'what', 'the', 'hell', 'did', 'they'..."
33639,856944637575090176,"s part out during revisions, what if the chapt...",ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', 's', 'part', 'out', 'during', 'revis..."
34696,856944637575090176,/t.co/ArcG41MLtG@sarahcatstreet b4$ !!Just got...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0,"['[CLS]', '/', 't', '.', 'co', '/', 'arc', '##..."


In [105]:
testing_df[testing_df['liked_by']==856944637575090176]

Unnamed: 0,liked_by,text,type,extravert,intuitive,thinking,judging,NT,SF,NF,ST,NJ,NP,SJ,SP
3,856944637575090176,me coming into 99+ tiktok notifs wondering wha...,ENTJ,1,1,1,1,1,0,0,0,1,0,0,0


In [385]:
predict_person(testing_df.at[45, 'text'], testing_df.at[45, 'thinking'], 256)

[0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
[1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
1


In [35]:
for row in range(300,340):
   print(predict_person(testing_df.at[row, 'text'], testing_df.at[row, 'intuitive'], 256))
   print(test_df.at[row, 'intuitive'])
   print(" ")


1
1
 
1
0
 
1
1
 
1
1
 
1
0
 
1
1
 
1
1
 
1
1
 
0
1
 
1
1
 
1
0
 
1
1
 
1
0
 
1
1
 
1
1
 
1
1
 
1
1
 
1
1
 
0
0
 
0
0
 
1
0
 
1
0
 
1
0
 
1
0
 
1
1
 
0
1
 
0
0
 
1
1
 
1
1
 
1
1
 
1
1
 
1
0
 
1
1
 
1
1
 
0
0
 
0
0
 
1
1
 
1
0
 
1
1
 
1
1
 


In [None]:
# Report the number of sentences.
print('Number of persons in test set: {:,}\n'.format(test_df.shape[0]))

print('Positive samples: %d of %d (%.2f%%)' % (test_df.judging.sum(), len(test_df.judging), (test_df.judging.sum() / len(test_df.judging) * 100.0)))

In [None]:
# from sklearn.metrics import matthews_corrcoef
#
# matthews_set = []
#
# # Evaluate each test batch using Matthew's correlation coefficient
# print('Calculating Matthews Corr. Coef. for each batch...')
#
# # For each input batch...
# for i in range(len(true_labels)):
#
#     # The predictions for this batch are a 2-column ndarray (one column for "0"
#     # and one column for "1"). Pick the label with the highest value and turn this
#     # in to a list of 0s and 1s.
#     pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
#
#     # Calculate and store the coef for this batch.
#     matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
#     matthews_set.append(matthews)
#

In [None]:
#matthews_set

In [None]:
# Combine the predictions for each batch into a single list of 0s and 1s.
# flat_predictions = [item for sublist in predictions for item in sublist]
# flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
#
# # Combine the correct labels for each batch into a single list.
# flat_true_labels = [item for sublist in true_labels for item in sublist]
#
# # Calculate the MCC
# mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
#
# print('MCC: %.3f' % mcc)



