In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
import re
import torch
from pytorch_transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from pytorch_transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.metrics import accuracy_score
import tokenizers

import warnings
warnings.filterwarnings('ignore')

In [3]:
df_train = pd.read_csv("df_train.csv")
df_valid = pd.read_csv("df_valid.csv")
df_test = pd.read_csv("df_test.csv")

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [5]:
from transformers import XLNetTokenizer, XLNetModel

model = XLNetModel.from_pretrained('xlnet-base-cased')

In [6]:
model

XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0-11): 12 x XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (activation_function): GELUActivation()
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
)

In [7]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

tokenized_texts_train = [tokenizer.tokenize(sent) for sent in df_train['clean_text']]
tokenized_texts_valid = [tokenizer.tokenize(sent) for sent in df_valid['clean_text']]
tokenized_texts_test = [tokenizer.tokenize(sent) for sent in df_test['clean_text']]

print ("Tokenize the first sentence:")
print (tokenized_texts_train[0])

Tokenize the first sentence:
['▁can', '▁you', '▁', 'ident', 'if', 'i', '▁the', '▁drink', '▁when', '▁out', '▁of', '▁it', '▁', 'bot', 't', 'l']


In [8]:
MAX_LEN = 150
input_ids_train = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts_train]
input_ids_train = pad_sequences(input_ids_train, maxlen=MAX_LEN, dtype="long")

input_ids_valid = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts_valid]
input_ids_valid = pad_sequences(input_ids_valid, maxlen=MAX_LEN, dtype="long")

input_ids_test = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts_test]
input_ids_test = pad_sequences(input_ids_test, maxlen=MAX_LEN, dtype="long")

In [9]:
# Create attention masks
attention_masks_train = []
attention_masks_valid = []
attention_masks_test = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_train:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_train.append(seq_mask)

for seq in input_ids_valid:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_valid.append(seq_mask)

for seq in input_ids_test:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_test.append(seq_mask)

In [10]:
labels_train = df_train['clickbait']
labels_valid = df_valid['clickbait']
labels_test = df_test['clickbait']

In [11]:
import torch
import numpy as np

# Assuming train_labels is a Pandas Series
train_labels_array = np.array(labels_train)
train_labels = torch.tensor(train_labels_array)
train_inputs = torch.tensor(input_ids_train)
train_masks = torch.tensor(attention_masks_train)

# Similarly, convert other Pandas Series to PyTorch tensors
validation_inputs = torch.tensor(input_ids_valid)
validation_labels_array = np.array(labels_valid)
validation_labels = torch.tensor(validation_labels_array)
validation_masks = torch.tensor(attention_masks_valid)

test_inputs = torch.tensor(input_ids_test)
test_labels_array = np.array(labels_test)
test_labels = torch.tensor(test_labels_array)
test_masks = torch.tensor(attention_masks_test)


In [12]:
# Creates torch dataloaders for training and validation sets
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [13]:
from transformers import XLNetForSequenceClassification, XLNetTokenizer

In [14]:
# Load XLNEtForSequenceClassification, the pretrained XLNet model with a single linear classification layer on top.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
model.cuda()

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [15]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [16]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=1e-5)

In [22]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Store our loss and accuracy for plotting
train_loss_set = []
val_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for epoch_num in trange(epochs, desc="Epoch"):
 # Training
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  tr_accuracy = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
      # Add batch to GPU
      batch = tuple(t.to(device) for t in batch)
      # Unpack the inputs from our dataloader
      b_input_ids, b_input_mask, b_labels = batch
      # Clear out the gradients (by default they accumulate)
      optimizer.zero_grad()
      # Forward pass
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss = outputs[0]
      logits = outputs[1]
      train_loss_set.append(loss.item())

      # Calculate training accuracy
      preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
      labels = b_labels.to('cpu').numpy()
      tr_accuracy += accuracy_score(labels, preds)

      # Backward pass
      loss.backward()
      # Update parameters and take a step using the computed gradient
      optimizer.step()

      # Update tracking variables
      tr_loss += loss.item()
      nb_tr_examples += b_input_ids.size(0)
      nb_tr_steps += 1

    # Calculate and print average training accuracy
  average_tr_accuracy = tr_accuracy / nb_tr_steps


  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  val_loss_set = []
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = output[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

    # Log validation loss
    val_loss_set.append(loss.item())

  average_val_loss = sum(val_loss_set) / len(val_loss_set)


  print(
    f'Epochs: {epoch_num + 1} | Train Loss: {tr_loss/nb_tr_steps: .3f} '
    f'| Train Accuracy: {average_tr_accuracy: .3f} '
    f'| Val Loss: {average_val_loss: .3f} '
    f'| Val Accuracy: {eval_accuracy/nb_eval_steps: .3f} ')

Epoch:  25%|██▌       | 1/4 [14:45<44:16, 885.54s/it]

Epochs: 1 | Train Loss:  0.054 | Train Accuracy:  0.981 | Val Loss:  0.203 | Val Accuracy:  0.967 


Epoch:  50%|█████     | 2/4 [29:29<29:29, 884.70s/it]

Epochs: 2 | Train Loss:  0.039 | Train Accuracy:  0.986 | Val Loss:  0.174 | Val Accuracy:  0.966 


Epoch:  75%|███████▌  | 3/4 [44:13<14:44, 884.15s/it]

Epochs: 3 | Train Loss:  0.029 | Train Accuracy:  0.990 | Val Loss:  0.028 | Val Accuracy:  0.969 


Epoch: 100%|██████████| 4/4 [58:56<00:00, 884.23s/it]

Epochs: 4 | Train Loss:  0.018 | Train Accuracy:  0.994 | Val Loss:  0.009 | Val Accuracy:  0.966 





In [23]:
#Testing terhadap Dataset Testing
# Testing

# Put model in evaluation mode to evaluate on the test set
model.eval()

# Tracking variables
test_accuracy = 0
nb_test_steps = 0

# Evaluate on the test set
for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = output[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)

    test_accuracy += tmp_test_accuracy
    nb_test_steps += 1

# Calculate and print average testing accuracy
average_test_accuracy = test_accuracy / nb_test_steps
print(f'Test Accuracy: {average_test_accuracy:.3f}')

Test Accuracy: 0.974
