In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 8.2MB/s eta 0:00:01
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 23.4MB/s eta 0:00:01
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 34.3MB/s eta 0:00:01
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.41-cp36-none-any.whl si

We'll be using the Transformers package by HuggingFace, which provides the model architectures and pre-trained weights for the latest Transformer models, including XLNet, T5, and many others.

Let's import the relevant libraries and load the data into a pandas DataFrame.

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, random_split, DataLoader
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, tqdm_notebook

# Use the GPU when one is actually usable, otherwise fall back to the CPU.
# `is_available` must be *called*: the bare function object is always truthy,
# which would select "cuda" even on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the news dataset (JSON Lines: one record per line) into a dataframe,
# report its dimensions and preview the first rows.
news_json_path = '../input/news-category-dataset/News_Category_Dataset_v2.json'
df = pd.read_json(news_json_path, lines=True)
print(df.shape)
df.head()

(200853, 6)


Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [None]:
# Drop rows without a description and merge the headline with the description.
# `.copy()` makes the filtered frame independent of the original, so adding the
# 'text' column below cannot trigger pandas' SettingWithCopyWarning.
df = df.loc[df['short_description'] != ""].copy()
# Vectorised string concatenation: same result as a row-wise apply, without a
# Python-level call per row.
df['text'] = df['headline'] + ". " + df['short_description']
print("Average text length: {}".format(df['text'].str.len().mean()))

# Helper function to tokenize our textual data
def convert_lines(input_sentences, tokenizer, max_seq_length=200, pad_id=0):
    """Tokenize texts into fixed-length rows of XLNet input ids.

    Each row is laid out the way XLNet expects a single sequence:
    left padding, then the (possibly truncated) tokens, then the tokenizer's
    separator token and finally its classification token. Keeping the
    classification token in the last position matters because XLNet's
    sequence-classification head summarises the sequence from the final position.

    Args:
        input_sentences: iterable of strings to tokenize.
        tokenizer: object exposing `tokenize`, `convert_tokens_to_ids`,
            `sep_token` and `cls_token` (e.g. an `XLNetTokenizer`).
        max_seq_length: length of every returned row, special tokens included.
        pad_id: id used for padding. Defaults to 0 because the attention masks
            in this notebook are derived by comparing the ids with 0.
            NOTE(review): id 0 may also be a real vocabulary id (e.g. the
            unknown token); such positions would be masked as well - confirm.

    Returns:
        A numpy array with one row of `max_seq_length` ids per input sentence.

    Raises:
        ValueError: if `max_seq_length` cannot hold the two special tokens.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to hold the special tokens")
    max_tokens = max_seq_length - 2  # reserve room for the separator and classification tokens
    special_tokens = [tokenizer.sep_token, tokenizer.cls_token]
    all_tokens = []
    for text in tqdm(input_sentences):
        # Tokenize the sentence and cut it if it is longer than the maximum length
        tokens = tokenizer.tokenize(text)[:max_tokens]
        token_ids = tokenizer.convert_tokens_to_ids(tokens + special_tokens)
        # Pad on the left so the classification token stays in the last position
        padding = [pad_id] * (max_seq_length - len(token_ids))
        all_tokens.append(padding + token_ids)
    return np.array(all_tokens)

# Instantiate the tokenizer and turn every article into a fixed-length row of token ids
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
x = convert_lines(df['text'], tokenizer, 200)

# Distinct categories (in order of first appearance) and how many there are
categories = df['category'].unique().tolist()
num_labels = len(categories)
print(categories)
print("Number of categories: {}".format(num_labels))

# Lookup tables in both directions: category name -> integer id and back
cat2id = {cat: i for i, cat in enumerate(categories)}
id2cat = {i: cat for cat, i in cat2id.items()}
print(cat2id)
# Vectorised lookup of each article's category id (replaces a row-wise apply)
df['cat_id'] = df['category'].map(cat2id)
labels = df['cat_id'].tolist()

In [None]:
# Setting train-validation-test split size and batch size
test_portion = 0.2
val_portion = 0.2
batch_size = 12
split_seed = 42

# Wrap token ids and labels in one dataset and derive the size of each split
dataset = TensorDataset(torch.from_numpy(x).long(), torch.tensor(labels, dtype=torch.long))
test_data_len = int(test_portion*len(dataset))
val_data_len = int(val_portion*len(dataset))
train_data_len = len(dataset) - test_data_len - val_data_len

# Seed the RNG so the split is identical on every run. Without it, re-running the
# notebook and loading previously saved weights could evaluate on examples the
# model was trained on.
torch.manual_seed(split_seed)
train_data, val_data, test_data = random_split(dataset, [train_data_len, val_data_len, test_data_len])

# Only the training data needs shuffling. Evaluation loaders keep a fixed order so
# the metrics, which are averaged per batch, come out the same on every pass.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Setting training Hyperparameters
EPOCHS = 3
accumulation_steps = 2
lr = 1e-5

# Instantiating our model with the last layer as a classification layer and output dimension = number of labels
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels)
model.zero_grad()
model.to(device)

# Two parameter groups: biases and LayerNorm parameters are exempt from weight decay,
# everything else is decayed with 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Number of optimizer updates over the whole run (one update every
# `accumulation_steps` batches); the first 5% are used for learning-rate warm-up.
num_train_optimization_steps = int(EPOCHS*len(train_data)/batch_size/accumulation_steps)
num_warmup_steps = int(0.05 * num_train_optimization_steps)

# Hand the *grouped* parameters to the optimizer; passing model.parameters() here
# would silently ignore the weight-decay groups built above.
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_optimization_steps)
model.train()
# Display the classification layer as the cell output
model.logits_proj

In [None]:
output_model_file = "../working/xlnet.pt"
# NOTE: despite its name, this holds the *highest* validation accuracy seen so far
lowest_val_acc = 0

# Training Loop
tq = tqdm_notebook(range(EPOCHS)) # Main loop progress bar
for epoch in tq:
    model.train()
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None  # exponentially smoothed loss shown in the progress bar
    # Progress bar for iterating through training data
    progress_train = tqdm_notebook(enumerate(train_loader), total=len(train_loader), leave=False)
    optimizer.zero_grad()
    for i, (x_batch, y_batch) in progress_train:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        # 1.0 for real tokens, 0.0 for padding (padding positions hold id 0)
        attention_mask = (x_batch != 0).to(torch.float)
        y_pred = model(x_batch, attention_mask=attention_mask, labels=None)
        logits = y_pred[0]
        # Calculate loss function
        loss = F.cross_entropy(logits, y_batch)
        # Scale the loss so the accumulated gradient is the mean over the
        # effective batch instead of the sum over `accumulation_steps` batches
        (loss / accumulation_steps).backward()
        if (i+1) % accumulation_steps == 0:
            # Update model weights
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        batch_loss = loss.item()
        # Compare against None explicitly: a smoothed loss of exactly 0.0 is a valid value
        if lossf is None:
            lossf = batch_loss
        else:
            lossf = 0.98*lossf + 0.02*batch_loss
        progress_train.set_postfix(loss=lossf)
        avg_loss += batch_loss / len(train_loader)
        # argmax over the logits equals argmax over the softmax probabilities
        avg_accuracy += (logits.argmax(dim=1) == y_batch).to(torch.float).mean().item() / len(train_loader)
    tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
    print("Training Accuracy: {}%".format(avg_accuracy*100))

    # Validation
    model.eval() # Switch the model to evaluation mode
    val_loss = 0.
    val_acc = 0.
    # Progress bar for iterating through validation data
    progress_val = tqdm_notebook(enumerate(val_loader), total=len(val_loader), leave=False)
    # No gradients are needed for evaluation; skipping them saves memory and time
    with torch.no_grad():
        for i, (val_x, val_y) in progress_val:
            val_x = val_x.to(device)
            val_y = val_y.to(device)
            val_pred = model(val_x, attention_mask=(val_x != 0).to(torch.float), labels=None)
            val_logits = val_pred[0]
            val_loss += F.cross_entropy(val_logits, val_y).item() / len(val_loader)
            val_acc += (val_logits.argmax(dim=1) == val_y).to(torch.float).mean().item() / len(val_loader)
    print("Validation Loss: {}".format(val_loss))
    print("Validation Accuracy: {}%".format(val_acc*100))
    if val_acc > lowest_val_acc:
        # Save model if validation accuracy is higher for current loop
        torch.save(model.state_dict(), output_model_file)
        lowest_val_acc = val_acc
        print("Validation performance improved... Saving model")

In [None]:
# Load the best model (the checkpoint with the highest validation accuracy)
output_model_file = "../working/xlnet.pt"
model.load_state_dict(torch.load(output_model_file, map_location=device))
model.eval()
progress_test = tqdm_notebook(test_loader)
test_loss = 0.
test_acc = 0.
# Evaluation needs no gradients; skipping them saves memory and time
with torch.no_grad():
    for i, (x_batch, y_batch) in enumerate(progress_test):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        # 1.0 for real tokens, 0.0 for padding (padding positions hold id 0)
        attention_mask = (x_batch != 0).to(torch.float)
        y_pred = model(x_batch, attention_mask=attention_mask, labels=None)
        logits = y_pred[0]
        test_loss += F.cross_entropy(logits, y_batch).item() / len(test_loader)
        # argmax over the logits equals argmax over the softmax probabilities
        test_acc += (logits.argmax(dim=1) == y_batch).to(torch.float).mean().item() / len(test_loader)
print("Test Loss: {}".format(test_loss))
print("Test accuracy: {}%".format(test_acc*100))