In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 6.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 17.3MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 35.4MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K 

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [3]:
# setting up the device for GPU usage
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
def get_data():
  ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
  ! mkdir data
  ! unzip -q NewsAggregatorDataset.zip -d ./data
get_data()

--2020-09-09 02:41:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/x-httpd-php]
Saving to: ‘NewsAggregatorDataset.zip’


2020-09-09 02:41:55 (44.2 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]



In [6]:
# read in the downloaded file
df = pd.read_csv('./data/newsCorpora.csv', sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df = df[['TITLE', 'CATEGORY']]
df.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [8]:
# encoding the labels to integers
labels = dict(enumerate(set(df.CATEGORY.unique())))
labels_enc = {v:k for k, v in labels.items()}

def encode_labels(value):
  return labels_enc[value]

df['ENCODED_CATEGORY'] = df.CATEGORY.apply(encode_labels)
df.head()

Unnamed: 0,TITLE,CATEGORY,ENCODED_CATEGORY
0,"Fed official says weak data caused by weather,...",b,0
1,Fed's Charles Plosser sees high bar for change...,b,0
2,US open: Stocks fall after Fed official hints ...,b,0
3,"Fed risks falling 'behind the curve', Charles ...",b,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,0


In [10]:
labels_enc, labels

({'b': 0, 'e': 2, 'm': 1, 't': 3}, {0: 'b', 1: 'm', 2: 'e', 3: 't'})

In [27]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-5
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [12]:
class Triage(Dataset):
  def __init__(self, data, tokenizer, max_len):
    super().__init__()
    self.len = data.shape[0]
    self.X = data['TITLE']
    self.y = data['ENCODED_CATEGORY']
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return self.len

  def __getitem__(self, idx):
    title = str(self.X[idx])
    title = ' '.join(title.split())
    inputs = self.tokenizer.encode_plus(title,
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        pad_to_max_length=True,
                                        return_token_type_ids=True,
                                        truncation=True)
    numericalized_tokens, mask = inputs['input_ids'], inputs['attention_mask']
    return torch.LongTensor(numericalized_tokens), torch.LongTensor(mask), self.y[idx]

In [None]:
inputs = tokenizer.encode_plus(df.TITLE.loc[0])
len(inputs['input_ids']), len(inputs['attention_mask'])

(16, 16)

In [13]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_ds = df.sample(frac=train_size, random_state=42)
valid_ds = df.drop(train_ds.index).reset_index(drop=True)
train_ds = train_ds.reset_index(drop=True)

print(f"FULL Dataset: {df.shape}")
print(f"Train Dataset: {train_ds.shape}")
print(f"Test Dataset: {valid_ds.shape}")

FULL Dataset: (422419, 3)
Train Dataset: (337935, 3)
Test Dataset: (84484, 3)


In [14]:
# Dataset
train_ds = Triage(train_ds, tokenizer, MAX_LEN)
valid_ds = Triage(valid_ds, tokenizer, MAX_LEN)

# DataLoader
train_dl = DataLoader(train_ds, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
valid_dl = DataLoader(valid_ds, batch_size=VALID_BATCH_SIZE)

In [15]:
x, mask, y = next(iter(train_dl))



In [19]:
x.size(), mask.size(), y.size()

(torch.Size([32, 512]), torch.Size([32, 512]), torch.Size([32]))

In [20]:
x[0]

tensor([  101,  3730,  3414,  1116,  1979,  1209,  1294,  1117, 24836,  1963,
         1120, 15451,  2352,  2263,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [21]:
# Model Creation

class DistillBERT(nn.Module):
  def __init__(self):
    super().__init__()
    self.distil = DistilBertModel.from_pretrained("distilbert-base-uncased")
    self.lin1 = nn.Linear(768, 768)
    self.lin2 = nn.Linear(768, 4)
    self.dropout = nn.Dropout(0.5)

# figure out what is going on first before coding up the rest of the model
  def forward(self, input_ids, attention_mask):
    # outputs = [batch_size, seq_len, h_dim] is inside a tuple as the only element
    outputs = self.distil(input_ids=input_ids, attention_mask=attention_mask)
    # out = [batch_size, h_dim] (only takes the first token of the entire seqeunce (CLS token) )
    out = F.relu(self.lin1(outputs[0][:, 0]))
    out = self.dropout(out)
    # returns an output = [batch_size, n_classes]
    return self.lin2(out)

In [22]:
model = DistillBERT()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




DistillBERT(
  (distil): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [23]:
# creating the loss function and optimizer
#loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [24]:
def calculate_accu(big_idx, targets):
  n_correct = (big_idx == targets).sum().item()
  return n_correct

In [25]:
def train_model(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for idx, data in enumerate(train_dl, 0):
        ids = data[0].to(device)
        mask = data[1].to(device)
        targets = data[2].to(device)

        y_pred = model(ids, mask)
        loss = F.cross_entropy(y_pred, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(y_pred.data, dim=1)
        n_correct += calculate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
    
        if idx % 5000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

In [28]:
for epoch in range(EPOCHS):
  train_model(epoch)



Training Loss per 5000 steps: 0.38718345761299133
Training Accuracy per 5000 steps: 93.75


KeyboardInterrupt: ignored

In [30]:
for epoch in range(EPOCHS):
  train_model(epoch)



Training Loss per 5000 steps: 0.2974661886692047
Training Accuracy per 5000 steps: 84.375


KeyboardInterrupt: ignored

In [None]:
def valid(model, valid_dl):
  model.eval()
  n_correct, n_wrong, total = 0, 0, 0

  for idx, data in enumerate(valid_dl):
    ids = data[0].to(device)
    mask = data[1].to(device)
    targets = data[2].to(device)

    y_pred = model(ids, mask)
    loss = F.cross_entropy(y_pred, targets)
    tr_loss += loss.item()
    big_val, big_idx = torch.max(y_preds.data, dim=1)
    n_correct += calculate_accu(big_idx, targets)

    nb_tr_steps += 1
    nb_tr_examples += targets.size(0)

    if idx % 5000 == 0:
      loss_step = tr_loss/nb_tr_steps
      accu_step = (n_correct*100)/nb_tr_examples
      print(f"Validation Loss per 100 steps: {loss_step}")
      print(f"Validation Accuracy per 100 steps: {accu_step}")

  epoch_loss = tr_loss/nb_tr_steps
  epoch_accu = (n_correct*100)/nb_tr_examples
  print(f"Validation Loss Epoch: {epoch_loss}")
  print(f"Validation Accuracy Epoch: {epoch_accu}")

  return epoch_accu



In [None]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

In [31]:
torch.no_grad?