In [1]:
!pip install transformers



In [2]:
import pandas as pd
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [None]:
def get_data():
    ! wget wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
    ! mkdir data
    ! unzip -q NewsAggregatorDataset.zip -d ./data
get_data()

--2020-09-11 03:53:22--  http://wget/
Resolving wget (wget)... failed: Name or service not known.
wget: unable to resolve host address ‘wget’
--2020-09-11 03:53:22--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/x-httpd-php]
Saving to: ‘NewsAggregatorDataset.zip.1’


2020-09-11 03:53:23 (58.4 MB/s) - ‘NewsAggregatorDataset.zip.1’ saved [29224203/29224203]

FINISHED --2020-09-11 03:53:23--
Total wall clock time: 0.7s
Downloaded: 1 files, 28M in 0.5s (58.4 MB/s)
mkdir: cannot create directory ‘data’: File exists
replace ./data/2pageSessions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [None]:
df = pd.read_csv('./data/newsCorpora.csv', sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df = df[['TITLE', 'CATEGORY']]
df.head()

In [None]:
labels = dict(enumerate(set(df.CATEGORY.unique())))
labels = {v:k for k, v in labels.items()}
labels

In [None]:
def encode_labels(value):
    return labels[value]

In [None]:
df['LABELS'] = df.CATEGORY.apply(encode_labels)
df.head()

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-5
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
print(f"Tokenized Text: {tokenizer.tokenize(df.loc[0, 'TITLE'])}")
#print(f"Tokenized Text w/ special tokens: {tokenizer.add_special_tokens(df.loc[0, 'TITLE'])}")
print(f"Encoded Text: {tokenizer.encode(df.loc[0, 'TITLE'])}")
print(f"Encoded Text Plus: {tokenizer.encode_plus(df.loc[0,'TITLE'])}")  # encode_plus returns a dictionary with 2 other arrays

Tokenized Text: ['fed', 'official', 'says', 'weak', 'data', 'caused', 'by', 'weather', ',', 'should', 'not', 'slow', 'tape', '##r']
Encoded Text: [101, 7349, 2880, 2758, 5410, 2951, 3303, 2011, 4633, 1010, 2323, 2025, 4030, 6823, 2099, 102]
Encoded Text Plus: {'input_ids': [101, 7349, 2880, 2758, 5410, 2951, 3303, 2011, 4633, 1010, 2323, 2025, 4030, 6823, 2099, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
test  = tokenizer.encode_plus(df.loc[0, 'TITLE'])
print(f"Decoded Text: {tokenizer.decode(test['input_ids'])}")

Decoded Text: [CLS] fed official says weak data caused by weather, should not slow taper [SEP]


In [64]:
class Triage(Dataset):
    def __init__(self, data, tokenizer, max_len):
        super().__init__()
        self.X = data.TITLE
        self.y = data.LABELS
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        inputs = self.tokenizer.tokenize(self.X.loc[idx])
        inputs = self.tokenizer.encode_plus(inputs,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            pad_to_max_length=True,
                                            truncation=True)
        
        return torch.LongTensor(inputs['input_ids']), torch.LongTensor(inputs['attention_mask']), self.y.loc[idx]

In [65]:
# testing the dataset
test = df.head()
train_ds = Triage(test, tokenizer, MAX_LEN)
train_ds[0][0].size()



torch.Size([512])

In [66]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset = df.sample(frac=train_size,random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

train_ds = Triage(train_dataset, tokenizer, MAX_LEN)
test_ds = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (422419, 3)
TRAIN Dataset: (337935, 3)
TEST Dataset: (84484, 3)


In [67]:
# setting up the dataloader
train_dl = DataLoader(train_ds, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
valid_dl = DataLoader(test_ds, batch_size=VALID_BATCH_SIZE)

In [68]:
x, mask, y = next(iter(train_dl))



In [72]:
class BERT(nn.Module):
    def __init__(self, n_class, dropout):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.classifier = torch.nn.Linear(768, n_class)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        # seq = [batch_size, seq_len, h_dim] pool = [batch_size, h_dim]
        seq, pool = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = F.relu(self.pre_classifier(pool))
        output = self.dropout(output)
        return self.classifier(output)

In [78]:
# TESTING the customized model
model = BERT(df.LABELS.nunique(), .3)
output = model(input_ids=x, attention_mask=mask)
output.size()

torch.Size([4, 4])

In [None]:
model.to(device)