# ARINC Fingerprinting BERT Single Class Classifier

//


Import related libraries:

In [1]:
'''Train with PyTorch.'''
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import torch.utils.data as data

# BERT Related Libraries
from transformers import BertTokenizer, BertForSequenceClassification

# Python
import pandas as pd
import numpy as np
import os
import time


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Declaring machine learning parameters:

In [2]:
# ML Parameters
# Learning rate handed to AdamW. Fine-tuning a pre-trained BERT needs a very
# small step size (commonly 2e-5 to 5e-5); the previous value of 1e-2 is
# large enough to wreck the pre-trained weights and prevent convergence.
lr = 2e-5
# Number of full train + test passes.
epoch = 10
# Samples per batch for both data loaders.
# NOTE(review): 256 sequences of up to 512 tokens is likely too much for a
# single GPU with bert-base -- confirm against the target hardware.
batch_size = 256


Data Source:

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the train and test subsets of the 20 Newsgroups corpus via scikit-learn
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)

# Report the class names and how many distinct classes there are;
# num_labels is reused later to size the classifier
num_labels = len(newsgroups_train.target_names)
print("Available Classes: ", list(newsgroups_train.target_names))
print("Number of Unique Class: ", num_labels)
print("=======")

# Report the number of training documents and labels (one shape per line)
print(
    newsgroups_train.filenames.shape,
    newsgroups_train.target.shape,
    sep="\n",
)
print("=======")

# Show the first training sample: file name, raw text, then its label
print(
    newsgroups_train.filenames[0],
    newsgroups_train.data[0],
    newsgroups_train.target[0],
    sep="\n",
)


Available Classes:  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Number of Unique Class:  20
(11314,)
(11314,)
/Users/chriz/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This 

We don't define our own model; we simply use the pre-trained BERT model from the Hugging Face Transformers library:

In [4]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the pre-trained 'bert-base-uncased' checkpoint as a sequence classifier
# with num_labels output classes, then move it to the chosen device
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)
model.to(device)

# Tokenizer belonging to the same checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# AdamW over all model parameters, using the learning rate declared above
all_params = model.parameters()
optimizer = optim.AdamW(all_params, lr=lr)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Dataset Accessor:

In [5]:

class SentenceDataset(data.Dataset):
    """Map-style dataset exposing ``(text, label)`` pairs of a newsgroup bunch.

    The wrapped object must provide ``filenames`` (an array whose first
    dimension is the sample count), ``data`` (indexable raw texts) and
    ``target`` (indexable labels).
    """

    def __init__(self, newsgroup):
        """Keep a reference to the newsgroup object; nothing is copied."""
        self.newsgroup = newsgroup

    def __len__(self):
        """Return the sample count, read from the ``filenames`` array shape."""
        return self.newsgroup.filenames.shape[0]

    def __getitem__(self, idx):
        """Return the raw text and the label stored at position ``idx``."""
        source = self.newsgroup
        # The label is handed back as stored (a class index), not one-hot encoded
        return source.data[idx], source.target[idx]


Preparation of the training and validation (test) sets:

In [6]:
# Same device selection as in the model cell (kept as a torch.device for consistency)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Wrap the train and test subsets in dataset accessors
train_dataset = SentenceDataset(newsgroups_train)
test_dataset = SentenceDataset(newsgroups_test)
print(len(train_dataset))
print(len(test_dataset))

# Load into iterators (each step yields one batch).
# Training: reshuffle every epoch; the incomplete final batch is dropped.
train_loader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)
# Evaluation: keep every sample (no drop_last) so the reported accuracy covers
# the whole test set, and keep a fixed order since shuffling adds nothing here.
test_loader = data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)


11314
7532


Training and Testing Functions:

In [7]:
###########################
# Train with training set #
###########################
def train(model, iterator, optimizer, device):
    """Run one training pass over ``iterator``, updating ``model`` in place.

    Each batch of raw sentences is tokenized with the module-level
    ``tokenizer``, fed to the model together with its labels, and the loss
    returned by the model is back-propagated before one optimizer step.

    Args:
        model: Called as ``model(input_ids, attention_mask=..., labels=...)``;
            element ``[0]`` of its output is used as the loss and element
            ``[1]`` as the logits.
        iterator: Iterable yielding ``(sentences, labels)`` batches, where
            ``labels`` is a tensor of class indices.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(train_loss, acc)``: the sum of the per-batch losses and the
        accuracy in percent (``0.0`` when the iterator yields no batch).
    """
    model.train()     # Enter Train Mode
    train_loss = 0
    correct = 0
    total = 0

    for batch_idx, (sentences, labels) in enumerate(iterator):

        # tokenize the sentences (padded to the longest one, truncated if too long)
        encoding = tokenizer(list(sentences), return_tensors='pt', padding=True, truncation=True)

        # move every tensor of the batch to the target device; the attention
        # mask must live on the same device as the model and the input ids
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        labels = labels.to(device)

        # generate prediction
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # compute gradients and update weights
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        # record training losses as a plain float
        train_loss += loss.item()

        # record processed data count
        total += labels.size(0)

        # use Softmax to convert to probability
        logits = outputs[1]
        prob = torch.softmax(logits, dim=1)

        # take the index of the highest prob as prediction output
        prediction = prob.max(1)[1]
        correct += prediction.eq(labels).sum().item()

    # print completed result (guard against an iterator that produced no batch)
    acc = 100. * correct / total if total else 0.0
    print('correct: %i  total: %i' % (correct, total))
    print('train_loss: %s  train_acc: %f' % (train_loss, acc))
    return train_loss, acc


#############################
# Validate with testing set #
#############################
def test(model, iterator, optimizer, device):
    """Evaluate ``model`` on ``iterator`` without updating any weights.

    Each batch of raw sentences is tokenized with the module-level
    ``tokenizer`` and passed through the model under ``torch.no_grad()``.

    Args:
        model: Called as ``model(input_ids, attention_mask=..., labels=...)``;
            element ``[0]`` of its output is used as the loss and element
            ``[1]`` as the logits.
        iterator: Iterable yielding ``(sentences, labels)`` batches, where
            ``labels`` is a tensor of class indices.
        optimizer: Unused; kept so the signature mirrors ``train``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(test_loss, acc)``: the sum of the per-batch losses as a plain
        float and the accuracy in percent (``0.0`` when the iterator yields
        no batch).
    """
    model.eval()     # Enter Evaluation Mode
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_idx, (sentences, labels) in enumerate(iterator):

            # tokenize the sentences (padded to the longest one, truncated if too long)
            encoding = tokenizer(list(sentences), return_tensors='pt', padding=True, truncation=True)

            # move every tensor of the batch to the target device; the attention
            # mask must live on the same device as the model and the input ids
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            labels = labels.to(device)

            # generate prediction
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # record test losses as a plain float, matching train()
            loss = outputs[0]
            test_loss += loss.item()

            # record processed data count
            total += labels.size(0)

            # use Softmax to convert to probability
            logits = outputs[1]
            prob = torch.softmax(logits, dim=1)

            # take the index of the highest prob as prediction output
            prediction = prob.max(1)[1]
            correct += prediction.eq(labels).sum().item()

    # print completed result (guard against an iterator that produced no batch)
    acc = 100. * correct / total if total else 0.0
    print('correct: %i  total: %i' % (correct, total))
    print('test_loss: %f  test_acc: %f' % (test_loss, acc))
    return test_loss, acc


Actual execution:

- Run `train()` and `test()` once per epoch, for `epoch` epochs in total


In [None]:
for epoch_idx in range(epoch):

    # banner carrying the zero-based index of the current epoch
    print("===== Epoch %i =====" % epoch_idx)

    # one optimisation pass over the training batches
    print("Training started ...")
    train(model, train_loader, optimizer, device)

    # one evaluation pass over the test batches
    print("Testing started ...")
    test(model, test_loader, optimizer, device)



===== Epoch 0 =====
Training started ...
