In [1]:

%pprint

Pretty printing has been turned OFF



---
# Load needed libraries and functions

In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

In [3]:

%matplotlib inline
from datetime import datetime
from neo4j.exceptions import ServiceUnavailable
import humanize
import matplotlib.pyplot as plt
import time
import winsound

bin_count = 12
duration = 1000  # milliseconds
freq = 880  # Hz
height_inches = 3.0
width_inches = 18.0

In [4]:

t0 = time.time()
try:
    
    # Get the Neo4j driver
    from storage import Storage
    s = Storage()

    from ha_utils import HeaderAnalysis
    ha = HeaderAnalysis(s=s, verbose=False)

    from scrape_utils import WebScrapingUtilities
    wsu = WebScrapingUtilities(s=s)
    uri = wsu.secrets_json['neo4j']['connect_url']
    user =  wsu.secrets_json['neo4j']['username']
    password = wsu.secrets_json['neo4j']['password']

    from cypher_utils import CypherUtilities
    cu = CypherUtilities(uri=uri, user=user, password=password, driver=None, s=s, ha=ha)
    
    version_str = cu.driver.get_server_info().agent
    print(f'======== {version_str} ========')
    
    import warnings
    warnings.filterwarnings('ignore')
except ServiceUnavailable as e:
    print('You need to start Neo4j as a console')
    raise
except Exception as e:
    print(f'{e.__class__}: {str(e).strip()}')
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
# winsound.Beep(freq, duration)
print(f'Utility libraries created in {duration_str}')
print(f'Last run on {datetime.now()}')

Utility libraries created in 2 seconds
Last run on 2023-02-14 18:06:22.474526


In [5]:

# Get all part-of-speech-labeled child string data
t0 = time.time()
def do_cypher_tx(tx, verbose=False):
    cypher_str = """
        MATCH (pos:PartsOfSpeech)-[:SUMMARIZES]->(np:NavigableParents)
        RETURN
            np.navigable_parent AS navigable_parent,
            pos.pos_id AS pos_id;"""
    if verbose:
        clear_output(wait=True)
        print(cypher_str)
    results_list = tx.run(query=cypher_str, parameters=None)
    values_list = []
    for record in results_list:
        values_list.append(dict(record.items()))

    return values_list
with cu.driver.session() as session:
    row_objs_list = session.read_transaction(do_cypher_tx, verbose=False)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
# winsound.Beep(freq, duration)
print(f'Labeled child strings found in {duration_str}')

Labeled child strings found in 0 seconds


In [6]:

# load packages
t0 = time.time()
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, re, random, string
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
from transformers import get_linear_schedule_with_warmup
from itertools import repeat
import optuna
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup, AdamW

SEED = 15
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
duration_str = humanize.precisedelta(time.time() - t0, minimum_unit='seconds', format='%0.0f')
# winsound.Beep(freq, duration)
print(f'Packages loaded in {duration_str}')

Packages loaded in 2 seconds


In [7]:

!nvidia-smi

Tue Feb 14 18:06:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 528.33       Driver Version: 528.33       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   34C    P0    14W /  60W |      0MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:

# Tell pytorch to use cuda
torch.backends.cudnn.deterministic = True
torch.cuda.amp.autocast(enabled=True)
device = torch.device('cuda')

In [9]:

df = pd.DataFrame(row_objs_list)
df.pos_id = df.pos_id.map(lambda x: int(x))
print(df.pos_id.unique().tolist())
df.navigable_parent = df.navigable_parent.map(lambda x: ' '.join(ha.html_regex_tokenizer(x)))
df.shape

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


(10635, 2)

In [10]:

# Get counts of each char -- necessary for vocab
counts = Counter(' '.join(df.navigable_parent.values.tolist()))

# Build corpus vocab
vocab = sorted(counts, key=counts.get, reverse=True)

# Provide the vocab indicies
vocab_to_int = {word: i for i, word in enumerate(counts, 1)}

# Add padding
vocab_to_int['PAD'] = 0

# Vocab size
vocab_size = len(vocab_to_int.keys())

In [11]:

max_char_length = 1014

# Encode text
def encode(text):
    encoded = np.zeros([vocab_size, max_char_length], dtype='float32')
    review = text.lower()[:max_char_length-1:-1]
    i = 0
    for letter in text:
        if i >= max_char_length:
            break
        if letter in vocab_to_int:
            encoded[vocab_to_int[letter]][i] = 1
        i += 1
    return encoded


encoded_text = []
for doc in df.navigable_parent.values:
    encoded_text.append(encode(doc))

encoded_text = np.asarray(encoded_text, dtype=np.float32)
encoded_text.shape

(10635, 74, 1014)

In [12]:

encoded_text = encoded_text.reshape(len(df), max_char_length, vocab_size)
encoded_text.shape

(10635, 1014, 74)


# 5 Data Set and Data Loaders


Next we proceed with the usual task of preparing our `TensorDataset`, data loaders, a time helper function, and a weighted random sampler to help account for class imbalance during training.

In [13]:

# Prepare tensor data sets
def prepare_dataset(padded_tokens, target):
    
    # Prepare target into np array
    target = np.array(target.values, dtype=np.int64).reshape(-1, 1)
    
    # Create tensor data sets
    tensor_df = TensorDataset(torch.from_numpy(padded_tokens), torch.from_numpy(target))
    
    # 80% of df
    train_size = int(0.8 * len(df))
    
    # 20% of df
    val_size = len(df) - train_size
    
    # 50% of validation
    test_size = int(val_size - 0.5*val_size)
    
    # Divide the dataset by randomly selecting samples
    train_dataset, val_dataset = random_split(tensor_df, [train_size, val_size])
    
    # Divide validation by randomly selecting samples
    val_dataset, test_dataset = random_split(val_dataset, [test_size, test_size+1])

    return train_dataset, val_dataset, test_dataset

In [14]:

# Create tensor data sets
train_dataset, val_dataset, test_dataset = prepare_dataset(encoded_text, df.pos_id)

In [15]:

# Helper function to count target distribution inside tensor data sets
def target_count(tensor_dataset):
    
    # Set empty count containers
    for i in df.pos_id.unique():
        exec(f'count{i} = 0')
    
    # Set total container to turn into torch tensor
    total = []
    
    # For every item in the tensor data set
    for i in tensor_dataset:
        
        # Add to the count of that container
        exec(f'count{i[1].item()} += 1')
    
    for i in df.pos_id.unique():
        total.append(eval(f'count{i}'))
    
    return torch.tensor(total)

In [16]:

# Prepare weighted sampling for imbalanced classification
def create_sampler(target_tensor, tensor_dataset):
    
    # Generate class distributions [x, y]
    class_sample_count = target_count(tensor_dataset)
    
    # Weight
    weight = 1. / class_sample_count.float()
    
    # Produce weights for each observation in the data set
    samples_weight = torch.tensor([weight[t[1]] for t in tensor_dataset])
    
    # Prepare sampler
    sampler = torch.utils.data.WeightedRandomSampler(weights=samples_weight, num_samples=len(samples_weight), replacement=True)
    
    return sampler

In [17]:

# Create samplers for each data set
train_sampler = create_sampler(target_count(train_dataset), train_dataset)
val_sampler = create_sampler(target_count(val_dataset), val_dataset)
test_sampler = create_sampler(target_count(test_dataset), test_dataset)

In [18]:

# Create DataLoaders with samplers
train_dataloader = DataLoader(train_dataset, batch_size=80, sampler=train_sampler, shuffle=False)
valid_dataloader = DataLoader(val_dataset, batch_size=80, sampler=val_sampler, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=80, sampler=test_sampler, shuffle=False)

In [19]:

# Time function
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    
    # Round to the nearest second
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


# 6 Character-level CNN


Then we create our Character-level CNN class and specify its configuration.

In [20]:

class CharCNN(nn.Module):

    def __init__(self, config, vocab_size=74):
        super().__init__()
        num_conv_filters = config.num_conv_filters
        output_channel = config.output_channel
        num_affine_neurons = config.num_affine_neurons
        target_class = config.target_class
        input_channel = vocab_size

        self.conv1 = nn.Conv1d(input_channel, num_conv_filters, kernel_size=7)
        self.conv2 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=7)
        self.conv3 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv4 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv5 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv6 = nn.Conv1d(num_conv_filters, output_channel, kernel_size=3)

        self.dropout = nn.Dropout(config.dropout)
        self.fc1 = nn.Linear(output_channel, num_affine_neurons)
        self.fc2 = nn.Linear(num_affine_neurons, num_affine_neurons)
        self.fc3 = nn.Linear(num_affine_neurons, target_class)

    def forward(self, x, **kwargs):
        x = x.transpose(1, 2).type(torch.cuda.FloatTensor)

        x = F.max_pool1d(F.relu(self.conv1(x)), 3)
        x = F.max_pool1d(F.relu(self.conv2(x)), 3)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))

        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        x = F.relu(self.fc1(x.view(x.size(0), -1)))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        return self.fc3(x)

In [21]:

# Instantiate model config -- set ex-post from optuna search
class config:
    def __init__(self):
        config.num_conv_filters = 256
        config.output_channel = 256
        config.num_affine_neurons = 1024
        config.target_class = 2
        config.dropout = 0.4
        
        return None


# 7 Training Functions


Next we create our usual training functions.

In [22]:

def train(model, dataloader, optimizer, criterion):

    # Capture time
    total_t0 = time.time()

    # Perform one full pass over the training set
    print('')
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    # Reset total loss for epoch
    train_total_loss = 0
    total_train_f1 = 0

    # Put model into training mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(dataloader):

        # Progress update every 40 batches
        if step % 40 == 0 and not step == 0:

            # Report progress
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader)))

        # Unpack this training batch from our dataloader:
        # As we unpack the batch, we'll also copy each tensor to the GPU
        # `batch` contains two pytorch tensors:
        #   [0]: input ids
        #   [1]: labels
        b_input_ids = batch[0].cuda()
        b_labels = batch[1].cuda().long()

        # Clear previously calculated gradients
        optimizer.zero_grad()

        with autocast():
            
            # Forward propagation (evaluate model on training batch)
            logits = model(b_input_ids)

        # Calculate cross entropy loss
        loss = criterion(logits.view(-1, 2), b_labels.view(-1))

        # Sum the training loss over all batches for average loss at end
        # loss is a tensor containing a single value
        train_total_loss += loss.item()

        # Scales losss: calls backward() on scaled loss to create scaled gradients
        # Backward passes under autocast are not recommended
        # Backward ops run in the same dtype autocast chose for corresponding forward ops
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped
        scaler.step(optimizer)

        # Updates the scale for next iteration
        scaler.update()

        # Update the learning rate
        scheduler.step()

        # Get preds
        _, predicted = torch.max(logits, 1)

        # Move logits and labels to CPU
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy()

        # Calculate f1
        total_train_f1 += f1_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

    # Calculate the average loss over all of the batches
    avg_train_loss = train_total_loss / len(dataloader)

    # Calculate the average f1 over all of the batches
    avg_train_f1 = total_train_f1 / len(dataloader)

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Train Loss': avg_train_loss,
            'Train F1': avg_train_f1
        }
    )

    # Training time end
    training_time = format_time(time.time() - total_t0)

    # Print result summaries
    print('')
    print('Summary Results')
    print('epoch | trn loss | trn f1 | trn time ')
    print(f'{epoch+1:5d} | {avg_train_loss:.5f} | {avg_train_f1:.5f} | {training_time:}')

    torch.cuda.empty_cache()

    return None

In [23]:

def validating(model, dataloader, criterion):

    # Capture validation time
    total_t0 = time.time()

    # After the completion of each training epoch, measure our performance on
    # our validation set
    print('')
    print('Running Validation...')

    # Put the model in evaluation mode
    model.eval()

    # Track variables
    total_valid_accuracy = 0
    total_valid_loss = 0
    total_valid_f1 = 0
    total_valid_recall = 0
    total_valid_precision = 0

    # Evaluate data for one epoch
    for batch in dataloader:

        # Unpack batch from dataloader
        b_input_ids = batch[0].cuda()
        b_labels = batch[1].cuda().long()

        # Tell pytorch not to bother calculating gradients
        # as it's only necessary for training
        with torch.no_grad():

            # Forward propagation (evaluate model on training batch)
            logits = model(b_input_ids)

            # Calculate BCEWithLogitsLoss
            loss = criterion(logits.view(-1, 2), b_labels.view(-1))

            # Calculate preds
            _, predicted = torch.max(logits, 1)

        # Accumulate validation loss
        total_valid_loss += loss.item()

        # Move logits and labels to CPU
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy()

        # Calculate f1
        total_valid_f1 += f1_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

        # Calculate accuracy
        total_valid_accuracy += accuracy_score(predicted, y_true)

        # Calculate precision
        total_valid_precision += precision_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

        # Calculate recall
        total_valid_recall += recall_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

    # Report final accuracy of validation run
    avg_accuracy = total_valid_accuracy / len(dataloader)

    # Report final f1 of validation run
    global avg_val_f1
    avg_val_f1 = total_valid_f1 / len(dataloader)

    # Report final precision of validation run
    avg_precision = total_valid_precision / len(dataloader)

    # Report final recall of validation run
    avg_recall = total_valid_recall / len(dataloader)

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_valid_loss / len(dataloader)

    # Record all statistics from this epoch.
    valid_stats.append(
        {
            'Val Loss': avg_val_loss,
            'Val Accur.': avg_accuracy,
            'Val precision': avg_precision,
            'Val recall': avg_recall,
            'Val F1': avg_val_f1
        }
    )

    # Capture end validation time
    training_time = format_time(time.time() - total_t0)

    # Print result summaries
    print('')
    print('summary results')
    print('epoch | val loss | val f1 | val time')
    print(f'{epoch+1:5d} | {avg_val_loss:.5f} | {avg_val_f1:.5f} | {training_time:}')

    return None

In [24]:

def testing(model, dataloader, criterion):

    print('')
    print('Running Testing...')
    
    # Put the model in evaluation mode
    model.eval()

    # Track variables
    total_test_accuracy = 0
    total_test_loss = 0
    total_test_f1 = 0
    total_test_recall = 0
    total_test_precision = 0

    # Evaluate data for one epoch
    for step, batch in enumerate(dataloader):
        
        # Progress update every 40 batches
        if step % 40 == 0 and not step == 0:

            # Report progress
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader)))

        # Unpack batch from dataloader
        b_input_ids = batch[0].cuda()
        b_labels = batch[1].cuda().long()

        # Tell pytorch not to bother calculating gradients
        # only necessary for training
        with torch.no_grad():

            # Forward propagation (evaluate model on training batch)
            logits = model(b_input_ids)

            # Calculate cross entropy loss
            loss = criterion(logits.view(-1, 2), b_labels.view(-1))

            # Calculate preds
            _, predicted = torch.max(logits, 1)

            # Accumulate test loss
            total_test_loss += loss.item()

        # Move logits and labels to CPU
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy()

        # Calculate f1
        total_test_f1 += f1_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

        # Calculate accuracy
        total_test_accuracy += accuracy_score(predicted, y_true)

        # Calculate precision
        total_test_precision += precision_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

        # Calculate recall
        total_test_recall += recall_score(predicted, y_true, average='weighted', labels=np.unique(predicted))

    # Report final accuracy of test run
    avg_accuracy = total_test_accuracy / len(dataloader)

    # Report final f1 of test run
    avg_test_f1 = total_test_f1 / len(dataloader)

    # Report final precision of test run
    avg_precision = total_test_precision / len(dataloader)

    # Report final recall of test run
    avg_recall = total_test_recall / len(dataloader)

    # Calculate the average loss over all of the batches
    avg_test_loss = total_test_loss / len(dataloader)

    # Record all statistics from this epoch.
    test_stats.append(
        {
            'Test Loss': avg_test_loss,
            'Test Accur.': avg_accuracy,
            'Test precision': avg_precision,
            'Test recall': avg_recall,
            'Test F1': avg_test_f1
        }
    )
    
    return None


Next, we instantiate a number of preparatory objects to help us train:

In [25]:

config1 = config()
model = CharCNN(config1).cuda()

# Set loss
criterion = nn.CrossEntropyLoss()

# Set number of epochs
epochs = 7

# Set optimizer
optimizer = AdamW(model.parameters(), lr=0.0009978734977728082, weight_decay=0.5)

# Set LR scheduler
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Create gradient scaler for mixed precision
scaler = GradScaler()


# 8 Training


Now we are ready to train our model:

In [26]:

# Create training result storage
training_stats = []
valid_stats = []
best_valid_loss = float('inf')

# For each epoch
for epoch in range(epochs):
    
    # Train
    train(model, train_dataloader, optimizer, criterion)
    
    # Validate
    validating(model, valid_dataloader, criterion)
    
    # Check validation loss
    if valid_stats[epoch]['Val Loss'] < best_valid_loss:
        best_valid_loss = valid_stats[epoch]['Val Loss']
        
        # Save best model for use later
        torch.save(model.state_dict(), '../saves/pt/char-cnn-model1.pt')


Training...


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.