In [16]:
# ONE LIBRARY REQUIRED FOR RUNNING THIS CODE! (aside from torch)
# !pip install bi_lstm_crf
# Press the 'run all' button to train the model
# To train on a custom dataset, set DATA_PATH to a text file with one hyphenated word per line (e.g. 'num-ber')

In [1]:
import torch
import pandas as pd
import numpy as np
import re
import time  # For eta

# For data management
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

# Torch imports
import torch.nn as nn
import torch.optim as optim
from bi_lstm_crf import CRF
import torch.nn.functional as F

In [5]:
# Constants
# Data / encoding settings
DATA_PATH = 'datasets/dictionarywordssample.txt'  # Word list to train on; words already contain the hyphenation marks
HYPHENATION_TOKEN = '-'  # Character that marks a hyphenation point inside a word
WINDOW_SIZE = 5  # Number of character ids in the context window built around each character

# Training / sampling settings
BATCH_SIZE = 64  # Batch size used for both the training and the test DataLoader
SPLIT = 0.1  # Fraction of the sampled words held out for testing; the rest is used for training
SUB_SAMPLE = 'FULL'  # 'FULL' uses every word; set to an integer to train on that many randomly drawn words

REMOVE_APOSTROPHE = 'NO'  # Set to 'YES' to remove apostrophe from input words

# Seed for the random train/test split; it is also written into the checkpoint filenames
SEED = 9999

In [None]:
# Pre-processing
# Read the first column of the word list. keep_default_na=False keeps words such as
# 'null' or 'NA' as text; only the literal 'xxxp' is treated as a missing value.
hyphenated_words = pd.read_csv(DATA_PATH, na_values=['xxxp'], keep_default_na=False, header=None)[0]

if REMOVE_APOSTROPHE == 'YES':
    # Series.apply returns a new Series, so the result must be assigned back;
    # without the assignment the apostrophes were never actually removed.
    hyphenated_words = hyphenated_words.apply(lambda x: x.replace("'", ''))

# Plain words (hyphenation marks stripped) define the alphabet and the padded length.
words = hyphenated_words.apply(lambda x: x.replace(HYPHENATION_TOKEN, ''))
ALPHABET = ''.join(sorted(set(''.join(words.to_list()))))
# Plain Python int so it can be used directly for list padding and tensor shapes.
MAX_TOKEN_LENGTH = int(words.str.len().max())

# Codes hyphenation into solution, e.g. 'num-ber' into '112111'
def code_solution(word, token=None):
    """Turn a hyphenated word into its per-character label string.

    Every character that is directly followed by the hyphenation token is
    labelled '2' (the token itself is dropped); every other character is
    labelled '1'.

    Args:
        word: The hyphenated word, e.g. 'num-ber'.
        token: Hyphenation marker to look for. Defaults to HYPHENATION_TOKEN.

    Returns:
        A string of '1'/'2' labels, one per remaining character.
    """
    if token is None:
        token = HYPHENATION_TOKEN
    # re.escape keeps tokens such as '.' or '|' from being read as regex syntax.
    # Labelling in a single pass also means a literal '2' inside the word is
    # labelled like any other character instead of being mistaken for a break.
    pattern = '(.)' + re.escape(token) + '|.'
    return re.sub(pattern, lambda m: '2' if m.group(1) is not None else '1', word)

# For padding / converting all input to np array
def convert(input_string, input_type, direction='code'):
    """Translate between text and padded integer sequences.

    direction='code':
        'word'     -> list of 1-based ALPHABET positions, right-padded with 0
        'solution' -> list of the label digits as ints, right-padded with 0
      Both are padded up to MAX_TOKEN_LENGTH entries.
    direction='decode':
        'word'     -> string rebuilt from the non-zero ids
        'solution' -> string of the non-zero labels

    Any other combination of arguments returns None.
    """
    encoding = direction == 'code'
    decoding = direction == 'decode'

    if encoding and input_type == 'word':
        ids = [(ALPHABET.index(ch) + 1) for ch in input_string]
        return ids + [0] * (MAX_TOKEN_LENGTH - len(input_string))
    if encoding and input_type == 'solution':
        digits = [int(ch) for ch in input_string]
        return digits + [0] * (MAX_TOKEN_LENGTH - len(input_string))
    if decoding and input_type == 'word':
        # Id 0 is padding; ids are 1-based, hence the -1.
        return ''.join(ALPHABET[value - 1] for value in input_string if value != 0)
    if decoding and input_type == 'solution':
        return ''.join(str(value) for value in input_string if value != 0)
    return None

# Shorthand for converter
def encode_x(input_string):
    """Encode a plain word as a zero-padded list of alphabet ids."""
    return convert(input_string, input_type='word', direction='code')


def encode_y(input_string):
    """Encode a label string as a zero-padded list of ints."""
    return convert(input_string, input_type='solution', direction='code')


def decode_x(input_string):
    """Rebuild the word from a sequence of alphabet ids (zeros are skipped)."""
    return convert(input_string, input_type='word', direction='decode')


def decode_y(input_string):
    """Rebuild the label string from a sequence of labels (zeros are skipped)."""
    return convert(input_string, input_type='solution', direction='decode')

# For creating windowed version of input
def expand(x_input):
    """Build one context window per position of an encoded, zero-padded word.

    The non-zero ids of x_input are framed by WINDOW_SIZE//2 pad ids on each
    side. Every non-zero position then receives the WINDOW_SIZE consecutive
    ids starting at its own offset in that framed sequence, which puts the
    character itself at column WINDOW_SIZE//2. Zero (padding) positions
    receive an all-zero row.

    Args:
        x_input: NumPy array of character ids, 0 meaning padding.

    Returns:
        NumPy array with one row of WINDOW_SIZE ids per entry of x_input.
    """
    n_symbols = len(ALPHABET)
    half_window = WINDOW_SIZE // 2
    # NOTE(review): the pad ids start at len(ALPHABET), which is also the id
    # convert() assigns to the last character of ALPHABET (index + 1), so that
    # character and the first left pad share an id. Shifting the pads by one
    # would also require a larger embedding table - confirm before changing.
    pad_ids = list(range(n_symbols, n_symbols + half_window * 2))
    left_pad, right_pad = pad_ids[0:half_window], pad_ids[half_window:]
    framed_word = np.concatenate([left_pad, x_input[x_input != 0], right_pad])

    rows = []
    offset = 0
    for token in x_input:
        if token != 0:
            rows.append(framed_word[offset:offset + WINDOW_SIZE])
            offset += 1
        else:
            rows.append([0] * WINDOW_SIZE)

    return np.array(rows)

def process_data(hyphenated_words):
    """Convert a Series of hyphenated words into model inputs and labels.

    Args:
        hyphenated_words: pandas Series of words containing HYPHENATION_TOKEN.

    Returns:
        Tuple (windows, labels):
          windows - int32 array of shape (n_words, MAX_TOKEN_LENGTH, WINDOW_SIZE)
                    holding the context window of every character position.
          labels  - array of the zero-padded label sequences, one row per word.
    """
    # Labels: hyphenated word -> label string -> padded list of ints
    encoded_labels = hyphenated_words.apply(code_solution).apply(encode_y)
    label_array = np.array(encoded_labels.to_list())

    # Inputs: strip the hyphenation marks, then encode and pad
    plain_words = hyphenated_words.apply(lambda x: x.replace(HYPHENATION_TOKEN, ''))
    encoded_words = np.array(plain_words.apply(encode_x).to_list())

    # One window per character position
    windows = np.empty((len(encoded_words), MAX_TOKEN_LENGTH, WINDOW_SIZE), dtype=np.int32)
    for row, encoded in enumerate(encoded_words):
        windows[row] = expand(encoded)

    return windows, label_array
    
# Encode the whole word list once: x holds the windowed inputs, y the padded labels
x, y = process_data(hyphenated_words)

# Print a short summary of the encoded data
for summary_line in (
    f'Total length of x/y is: {len(hyphenated_words)}',
    f'Total alphabet tokens are: {len(ALPHABET)} + 1',
    f'Alphabet characters found in source file are: {ALPHABET}',
    f'Max tokens per word is: {MAX_TOKEN_LENGTH}',
):
    print(summary_line)

In [None]:
# Resolve the requested sample size without overwriting the SUB_SAMPLE setting,
# so re-running this cell (or loading a different word list) cannot pick up a stale count.
n_samples = len(x) if SUB_SAMPLE == 'FULL' else int(SUB_SAMPLE)

# Define rng, pick random row positions (x and y are NumPy arrays, indexed by position)
rng = np.random.default_rng(seed=SEED)
indexes = rng.choice(len(x), replace=False, size=n_samples)

# First (1 - SPLIT) share of the shuffled positions is for training, the rest for testing
n_train = int(n_samples * (1 - SPLIT))
train_indexes, test_indexes = indexes[:n_train], indexes[n_train:]
x_train, y_train = x[train_indexes], y[train_indexes]
x_test, y_test = x[test_indexes], y[test_indexes]

# Print
print('Total training set length: ' + str(len(x_train)))
print('Total testing set length: ' + str(len(x_test)))

# The character itself sits in the centre column of its window.
centre_column = WINDOW_SIZE // 2

# Show one random example per split. The seeded generator is used so the examples are
# repeatable; every row (including the last one) can be drawn and empty splits are skipped.
if len(x_train) > 0:
    train_int = int(rng.integers(len(x_train)))
    print('Sample train word: ' + decode_x(x_train[train_int][:, centre_column]) + ', ' + decode_y(y_train[train_int]))
if len(x_test) > 0:
    test_int = int(rng.integers(len(x_test)))
    print('Sample test word: ' + decode_x(x_test[test_int][:, centre_column]) + ', ' + decode_y(y_test[test_int]))

In [8]:
# Pytorch data preparation
# Seed torch as well as NumPy so that batch shuffling and weight initialisation are
# repeatable for a given SEED (GPU kernels may still add small non-determinism).
torch.manual_seed(SEED)

# Size of the embedding table. Ids in use: 0 (padding), 1..len(ALPHABET) (characters) and the
# window-pad ids produced by expand(), the largest of which is
# len(ALPHABET) + 2*(WINDOW_SIZE//2) - 1. For odd WINDOW_SIZE this equals the previous
# len(ALPHABET)+WINDOW_SIZE-1; for even WINDOW_SIZE that formula was one row too small.
ALPHABET_SIZE = len(ALPHABET) + 2 * (WINDOW_SIZE // 2)

# Create dataset, loader
train_data = TensorDataset(torch.tensor(x_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.long))
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_data = TensorDataset(torch.tensor(x_test, dtype=torch.long), torch.tensor(y_test, dtype=torch.long))
# Evaluation totals do not depend on batch order, so the test set is not shuffled.
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
# Function for comparing predicted with true performance
def callback_word_char_accuracy(y_predict, y_true):
    """Count matching positions and fully matching rows between predictions and labels.

    Args:
        y_predict: Sequence of per-word tag sequences (possibly of different
            lengths, possibly empty).
        y_true: 2-D integer tensor of zero-padded labels, one row per word.

    Returns:
        Tuple (characters_correct, words_correct):
          characters_correct - number of positions where the zero-padded
              prediction equals y_true. Padded positions that are 0 in both
              count as matches.
          words_correct - number of rows that match at every position.
    """
    # Take width, dtype and device from y_true, so the function does not depend on
    # notebook globals that are only defined in other cells.
    width = y_true.size(1)
    predicted = torch.zeros((len(y_predict), width), dtype=y_true.dtype, device=y_true.device)
    for row, tags in enumerate(y_predict):
        if len(tags) > 0:
            predicted[row, :len(tags)] = torch.as_tensor(tags, dtype=y_true.dtype, device=y_true.device)

    comparison_matrix = torch.eq(predicted, y_true)

    characters_correct = comparison_matrix.sum().item()
    words_correct = comparison_matrix.all(dim=1).sum().item()
    return characters_correct, words_correct

In [None]:
# Pytorch modeling
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Layer sizes
OUT_CHANNELS = 40        # Feature maps produced by the convolution
CONV_KERNEL_SIZE = 3     # Convolution kernel width
POOL_KERNEL_SIZE = 3     # Max-pool kernel width
LSTM_UNITS = 128         # Hidden size of the LSTM, per direction
DROPOUT_EM = 0.3         # Dropout probability applied after the embedding

# Features per character fed to the LSTM: pooled window length times channel count.
# The convolution uses 'same' padding (length stays WINDOW_SIZE) and the pool runs with
# stride 1 and padding 1, which is where the "+ 2" comes from.
LSTM_IN = OUT_CHANNELS * (WINDOW_SIZE + 2 - POOL_KERNEL_SIZE + 1)

# 1. Define model
class syl_model(nn.Module):
    """Embedding -> Conv1d -> BiLSTM -> CRF tagger over character windows.

    Input is an integer tensor of shape (batch, sequence, window) holding the
    context window of every character position; id 0 marks padding. The centre
    column of each window is the character itself and is used to build the
    mask of real (non-padding) positions.
    """

    def __init__(self):
        super(syl_model, self).__init__()
        self.em = nn.Embedding(ALPHABET_SIZE, 128)
        self.dropout1 = nn.Dropout(p=DROPOUT_EM)
        self.conv1 =  torch.nn.Conv1d(in_channels=128, out_channels=OUT_CHANNELS, kernel_size=CONV_KERNEL_SIZE, stride=1, padding='same')
        self.bn1 = nn.BatchNorm1d(OUT_CHANNELS)  # Add batch normalization after conv layer
        self.pool = torch.nn.MaxPool1d(kernel_size=POOL_KERNEL_SIZE, stride=1, padding=1)

        self.lstm = nn.LSTM(LSTM_IN, LSTM_UNITS, batch_first=True, bidirectional=True)
        self.bn2 = nn.BatchNorm1d(LSTM_UNITS*2)  # Add batch normalization after LSTM
        self.crf = CRF(LSTM_UNITS*2, 3)

    def _features(self, x):
        """Run the layers shared by forward() and loss().

        Args:
            x: Integer tensor of shape (batch, sequence, window).

        Returns:
            Tuple (features, mask): the batch-normalised BiLSTM output of shape
            (batch, sequence, 2 * LSTM_UNITS) and a boolean mask of shape
            (batch, sequence) that is True for non-padding positions.
        """
        # Read the sizes from the input instead of notebook globals.
        batch_size, seq_len, window = x.shape
        # The centre column of each window is the character itself; 0 means padding.
        mask = x[:, :, window // 2].gt(0)

        # Treat every character window as an independent short sequence for the convolution.
        x = x.reshape(batch_size * seq_len, window)
        x = self.em(x)
        x = self.dropout1(x)
        x = x.permute(0, 2, 1)  # (N, window, emb) -> (N, emb, window) as Conv1d expects channels first
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.pool(x)
        x = torch.flatten(x, start_dim=1)
        # Back to one feature vector per character position of each word.
        x = x.reshape(batch_size, seq_len, -1)
        x, _ = self.lstm(x)

        x = x.permute(0, 2, 1)  # Change shape for batch norm
        x = self.bn2(x)  # Apply batch norm after LSTM
        x = x.permute(0, 2, 1)  # Change shape back
        return x, mask

    def forward(self, x):
        """Decode tag sequences for a batch of windowed words.

        Returns:
            Tuple (scores, tag_seq) exactly as returned by the CRF layer.
        """
        features, mask = self._features(x)
        scores, tag_seq = self.crf(features, mask)
        return scores, tag_seq

    def loss(self, x, tags):
        """Compute the CRF loss of the gold tags for a batch of windowed words.

        Args:
            x: Integer tensor of shape (batch, sequence, window).
            tags: Gold label tensor passed through to the CRF layer.

        Returns:
            The value returned by the CRF layer's loss().
        """
        features, mask = self._features(x)
        loss = self.crf.loss(features, tags, mask)
        return loss

# 2. Create instance, set compiler / loss calculator
syl_model_instance = syl_model().to(device)
print(syl_model_instance)

# 3. Set optimizer
optimizer = optim.Adam(syl_model_instance.parameters(),
                      lr=3e-4)

# 4. Train model
N_EPOCHS = 14
print('Starting training...')

n_batches = len(train_loader)
n_test_words = len(test_data)

for epoch in range(N_EPOCHS):

    # Training
    batch = 0
    start_time = time.time()

    train_loss = 0.0
    syl_model_instance.train()
    for inputs, labels in train_loader:

        inputs = inputs.to(device)
        labels = labels.to(device)

        batch += 1
        optimizer.zero_grad()  # Reset optimizer
        loss = syl_model_instance.loss(inputs, labels)  # Forward pass

        loss.backward()  # Backpass
        optimizer.step()  # Update

        train_loss += loss.item()
        # Progress bar is 50 characters wide; the total is the real number of batches
        # (it used to be shown as len(train_loader)+1, so the bar never read N/N).
        progress = round(50 * batch / n_batches)
        eta_seconds = (time.time() - start_time) / batch * (n_batches - batch)
        eta_minutes = int(eta_seconds // 60)  # Get the number of minutes
        eta_remaining_seconds = int(eta_seconds % 60)  # Get the remaining seconds

        # Print progress bar with ETA in minutes:seconds format
        print('Epoch ' + str(epoch+1) + ': [' + progress * '=' + (50 - progress) * '.' +
              '] ' + str(batch) + '/' + str(n_batches) +
              f' | ETA: {eta_minutes}:{eta_remaining_seconds:02d}', end='\r')

    # Validation
    characters_correct = 0
    words_correct = 0

    syl_model_instance.eval()

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for inputs, labels in test_loader:

            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = syl_model_instance(inputs)

            chars_valid, words_valid = callback_word_char_accuracy(outputs[1], labels)
            characters_correct += chars_valid
            words_correct += words_valid

    # The progress line ended with '\r', so this 'E' lands on the leading 'E' of
    # 'Epoch' and the newline then keeps the finished progress bar visible.
    print('E')  # To circumvent the '\r'
    print('Mean training loss: {}'.format(train_loss / max(n_batches, 1)))
    # NOTE: characters_correct also counts padded positions (prediction and label both 0),
    # which is why the denominator is MAX_TOKEN_LENGTH per word rather than the word length.
    # max(..., 1) avoids a ZeroDivisionError when the test split is empty.
    print("Character accuracy: {}\nWord accuracy: {}".format(characters_correct/(MAX_TOKEN_LENGTH*max(n_test_words, 1)),
                                                              words_correct/max(n_test_words, 1)))
    print('---')

    # Save model for epoch
    save_filename = 'dutch_seed' + str(SEED) + '_epoch_' + str(epoch+1) + '.pt'
    torch.save(syl_model_instance.state_dict(), save_filename)

In [None]:
# Input one or more words to syllabificate after training
word_list = 'prima goed leuk en dergelijke andere woorden'
input_words = word_list.split()

# Fail early with a readable message: the encoder only knows the characters seen in the
# training file and pads every word to MAX_TOKEN_LENGTH.
for input_word in input_words:
    unknown_chars = sorted(set(input_word) - set(ALPHABET))
    if unknown_chars:
        raise ValueError("'" + input_word + "' contains characters that are not in ALPHABET: " + ''.join(unknown_chars))
    if len(input_word) > MAX_TOKEN_LENGTH:
        raise ValueError("'" + input_word + "' is longer than MAX_TOKEN_LENGTH (" + str(MAX_TOKEN_LENGTH) + ')')

# Encode and window the words the same way as the training data
target_words = np.array([encode_x(input_word) for input_word in input_words])
target_words_windowed = np.empty((len(target_words), MAX_TOKEN_LENGTH, WINDOW_SIZE),dtype=np.int32)
for i in range(len(target_words)):
    target_words_windowed[i] = expand(target_words[i])
# The model lives on `device`, so the input has to be moved there too
# (otherwise the forward pass fails when training ran on the GPU).
target_words_windowed = torch.tensor(target_words_windowed, dtype=torch.long).to(device)

# Run words through model
syl_model_instance.eval()
with torch.no_grad():  # Inference only, no gradients needed
    target_outputs = syl_model_instance(target_words_windowed)
target_outputs = target_outputs[1]  # Second element holds the decoded tag sequence of each word

# Output as string
for input_word, tags in zip(input_words, target_outputs):
    current_word = []
    for position, char in enumerate(input_word):
        # Tag 2 marks a character that is followed by a hyphenation point;
        # positions without a predicted tag are treated as "no break".
        tag = int(tags[position]) if position < len(tags) else 0
        current_word.append(char + '-' if tag == 2 else char)
    print(''.join(current_word))