In [161]:
import json
import re
import spacy
from typing import Counter
import torch
from torch.utils.data import Dataset, DataLoader
import re
from collections import Counter
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import numpy as np
from PIL import Image
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader, Dataset
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter
import time
import os
import copy
from torchvision import models
import torchvision.utils
import torchvision.datasets as dsets
import torchvision.transforms as transforms


In [168]:
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# Data Extraction

#### Functions for processing the data

In [130]:
spanish_processer = spacy.load('es_core_news_md')

def extract_medical_texts_and_predictions(dataset):
    medical_texts = []
    predictions = []

    for data_element in dataset:
        medical_text = data_element['data']['text']
        medical_texts.append(medical_text)

        for result_element in data_element['predictions']:
            predictions.append(result_element['result'])

    return medical_texts, predictions


def remove_residual_chars(text):
    """
    Substitute all characters that are not letters, digits, whitespaces or allowed punctuation with a whitespace.
    """
    allowed_punctuation = r'\.,;:\"!'
    pattern = f'[^{allowed_punctuation}\\w\\s]'
    return re.sub(pattern, ' ', text)


def extract_word_positions(text):
    pattern = re.compile(r'\w+|[^\w\s]')
    matches = pattern.finditer(text)

    words_with_indices = {match.group(): {'start': match.start(), 'end': match.end()} for match in matches}

    return words_with_indices


def is_word(token):
    """
    Check if the word is a punctuation mark or a whitespace.
    """
    pattern = re.compile(r"[a-zA-Z]|\d")
    
    return pattern.search(token)


def preprocess_and_tokenize(base_text):
    text = remove_residual_chars(base_text)
    
    doc = spanish_processer(text)
    token_sent = [[token for token in sentence if is_word(token.text)] for sentence in doc.sents]

    token_sent = clear_processed_text(token_sent)
    
    token_sent = [[extract_features(sentence, j) for j in range(len(sentence))] for sentence in token_sent]

    return token_sent


def extract_features(sentence, i):
    token = sentence[i]
    word = token.text

    features = {
        'word': word,
        'word_lower': word.lower(),
        'is_capitalized': word[0].isupper(),
        'is_all_caps': word.isupper(),
        'is_digit': word.isdigit(),
        'word_length': len(word),
        'contains_digits': bool(re.search(r'\d', word)),
        'pos': token.pos_,
        'lemma': token.lemma_,
        'start-end': {'start': token.idx, 'end': token.idx + len(word)}
    }

    features["prefix_2"] = word[:2]
    features["suffix_2"] = word[-2:]

    return features

def clear_processed_text(processed_text):
    """
    Remove empty sentences from the processed text. 
    """
    return [sentence for sentence in processed_text if sentence]

def extract_cues_and_scopes(document):
    """
    Extract negations, uncertain cues, negation scopes and uncertain scopes from the document.
    """
    negations = [result_element for result_element in document["predictions"][0]["result"] if "NEG" in result_element["value"]["labels"]]
    uncertains = [result_element for result_element in document["predictions"][0]["result"] if "UNC" in result_element["value"]["labels"]]
    nscopes = [result_element for result_element in document["predictions"][0]["result"] if "NSCO" in result_element["value"]["labels"]]
    uscopes = [result_element for result_element in document["predictions"][0]["result"] if "USCO" in result_element["value"]["labels"]]

    # Sort the cues and scopes by their start position
    negations.sort(key=lambda x: x["value"]["start"])
    uncertains.sort(key=lambda x: x["value"]["start"])
    nscopes.sort(key=lambda x: x["value"]["start"])
    uscopes.sort(key=lambda x: x["value"]["start"])

    return negations, uncertains, nscopes, uscopes


def BIO_tagging(tokens, labels, label_name, original_text):
    """
    Set BIO tags for the tokens based on the labels from the training data.
    
    Args:
    - tokens (list): A list of token dictionaries
    - labels (list): A list of labels from the training data
    - label_name (str): The name of the label to use for tagging (e.g., "NEG")
    - original_text (str): The original text of the document
    """
     
    negation_idx = 0

    for token in tokens:
        if 'tag' not in token:
            token['tag'] = 'O'

        if negation_idx >= len(labels):
            continue

        negation = labels[negation_idx]

        neg_start = negation['value']['start']
        neg_end = negation['value']['end']

        # Skip whitespace characters at the start and end positions
        if original_text[neg_start] == ' ':
            neg_start += 1
        if original_text[neg_end - 1] == ' ':
            neg_end -= 1

        token_start = token['start-end']['start']
        token_end = token['start-end']['end']

        if token_start == neg_start:
            token['tag'] = f'B-{label_name}'
        elif neg_start < token_start < neg_end:
            # If the token is at the end of the negation, set the tag to E
            if token_end == neg_end:
                token['tag'] = f'E-{label_name}'
            else:
                token['tag'] = f'I-{label_name}'
        elif token_start > neg_end:
            # Move to the next negation that starts after the current token
            while negation_idx < len(labels) - 1 and token_start > labels[negation_idx]['value']['start']:
                negation_idx += 1
                negation = labels[negation_idx]
            
            if token_start == negation['value']['start']:
                token['tag'] = f'B-{label_name}'
    
def add_BIO_tags(X_train_texts, Y_train, medical_texts_train, verbose=False):
    """
    Set BIO tags for the tokens in the training data based on the labels from the training data.
    
    Args:
    - X_train_texts (list): A list of tokenized sentences
    - Y_train (list): A list of labels from the training data
    - medical_texts_train (list): A list of medical texts
    - verbose (bool): Whether to print the tagged sentences
    
    Returns:
    """
    
    for text_idx, (train_text, train_labels, original_text) in enumerate(list(zip(X_train_texts, Y_train, medical_texts_train))):
        negs, uncs, nscs, uscs = train_labels
        for i in range(len(train_text)):
            BIO_tagging(train_text[i], negs, "NEG", original_text)
            BIO_tagging(train_text[i], uncs, "UNC", original_text)
            BIO_tagging(train_text[i], nscs, "NSCO", original_text)
            BIO_tagging(train_text[i], uscs, "USCO", original_text)

        if verbose:
            print("Text: ", text_idx, " : ", original_text[:100], "...")
            for i, sentence in enumerate(train_text[:2]):
                print("Sentence: ", i)
                for token in sentence[:5]:
                    print(token)
            print("<--------------------------------------->")    
            
    return X_train_texts

Extracting the data from files, tokenizing and tagging

In [131]:
with open('train_data.json', 'r', encoding='utf-8') as train_file:
    train_dataset = json.load(train_file)

    with open('test_data.json', 'r', encoding='utf-8') as test_file:
        test_dataset = json.load(test_file)

    medical_texts_train, predictions_train = extract_medical_texts_and_predictions(train_dataset)
    medical_texts_test, predictions_test = extract_medical_texts_and_predictions(test_dataset)

    limit = -1  # Limit the number of documents to process (set to -1 to process all documents)

    print("Preprocessing text and extracting cues...")

    processed_text_train = [preprocess_and_tokenize(text) for text in medical_texts_train[:limit]]
    processed_text_test = [preprocess_and_tokenize(text) for text in medical_texts_test[:limit]]
    
    Y_train = [extract_cues_and_scopes(document) for document in train_dataset[:limit]]
    Y_test = [extract_cues_and_scopes(document) for document in test_dataset[:limit]]

    print("Tagging tokens...")

    tagged_texts_train = add_BIO_tags(processed_text_train, Y_train, medical_texts_train)
    tagged_texts_test = add_BIO_tags(processed_text_test, Y_test, medical_texts_test)
   
    print("Number of training documents:", len(medical_texts_train))
    print("Number of test documents:", len(medical_texts_test))
    print("Limiting to", limit, "documents...")

Preprocessing text and extracting cues...
Tagging tokens...
Number of training documents: 254
Number of test documents: 64
Limiting to -1 documents...


In [132]:
print([token['word'] for token in tagged_texts_train[0][0]])
print([token['word'] for token in tagged_texts_test[0][0]])

['nº', 'historia', 'clinica', 'nºepisodi', 'sexe', 'home', 'data', 'de', 'naixement', '16.05.1936', 'edat', '82', 'anys', 'procedencia', 'cex', 'mateix', 'hosp', 'servei', 'urologia', 'data', 'd', 'ingres', '24.07.2018', 'data', 'd', 'alta', '25.07.2018', '08:54:04', 'ates', 'per', 'informe', 'd', 'alta', 'd', 'hospitalitzacio', 'motiu', 'd', 'ingres', 'paciente', 'que', 'ingresa', 'de', 'forma', 'programada', 'para', 'realizacion', 'de', 'uretrotomia', 'interna']
['nº', 'historia', 'clinica', 'nºepisodi', 'sexe', 'dona', 'data', 'de', 'naixement', '12.05.1977', 'edat', '42', 'anys', 'procedencia', 'aguts', 'servei', 'obstetricia', 'data', 'd', 'ingres', '27.09.2019', 'data', 'd', 'alta', '01.10.2019', '13:00:00', 'ates', 'per', 'informe', 'd', 'alta', 'd', 'hospitalitzacio', 'motiu', 'd', 'ingres', 'induccion', 'al', 'parto', 'por', 'pequeño', 'para', 'la', 'edad', 'gestacional', 'peg', 'antecedents', 'no', 'alergias', 'medicamentosas', 'conocidas', 'antcededentes', 'medico', 'quirurg

In [133]:
# Concatenate train sentences
tmp = []

for text in tagged_texts_train:
    tmp.extend(text)
    
tagged_sentences_train = tmp

# Concatenate test sentences

tmp = []

for text in tagged_texts_test:
    tmp.extend(text)
    
tagged_sentences_test = tmp

#### Vocabularies

Word vocabulary

In [134]:
class Vocabulary:
    def __init__(self,freq_threshold):
        #setting the pre-reserved tokens int to string tokens
        self.ind2word = {0:"<pad>",1:"<start>",2:"<end>",3:"<unk>",4:"<date>"}
        
        #string to int tokens
        #its reverse dict self.itos
        self.word2ind = {v:k for k,v in self.ind2word.items()}
        
        self.freq_threshold = freq_threshold
        
    def __len__(self): return len(self.itos)
    
    @staticmethod
    def tokenize(text):
        tokens = []
        
        for token in text:
            # If it is a date, replace it with <date> token
            if token['pos'] == 'NUM' and re.match(r'\d{2,4}(-|\/|.)\d{2}(-|\/|.)\d{2,4}', token['word']):
                tokens.append('<date>')
            else:
                tokens.append(token['word_lower'])
        
        return tokens
    
    def build_vocab(self, sentence_list):
        frequencies = Counter()
        idx = 5
        
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                frequencies[word] += 1
                
                #add the word to the vocab if it reaches minimum frequecy threshold
                if frequencies[word] == self.freq_threshold and word not in self.word2ind:
                    self.word2ind[word] = idx
                    self.ind2word[idx] = word
                    idx += 1
                
    def numericalize(self,text):
        # change word with index
        tokenized_text = self.tokenize(text)
        return [ self.word2ind[token] if token in self.word2ind else self.word2ind["<unk>"] for token in tokenized_text ]  

In [135]:
word_vocab = Vocabulary(freq_threshold=1)
word_vocab.build_vocab(tagged_sentences_train)
print("Word vocab size: ", len(word_vocab.word2ind))
print(word_vocab.word2ind)
print([token['word_lower'] for token in tagged_sentences_train[0]])
print(word_vocab.numericalize(tagged_sentences_train[0]))

Word vocab size:  17232
{'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3, '<date>': 4, 'nº': 5, 'historia': 6, 'clinica': 7, 'nºepisodi': 8, 'sexe': 9, 'home': 10, 'data': 11, 'de': 12, 'naixement': 13, 'edat': 14, '82': 15, 'anys': 16, 'procedencia': 17, 'cex': 18, 'mateix': 19, 'hosp': 20, 'servei': 21, 'urologia': 22, 'd': 23, 'ingres': 24, 'alta': 25, 'ates': 26, 'per': 27, 'informe': 28, 'hospitalitzacio': 29, 'motiu': 30, 'paciente': 31, 'que': 32, 'ingresa': 33, 'forma': 34, 'programada': 35, 'para': 36, 'realizacion': 37, 'uretrotomia': 38, 'interna': 39, 'antecedents': 40, 'alergia': 41, 'a': 42, 'penicilina': 43, 'y': 44, 'cloramfenicol': 45, 'no': 46, 'habitos': 47, 'toxicos': 48, 'antecedentes': 49, 'medicos': 50, 'bloqueo': 51, 'auriculoventricular': 52, 'primer': 53, 'grado': 54, 'hipertension': 55, 'arterial': 56, 'diverticulosis': 57, 'extensa': 58, 'insuficiencia': 59, 'renal': 60, 'cronica': 61, 'colelitiasis': 62, 'quirurgicos': 63, 'exeresis': 64, 'lesiones': 65, '

POS Tag vocabulary

In [136]:
class POS_Vocabulary:
    def __init__(self,freq_threshold):
        #setting the pre-reserved tokens int to string tokens
        self.ind2word = {0:"<pad>",1:"<start>",2:"<end>",3:"<unk>"}
        
        #string to int tokens
        #its reverse dict self.itos
        self.word2ind = {v:k for k,v in self.ind2word.items()}
        
        self.freq_threshold = freq_threshold
        
    def __len__(self): return len(self.itos)
    
    @staticmethod
    def tokenize(text):
        tokens = []
        
        for token in text:
            tokens.append(token['pos'])
        
        return tokens
    
    def build_vocab(self, sentence_list):
        frequencies = Counter()
        idx = 5
        
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                frequencies[word] += 1
                
                #add the word to the vocab if it reaches minimum frequecy threshold
                if frequencies[word] == self.freq_threshold and word not in self.word2ind:
                    self.word2ind[word] = idx
                    self.ind2word[idx] = word
                    idx += 1
                
    def numericalize(self,text):
        # change word with index
        tokenized_text = self.tokenize(text)
        return [ self.word2ind[token] if token in self.word2ind else self.word2ind["<unk>"] for token in tokenized_text ]  

In [137]:
pos_vocab = POS_Vocabulary(freq_threshold=1)
pos_vocab.build_vocab(tagged_sentences_train)
print("POS vocab size: ", len(pos_vocab.word2ind))
print(pos_vocab.word2ind)
print([token['pos'] for token in tagged_sentences_train[0]])
print(pos_vocab.numericalize(tagged_sentences_train[0]))

POS vocab size:  21
{'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3, 'NOUN': 5, 'ADJ': 6, 'VERB': 7, 'ADP': 8, 'PROPN': 9, 'NUM': 10, 'PRON': 11, 'SCONJ': 12, 'CCONJ': 13, 'ADV': 14, 'AUX': 15, 'DET': 16, 'X': 17, 'PUNCT': 18, 'INTJ': 19, 'SYM': 20, 'PART': 21}
['NOUN', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'VERB', 'ADP', 'PROPN', 'NUM', 'PRON', 'NUM', 'PROPN', 'NOUN', 'ADJ', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'VERB', 'PROPN', 'PROPN', 'NUM', 'VERB', 'PROPN', 'NOUN', 'NUM', 'NUM', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADJ', 'SCONJ', 'VERB', 'ADP', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'ADP', 'NOUN', 'ADJ']
[5, 5, 6, 5, 5, 5, 7, 8, 9, 10, 11, 10, 9, 5, 6, 9, 9, 9, 9, 7, 9, 9, 10, 7, 9, 5, 10, 10, 5, 8, 5, 9, 9, 9, 9, 9, 9, 9, 6, 12, 7, 8, 5, 6, 8, 9, 8, 5, 6]


Character vocabulary(set of characters)

In [138]:
# Create a set of all characters in the vocabulary
chars = set('\0')   # Start with the null character which is used for padding
longest_word = 0

for sentence in tagged_sentences_train:
    for token in sentence:
        # Check if the current word is the longest one
        if len(token['word_lower']) > longest_word:
            longest_word = len(token['word_lower'])
            
        # Add all characters in the word to the set
        chars.update(token['word_lower'])
        
        
chars = list(chars)

In [139]:
def char2ind(sentence):
    """
    Convert the characters in the sentence to their corresponding indices in the character vocabulary.
    """
    sentence_chars = []
    
    for token in sentence:
        token_chars = []
        
        for char in token['word_lower']:
            token_chars.append(chars.index(char))
        
        # Add padding to the end of the word
        for _ in range(longest_word - len(token['word_lower'])):
            token_chars.append(0)
        
        sentence_chars.append(token_chars)
        
    return sentence_chars

In [140]:
def word2casings(sentence):
    """
    Create one hot encoding for the casing of the token.

    The casing features are:
    - Contains digits
    - Contains punctuation
    - First word in the sentence
    - Last word in the sentence
    """
    casings = []
    
    for idx, token in enumerate(sentence):
        casing = []
        casing.append(1 if any(char.isdigit() for char in token['word_lower']) else 0)
        casing.append(1 if any(not char.isalnum() for char in token['word_lower']) else 0)
        casing.append(1 if idx == 0 else 0)
        casing.append(1 if idx == len(sentence) - 1 else 0)
        casings.append(casing)
        
    return casings

In [141]:
# Longest sentence in the processed text with start and end tokens
longest_sentence = max(len(sentence) for sentence in tagged_sentences_train) + 2
print("Longest sentence:", longest_sentence)

Longest sentence: 268


In [296]:
# Find longest sentence with its idx
longest_sentence_idx = np.argmax([len(sentence) for sentence in tagged_sentences_train])
print("Longest sentence idx:", longest_sentence_idx)
print([token['word'] for token in tagged_sentences_train[longest_sentence_idx]])

Longest sentence idx: 4345
['orientacio', 'diagnostica', 'r10.9', 'dolor', 'abdominal', 'no', 'especificat', 'tractament', 'i', 'recomanacions', 'a', 'l', 'alta', 'adjuntamos', 'tratamiento', 'al', 'alta', 'omeprazol', '20', 'mg', 'caps', '20', 'mg', 'e', 'aa', 'or', '09', '01', '2019', '15', '173d', 's', 'hioscina', 'butilbromur', '20', 'mg', '1', 'ml', 'amp', 'buscapina', '20', 'mg', '8asp', 'piv', '18', '01', '2019', '16', '173d', 's', 'metoclopramida', '10', 'mg', '2', 'ml', 'amp', '10', 'mg', '8sp', 'piv', '14', '01', '2019', '16', '173d', 's', 'ondansetron', '4', 'mg', '2', 'ml', 'amp', '4', 'mg', '8asp', 'piv', '02', '01', '2019', '12', '173d', 's', 'lactulosa', '10', 'g', '15', 'ml', 'sobre', '10', 'g', 'edssp', 'or', '14', '01', '2019', '16', '173d', 's', 'enoxaparina', '60', 'mg', '0,6', 'ml', 'xeringa', '60', 'mg', '9', '21', 'sc', '19', '01', '2019', '17', '173d', 's', 'betametasona', '0,05', '30', 'g', 'crema', '1', 'apl', '12', 'to', '20', '01', '2019', '20', '173d', 's',

BIO Tag vocabulary

In [142]:
class Tag_Vocabulary:
    def __init__(self):
        #setting the pre-reserved tokens int to string tokens
        self.ind2word = {0:"O",1:"B-NEG",2:"I-NEG",3:"E-NEG",4:"B-UNC",5:"I-UNC",6:"E-UNC",7:"B-NSCO",8:"I-NSCO",9:"E-NSCO",10:"B-USCO",11:"I-USCO",12:"E-USCO"}
        
        #string to int tokens
        #its reverse dict self.itos
        self.word2ind = {v:k for k,v in self.ind2word.items()}
        
    def __len__(self): return len(self.itos)
    
    @staticmethod
    def tokenize(text):
        tokens = []
        
        for token in text:
            tokens.append(token['tag'])
        
        return tokens
    
    def numericalize(self,text):
        # change word with index
        tokenized_text = self.tokenize(text)
        return [ self.word2ind[token] if token in self.word2ind else self.word2ind["O"] for token in tokenized_text ]  

In [143]:
tag_vocab = Tag_Vocabulary()

#### Create Dataset

In [144]:
def numericalize_sents(sentences):
    """
    Convert the sentences to numerical form.
    """
    numericalized_sents = []
    
    for sentence in sentences:
        numericalized_sent = []
        
        word_encoding = word_vocab.numericalize(sentence)
        pos_encoding = pos_vocab.numericalize(sentence)
        char_encoding = char2ind(sentence)
        casing_encoding = word2casings(sentence)
        
        numericalized_sent = list(zip(word_encoding, pos_encoding, char_encoding, casing_encoding))
        
        # Add start and end tokens
        numericalized_sent.insert(0, [word_vocab.word2ind['<start>'], pos_vocab.word2ind['<start>'], [0] * longest_word, [0, 0, 0, 0]])
        numericalized_sent.append([word_vocab.word2ind['<end>'], pos_vocab.word2ind['<end>'], [0] * longest_word, [0, 0, 0, 0]])
        
        # Fill with padding
        for _ in range(longest_sentence - len(numericalized_sent)):
            numericalized_sent.append([ word_vocab.word2ind['<pad>'], pos_vocab.word2ind['<pad>'], [0] * longest_word, [0, 0, 0, 0]])
            
        numericalized_sents.append(numericalized_sent)
        
    return numericalized_sents
        

In [151]:
# Transform the tagged sentences to numericalized format
X_train = []
X_test = []

X_train = numericalize_sents(tagged_sentences_train)
X_test = numericalize_sents(tagged_sentences_test)

In [152]:
print(X_train[0])
print(X_test[0])

word_sent_train = [word_vocab.ind2word[word] for word, _, _, _ in X_train[0]]
word_sent_test = [word_vocab.ind2word[word] for word, _, _, _ in X_test[0]]

original_sent_train = [token['word'] for token in tagged_sentences_train[0]]
original_sent_test = [token['word'] for token in tagged_sentences_test[0]]

print(word_sent_train)
print(word_sent_test)

print(original_sent_train)
print(original_sent_test)

[[1, 1, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], (5, 5, [34, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0]), (6, 5, [6, 48, 36, 41, 44, 20, 48, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (7, 6, [16, 28, 48, 34, 48, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (8, 5, [34, 15, 18, 19, 48, 36, 44, 47, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (9, 5, [36, 18, 10, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (10, 5, [6, 44, 21, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (11, 7, [47, 4, 41, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (12, 8, [47, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0

Transform the tags to numericalized format

In [153]:
def numericalize_tags(tagged_sentences, tag_vocab):
    Y = []

    # Transform the tags to numericalized format
    for sentence in tagged_sentences:
        sentence_tags = []
        
        tag_encoding = tag_vocab.numericalize(sentence)
        
        for i, token in enumerate(sentence):
            sentence_tags.append(tag_encoding[i])
            
        # Add start and end tags to the sentence
        start_token = tag_vocab.word2ind['O']
        end_token = tag_vocab.word2ind['O']

        sentence_tags.insert(0, start_token)
        sentence_tags.append(end_token)
        
        # Fill with padding to the longest sentence
        for _ in range(longest_sentence - len(sentence_tags)):
            padding_token = tag_vocab.word2ind['O']
            sentence_tags.append(padding_token)
            
        Y.append(sentence_tags)
        
    return Y

In [154]:
Y_train = []
Y_test = []

Y_train = numericalize_tags(tagged_sentences_train, tag_vocab)
Y_test = numericalize_tags(tagged_sentences_test, tag_vocab)

In [155]:
print(X_train[0][:3])
print(Y_train[2])

word_sent_test = [word_vocab.ind2word[word] for word, _, _, _ in X_test[0]]

print(word_sent_test)
print(Y_test[0])

[[1, 1, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], (5, 5, [34, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0]), (6, 5, [6, 48, 36, 41, 44, 20, 48, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0])]
[0, 1, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

#### Data Loaders

In [331]:
batch_size = 64

# Create the dataloaders

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train

    def __len__(self):
        return len(self.Y_train)

    def __getitem__(self, idx):
        X = self.X_train[idx]
        y = self.Y_train[idx]
        
        # Word encoding tensor
        word_tensor = torch.tensor([x[0] for x in X], dtype=torch.long)
        
        # POS encoding tensor
        pos_tensor = torch.tensor([x[1] for x in X], dtype=torch.long)
        
        # Character encoding tensor
        char_tensor = torch.tensor([x[2] for x in X], dtype=torch.long)
        
        # Casing encoding tensor
        casing_tensor = torch.tensor([x[3] for x in X], dtype=torch.long)
                
        # Convert y to tensor
        y_tensor = torch.tensor(y, dtype=torch.long)
        
        return word_tensor, pos_tensor, char_tensor, casing_tensor, y_tensor
    
train_dataset = CustomDataset(X_train, Y_train)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [157]:
# Get a batch of data
X_word, X_pos, X_char, X_casing, y_batch = next(iter(train_dataloader))

print("Word tensor shape:", X_word.shape)
print("POS tensor shape:", X_pos.shape)
print("Character tensor shape:", X_char.shape)
print("Casing tensor shape:", X_casing.shape)
print("Tag tensor shape:", y_batch.shape)

Word tensor shape: torch.Size([2, 268])
POS tensor shape: torch.Size([2, 268])
Character tensor shape: torch.Size([2, 268, 28])
Casing tensor shape: torch.Size([2, 268, 4])
Tag tensor shape: torch.Size([2, 268])


In [158]:
print(X_test[0])
print(Y_test[0])

[[1, 1, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], (5, 5, [34, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0]), (6, 5, [6, 48, 36, 41, 44, 20, 48, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (7, 6, [16, 28, 48, 34, 48, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (8, 5, [34, 15, 18, 19, 48, 36, 44, 47, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (9, 5, [36, 18, 10, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (325, 7, [47, 44, 34, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (11, 7, [47, 4, 41, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]), (12, 8, [47, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 

# Model Definition

In [332]:
class Model(nn.Module):
    def __init__(self,vocab_dim_ch,vocab_dim_pos,vocab_dim_word, max_sent_legnth, max_Xch_len,len_X_casing, embedding_dim_ch_pos,embedding_dim, hidden_dim, n_layers,tag_dim, drop_prob=0.):
        super(Model, self).__init__()

        self.fc1 = nn.Linear(hidden_dim,hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, tag_dim)

        self.prelu = nn.PReLU()
        #####################################33
        self.embedding_ch = nn.Embedding(vocab_dim_ch,embedding_dim_ch_pos)
        self.conv2d = nn.Conv2d(in_channels=max_sent_legnth,out_channels=max_sent_legnth,kernel_size=3)
        self.dropout2d = nn.Dropout2d(p=0.5)
        self.maxpool = nn.MaxPool2d(2,stride = 2)
        self.flatten = nn.Flatten(2,3)
        self.fc_ch = nn.Linear(312,300)
        self.fc_pos = nn.Linear(embedding_dim_ch_pos,300)
        self.fc_one_hot = nn.Linear(len_X_casing,300)

        self.embedding_pos = nn.Embedding(vocab_dim_pos,embedding_dim_ch_pos)
        self.embedding_word =nn.Embedding(vocab_dim_word, embedding_dim)

        self.dropout = nn.Dropout(0.4)
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True, bidirectional=False)


    def forward(self, x, h, c=None):
        emb_0 = self.embedding_ch(x[0])
        
        # print("emb_0 shape", emb_0.shape)
        
        out_0 = self.conv2d(emb_0)
        
        # print("out_0 shape", out_0.shape)
        
        out_0 = self.prelu(out_0)
        out_0 = self.dropout2d(out_0)
        out_0 = self.maxpool(out_0)
        
        out_0 = self.flatten(out_0)
        
        # print("out_0 shape", out_0.shape)
        
        out_0 = self.fc_ch(out_0)
        out_0 = self.prelu(out_0)

        out_1 = self.embedding_pos(x[1])
        out_1 = self.fc_pos(out_1)
        out_1 = self.prelu(out_1)

        out_2 = self.embedding_word(x[2])

        out_3 = self.fc_one_hot(x[3])
        out_3 = self.prelu(out_3)

        # print("out_0 shape", out_0.shape)
        # print("out_1 shape", out_1.shape)
        # print("out_2 shape", out_2.shape)
        # print("out_3 shape", out_3.shape)

        out = out_0 + out_1 + out_2 + out_3

        # print("h shape", h.shape)
        # print("c shape", c.shape)   
        
        out, (h, c) = self.lstm(out, (h, c))
        
        # print("out shape", out.shape)
        
        out = self.fc1(out)
        out = self.prelu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out, h, c

    def init_hidden(self, batch_size):
        " Initialize the hidden state of the LSTM to zeros"
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [355]:
def train(dataloader, model, batch_size, num_epochs):
    model.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    losses = []
    for epoch in range(num_epochs):

        for batch, (X_word, X_pos, X_char, X_casing, y) in enumerate(dataloader):

            h_state, c_state = model.init_hidden(batch_size)
            h_state = h_state.to(device)

            if c_state is not None:
                c_state = c_state.to(device)

            X_casing = X_casing.float()

            X_word = X_word.to(device)
            X_pos = X_pos.to(device)
            X_char = X_char.to(device)
            X_casing = X_casing.to(device)
            y = y.to(device)
            optimizer.zero_grad()

            y_pred, h_state, c_state = model((X_char, X_pos, X_word, X_casing),  h_state, c_state) # in LSTM we have a cell state and a hidden state

            y_pred = y_pred.view(-1, y_pred.shape[2])
            y = y.view(-1)
            
            loss = criterion(y_pred, y)

            loss.backward()
            optimizer.step()

            if batch%30 == 0:
                print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
                losses.append(loss.item())
    return losses

In [356]:
vocab_dim_ch=len(chars)
vocab_dim_pos=len(pos_vocab.word2ind)
vocab_dim_word=len(word_vocab.word2ind)
max_Xch_len=longest_word
len_X_casing=4  # casing features
embedding_dim_ch_pos = 50
embedding_dim = 300
hidden_dim = 128
n_layers = 1
tag_dim = 13
drop_prob= 0.4
num_epochs = 30
13

model = Model(vocab_dim_ch,vocab_dim_pos,vocab_dim_word, longest_sentence, max_Xch_len,len_X_casing, embedding_dim_ch_pos,embedding_dim, hidden_dim, n_layers,tag_dim, drop_prob).to(device)
losses = train(train_dataloader, model, batch_size, num_epochs)

{'epoch': 0, 'batch': 0, 'loss': 2.598714590072632}
{'epoch': 0, 'batch': 30, 'loss': 0.05944317206740379}


KeyboardInterrupt: 

In [329]:
with torch.no_grad():
    model.eval()
    
    # Get a batch of data
    X_word, X_pos, X_char, X_casing, y_batch = next(iter(train_dataloader))
    
    # Convert to float
    # X_char = X_char.float()
    X_casing = X_casing.float()
    
    h_state, c_state = model.init_hidden(2)
    
    print("Casing type:", X_casing.dtype)
    
    pred, h_state, c_state = model((X_char, X_pos, X_word, X_casing), h_state, c_state)
    
    print("Predictions shape:", pred.shape)
    print("Predictions:", pred)
    
    

Casing type: torch.float32
emb_0 shape torch.Size([2, 268, 28, 50])
out_0 shape torch.Size([2, 268, 26, 48])
out_0 shape torch.Size([2, 268, 312])
out_0 shape torch.Size([2, 268, 300])
out_1 shape torch.Size([2, 268, 300])
out_2 shape torch.Size([2, 268, 300])
out_3 shape torch.Size([2, 268, 300])
h shape torch.Size([1, 2, 128])
c shape torch.Size([1, 2, 128])
out shape torch.Size([2, 268, 128])
Predictions shape: torch.Size([2, 268, 13])
Predictions: tensor([[[ 0.0315, -0.0180,  0.0608,  ..., -0.0248, -0.0248, -0.1332],
         [ 0.0207, -0.0537,  0.0640,  ...,  0.0060,  0.0315, -0.1407],
         [ 0.0239, -0.0318, -0.0306,  ...,  0.0245,  0.0578, -0.1121],
         ...,
         [ 0.1003, -0.1320,  0.3431,  ...,  0.1419,  0.1537, -0.1955],
         [ 0.1327, -0.1495,  0.3508,  ...,  0.1518,  0.1586, -0.1958],
         [ 0.1113, -0.1298,  0.3598,  ...,  0.1395,  0.1588, -0.1952]],

        [[ 0.0305, -0.0195,  0.0619,  ..., -0.0228, -0.0247, -0.1343],
         [ 0.0182, -0.0541,  0.