In [412]:
!pip install scikit-learn
!pip install gensim
!pip install seqeval

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting seqeval
  Using cached seqeval-1.2.2-py3-none-any.whl
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [39]:

!wget http://vectors.nlpl.eu/repository/20/6.zip

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
--2024-02-13 21:23:59--  http://vectors.nlpl.eu/repository/20/6.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 635351287 (606M) [application/zip]
Saving to: ‘6.zip’


2024-02-13 21:26:18 (4.44 MB/s) - ‘6.zip’ saved [635351287/635351287]



In [40]:
!unzip ./word2vec/6

Archive:  ./word2vec/6.zip
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


In [413]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, Tensor, optim
import torch.nn.functional as F # Functions module - activations, utilities like padding
import pandas as pd
import numpy as np
import math
import gensim

from sklearn.model_selection import train_test_split

from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.scheme import IOB2

In [65]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [57]:
def create_item_list(df):
    # Convert to list of dict's, 
    item_list = [row.to_dict() for idx, row in df.iterrows()]
    
    # Shuffle 
    np.random.shuffle(item_list)
    
    # Ensure utt's and iob tags are matched
    item_list = [item for item in item_list if len(item['utterances'].split(' ')) == len(item['IOB Slot tags'])]
    return item_list
    
    
def get_relations(item_list):
    relation_to_idx = list(sorted(set([j for i in item_list for j in i["Core Relations"]])))
    relation_to_idx = {r:i for i, r in enumerate(relation_to_idx)}
    idx_to_relation = {i:r for i, r in enumerate(relation_to_idx)}
    print(relation_to_idx)
    print(idx_to_relation)
    return relation_to_idx, idx_to_relation
    

# Load train csv
train_csv_df = pd.read_csv("hw1_train.csv")
train_csv_df['IOB Slot tags'] = train_csv_df['IOB Slot tags'].replace(np.nan, "none")
train_csv_df['Core Relations'] = train_csv_df['Core Relations'].replace(np.nan, "none")

# Tokenize train
train_csv_df["IOB Slot tags"] = train_csv_df["IOB Slot tags"].apply(lambda tags: tags.split())
train_csv_df["Core Relations"] = train_csv_df["Core Relations"].apply(lambda tags: tags.split())

item_list = create_item_list(train_csv_df)
relation_to_idx, idx_to_relation = get_relations(item_list)

# Load test csv
test_csv_df = pd.read_csv("hw1_test.csv")

{'actor.gender': 0, 'gr.amount': 1, 'movie.country': 2, 'movie.directed_by': 3, 'movie.estimated_budget': 4, 'movie.genre': 5, 'movie.gross_revenue': 6, 'movie.initial_release_date': 7, 'movie.language': 8, 'movie.locations': 9, 'movie.music': 10, 'movie.produced_by': 11, 'movie.production_companies': 12, 'movie.rating': 13, 'movie.starring.actor': 14, 'movie.starring.character': 15, 'movie.subjects': 16, 'none': 17, 'person.date_of_birth': 18}
{0: 'actor.gender', 1: 'gr.amount', 2: 'movie.country', 3: 'movie.directed_by', 4: 'movie.estimated_budget', 5: 'movie.genre', 6: 'movie.gross_revenue', 7: 'movie.initial_release_date', 8: 'movie.language', 9: 'movie.locations', 10: 'movie.music', 11: 'movie.produced_by', 12: 'movie.production_companies', 13: 'movie.rating', 14: 'movie.starring.actor', 15: 'movie.starring.character', 16: 'movie.subjects', 17: 'none', 18: 'person.date_of_birth'}


In [58]:
# Split train csv into train and val
train_df, val_df = train_test_split(item_list, test_size=0.1, train_size=0.9)

# Convert to lists
train_data = [(i['utterances'], i['IOB Slot tags'], i['Core Relations']) for i in train_df]
val_data = [(i['utterances'], i['IOB Slot tags'], i['Core Relations']) for i in val_df]

train_texts = [p[0] for p in train_data]
train_bios = [p[1] for p in train_data]
train_rels = [p[2] for p in train_data]

In [59]:
# Load word2vec weights
word2vec_weights = gensim.models.KeyedVectors.load_word2vec_format('word2vec/model.txt')

In [366]:
class VocabularyEmbedding(object):
    # For representing vocab embeddings
    def __init__(self, gensim_w2v):

        self.w2v = gensim_w2v
        self.w2v.add_vector('<s>', np.random.uniform(low=-1, high=1.0, size=(300,)))
        self.w2v.add_vector('</s>', np.random.uniform(low=-1, high=1.0, size=(300,)))
        self.w2v.add_vector('<pad>', np.random.uniform(low=-1, high=1.0, size=(300,)))
        self.w2v.add_vector('<unk>', np.random.uniform(low=-1, high=1.0, size=(300,)))

        bos = self.w2v.key_to_index.get('<s>')
        eos = self.w2v.key_to_index.get('</s>')
        pad = self.w2v.key_to_index.get('<pad>')
        unk = self.w2v.key_to_index.get('<unk>')

        self.bos_index = bos
        self.eos_index = eos
        self.pad_index = pad
        self.unk_index = unk

    def tokenizer(self, text):
        return [t for t in text.split(' ')]

    def encode(self, text):

        sequence = []

        tokens = self.tokenizer(text)
        for token in tokens:

            index = self.w2v.key_to_index.get(token, self.unk_index)
            sequence.append(index)

        return sequence

    def create_padded_tensor(self, sequences):
        # sequences:
        #print(sequences)

        lengths = [len(sequence) for sequence in sequences]
        max_seq_len = max(lengths)
        tensor = torch.full((len(sequences), max_seq_len), self.pad_index, dtype=torch.long)

        for i, sequence in enumerate(sequences):
            for j, token in enumerate(sequence):
                tensor[i][j] = token

        return tensor, lengths


class BIOTagSequencer(object):
    # For representing BIO tags
    def __init__(self, tag_corpus, bos_token='<s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>'):
        self.word2idx = {}
        self.idx2word = {}
        self.unk_index = self.add_token(unk_token)
        self.pad_index = self.add_token(pad_token)
        self.bos_index = self.add_token(bos_token)
        self.eos_index = self.add_token(eos_token)
        self.tokenizer = lambda text: [t for t in text]

        for _tags in tag_corpus:
          for _token in self.tokenizer(_tags):
            self.add_token(_token)

    def add_token(self, token):
        if token not in self.word2idx:
          self.word2idx[token] = new_index = len(self.word2idx)
          self.idx2word[new_index] = token
          return new_index

        else:
          return self.word2idx[token]

    def encode(self, text):
        tokens = self.tokenizer(text)

        sequence = []

        for token in tokens:

            index = self.word2idx.get(token, self.unk_index)
            sequence.append(index)

        return sequence

    def create_padded_tensor(self, sequences):

        lengths = [len(sequence) for sequence in sequences]
        max_seq_len = max(lengths)
        tensor = torch.full((len(sequences), max_seq_len), self.pad_index, dtype=torch.long)

        for i, sequence in enumerate(sequences):
            for j, token in enumerate(sequence):
                tensor[i][j] = token

        return tensor, lengths
    
    
class RelationSequencer:
    # For representing relations
    def __init__(self, relations):
        pass

    def encode(self, text):
        return text
    
    def create_padded_tensor(self, sequences):
        tensor = torch.full(size=(len(sequences), len(relation_to_idx)), fill_value=0, dtype=torch.float)
        for i, sequence in enumerate(sequences):
            for token in sequence:
                tensor[i][relation_to_idx[token]] = 1
        return tensor
        


class PositionalEncoding(nn.Module):
    # Copied directly from PyTorch docs
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [367]:
class BIORelDataset(Dataset):
    def __init__(self, data, text_sequencer, bio_sequencer, rel_sequencer):
        self.data = data
        self.input_sequencer = text_sequencer
        self.bio_sequencer = bio_sequencer
        self.rel_sequencer = rel_sequencer

    def __getitem__(self, index):
        text, tags, relations = self.data[index]
        x = self.input_sequencer.encode(text)
        y_bio = self.bio_sequencer.encode(tags)
        y_rel = self.rel_sequencer.encode(relations)
        return x, y_bio, y_rel

    def __len__(self):
        return len(self.data)


class BIORelationTransformer(nn.Module):
    def __init__(self, num_tokens, dim_model, dropout_p, num_heads, num_encoders, num_bio_tags, num_rel_tags):
        super().__init__()
        
        # Info
        self.model_type = "Transformer"
        self.dim_model = dim_model
        
        # Layers
        self.positional_encoder = PositionalEncoding(d_model=dim_model, dropout=dropout_p)
        self.embedding = nn.Embedding(num_tokens, dim_model)
        
        #encoder_layer = nn.TransformerEncoderLayer(dim_model, nhead=num_heads)
        #self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoders)

        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoders,
            num_decoder_layers=num_encoders,  # TODO
            dropout=dropout_p
        )
        
        self.out_rel = nn.Linear(dim_model, num_rel_tags)
        self.out_bio = nn.Linear(dim_model, num_bio_tags)
        
    def forward(self, x, tgt_key):
        # Input shape: (batch size, seq len)
        x = self.embedding(x) * math.sqrt(self.dim_model)
        x = self.positional_encoder(x)
        
        # Reshape to (seq len, batch size, dim model)
        x = x.permute(1, 0, 2)
        
        # Transformer blocks
        #t_out = self.transformer_encoder(x)
        # mask = self.transformer.generate_square_subsequent_mask(len(x))  # TODO: create padding mask
        t_out = self.transformer(x, x)
        
        # print(f'transformer out shape: {t_out.shape}')
        
        out_iob = self.out_bio(t_out)
        out_rel = self.out_rel(torch.max(t_out, dim=0).values)
        
        # print(f'out_iob shape: {out_iob.shape}')
        # print(f'out_rel shape: {out_rel.shape}')
        
        return out_iob, out_rel


In [379]:
text_sequencer = VocabularyEmbedding(word2vec_weights)
bio_sequencer = BIOTagSequencer(train_bios)
rel_sequencer = RelationSequencer(train_rels)

train_dataset = BIORelDataset(train_data, text_sequencer, bio_sequencer, rel_sequencer)
val_dataset = BIORelDataset(val_data, text_sequencer, bio_sequencer, rel_sequencer)



In [380]:
def prepare_batch(batch, in_sequencer, bio_sequencer, rel_sequencer):
    texts, bio_tags, rel_tags = zip(*batch)
    text_tensor, lengths = in_sequencer.create_padded_tensor(texts)
    bio_tensor, _ = bio_sequencer.create_padded_tensor(bio_tags)
    rel_tensor = rel_sequencer.create_padded_tensor(rel_tags)
    return text_tensor, lengths, bio_tensor, rel_tensor

In [381]:
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=lambda batch: prepare_batch(batch, text_sequencer, bio_sequencer, rel_sequencer))
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=lambda batch: prepare_batch(batch, text_sequencer, bio_sequencer, rel_sequencer), shuffle=False)

In [382]:
lr = 1e-3

model = BIORelationTransformer(num_tokens=len(text_sequencer.w2v),
                               dim_model=64, 
                               num_heads=2,
                               num_encoders=4,
                               dropout_p=0.1,
                               num_bio_tags=len(bio_sequencer.word2idx),
                               num_rel_tags=len(relation_to_idx)).to(device)
opt = optim.Adam(model.parameters(), lr=lr)
loss_fn_bio = nn.CrossEntropyLoss(ignore_index=bio_sequencer.pad_index)
loss_fn_rel = nn.CrossEntropyLoss()



In [386]:
def train_loop(model, opt, loss_fn_bio, loss_fn_rel, dataloader):
    # Help from https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1

    model.train()
    total_loss = 0
    
    for i, batch in enumerate(dataloader):
        X = batch[0].to(device)
        y_bio = batch[2].to(device)
        y_rel = batch[3].to(device)
        
        pred_bio, pred_rel = model(X)
        
        # Permute bio to (batch size, seq len, labels)
        pred_bio = pred_bio.permute(1, 2, 0)
        
        # print(pred_bio[0][0])
        # print(f'y_bio shape: {y_bio.shape}')
        # print(f'pred_bio shape: {pred_bio.shape}')
        # print(f'\ny_rel shape: {y_rel.shape}')
        # print(f'pred_rel shape: {pred_rel.shape}')
        
        loss = loss_fn_rel(pred_rel, y_rel) + loss_fn_bio(pred_bio, y_bio)
        opt.zero_grad()
        loss.backward()
        opt.step()
        
        total_loss += loss.detach().item()
        print(f'{i} / {len(dataloader)}: {loss}')
        
    return total_loss / len(dataloader)
        

In [387]:
train_loop(model, opt, loss_fn_bio, loss_fn_rel, train_loader)

# batch size: 32
# model dim:  64
# seq len:    13
# num bio labels: 30
# num rel labels: 19
#print(len(rel_sequencer.word2idx))

0 / 64: 5.8518757820129395
1 / 64: 5.652836322784424
2 / 64: 5.775554180145264
3 / 64: 5.132939338684082
4 / 64: 5.4045233726501465
5 / 64: 4.924869537353516
6 / 64: 4.258816242218018
7 / 64: 4.620996952056885
8 / 64: 4.089197158813477
9 / 64: 4.168509483337402
10 / 64: 4.163624286651611
11 / 64: 4.101083755493164


KeyboardInterrupt: 

In [431]:
def val_loop(model, loss_fn_bio, loss_fn_rel, dataloader):
    model.eval()
    total_loss = 0
    
    bio_targets = []
    predicted_bio_labels = []
    rel_targets = []
    predicted_rel_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            X = batch[0].to(device)
            y_bio = batch[2].to(device)
            y_rel = batch[3].to(device)

            logits_bio, logits_rel = model(X)

            # Permute bio to (batch size, seq len, labels)
            logits_bio = logits_bio.permute(1, 2, 0)
            
            loss = loss_fn_rel(logits_rel, y_rel) + loss_fn_bio(logits_bio, y_bio)
            total_loss += loss.detach().item()
            
            # Softmax and argmax over possible labels
            probs_bio = F.softmax(logits_bio, dim=-2)
            preds_bio = torch.argmax(probs_bio, dim=-2)
            probs_rel = F.softmax(logits_rel, dim=-2)
            preds_rel = torch.argmax(probs_rel, dim=-2)
            
            bio_targets.append(y_bio.tolist())
            rel_targets.append(y_rel.tolist())
            predicted_bio_labels.append(preds_bio.tolist())
            predicted_rel_labels.append(preds_rel.tolist())
            
    # Get bio targets and predictions without the padding
    non_padding_bio_targets = [[_x[i] for i in range(len(_x)) if _x[i] != bio_sequencer.pad_index] for _batch in bio_targets for _x in _batch]
    non_padding_bio_predictions = [[_x[i] for i in range(len(_x)) if _x[i] != bio_sequencer.pad_index] for _batch in predicted_bio_labels for _x in _batch]
    
    # TODO: hacky, get padding working with model
    non_padding_bio_predictions = [[preds[i] for i in range(len(targets))] for preds, targets in zip(non_padding_bio_predictions, non_padding_bio_targets)]
    
    #[print(t, p) for t, p in zip(non_padding_bio_targets, non_padding_bio_predictions)]
    
    # Convert from one hot to text labels
    bio_true_labels = [[str(bio_sequencer.idx2word[_y]).replace('_', '-') for _y in _x] for _x in non_padding_bio_targets]
    bio_predicted_labels = [[str(bio_sequencer.idx2word[_y]).replace('_', '-') for _y in _x][:len(_t)] for _x, _t in zip(non_padding_bio_predictions, bio_true_labels)]
    
    #[print(t, p) for t, p in zip(bio_true_labels, bio_predicted_labels)]
    
    print(classification_report(bio_true_labels, bio_predicted_labels, scheme=IOB2))
    bio_f1 = f1_score(bio_true_labels, bio_predicted_labels, scheme=IOB2)
    #print(bio_f1)
    
    # TODO: get f1 for relations
    
    return total_loss / len(dataloader)


In [432]:
val_loop(model, loss_fn_bio, loss_fn_rel, val_loader)

              precision    recall  f1-score   support

        cast       0.00      0.00      0.00        11
     country       0.00      0.00      0.00        23
    director       0.00      0.00      0.00        21
       genre       0.00      0.00      0.00         3
    language       0.00      0.00      0.00         6
       movie       0.00      0.00      0.00        95
 mpaa-rating       0.00      0.00      0.00        12
      person       0.00      0.00      0.00        22
    producer       0.00      0.00      0.00        18
release-year       0.00      0.00      0.00         1
     subject       0.00      0.00      0.00         8
        unk>       0.00      0.00      0.00         1

   micro avg       0.00      0.00      0.00       221
   macro avg       0.00      0.00      0.00       221
weighted avg       0.00      0.00      0.00       221


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4.272278845310211

In [407]:
print(bio_sequencer.idx2word)

{0: '<unk>', 1: '<pad>', 2: '<s>', 3: '</s>', 4: 'O', 5: 'B_subject', 6: 'B_producer', 7: 'B_person', 8: 'I_person', 9: 'B_movie', 10: 'I_movie', 11: 'I_subject', 12: 'B_genre', 13: 'B_language', 14: 'B_mpaa_rating', 15: 'B_cast', 16: 'I_cast', 17: 'I_producer', 18: 'B_director', 19: 'I_director', 20: 'B_country', 21: 'B_char', 22: 'I_country', 23: 'I_language', 24: 'I_char', 25: 'B_release_year', 26: 'I_genre', 27: 'I_mpaa_rating', 28: 'B_location', 29: 'I-movie'}


In [None]:
def train(model, opt, loss_fn, train_loader, val_loader, epochs):
    train_loss_list, val_loss_list = [], []
    
    for epoch, range(epochs):
        print(f'-----Epoch {epoch}')
        train_loss = train_loop(model, opt, loss_fn, train_loader)
        train_loss_list.append(train_loss)
        
        # TODO: val
        
        