In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import numpy as np
import time
import random
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
import math
import pandas as pd

In [2]:
# Display the CUDA version string reported by the installed PyTorch build.
torch.version.cuda

'12.4'

In [3]:
# Pick the compute device once; later cells move tensors and models onto it.
_gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if _gpu_present else "cpu")
print('Script is running with GPU' if _gpu_present else 'Script is running WITHOUT GPU')

Script is running with GPU


In [4]:
# Display whether PyTorch can see a CUDA device (same check as the device-selection cell above).
torch.cuda.is_available()


True

In [5]:
def calculate_class_weights(y):
    """Build a per-class weight vector from a flat collection of labels.

    Each distinct label in ``y`` receives ``log(len(y) / count)``, so rarer
    labels get larger weights. A constant weight of 1 is placed in front, so
    the i-th smallest distinct label (0-based) ends up at index ``i + 1``.

    Args:
        y: flat sequence of class labels.

    Returns:
        1-D ``torch.float32`` tensor of length ``1 + number of distinct
        labels``, placed on the notebook-level ``device``.
    """
    _, counts = np.unique(y, return_counts=True)
    n_total = len(y)

    # Leading slot carries a fixed weight of 1 (presumably the padding id 0 --
    # confirm against the caller's tag numbering).
    weights = [1] + [math.log(n_total / n_seen) for n_seen in counts]

    return torch.tensor(weights, dtype=torch.float32).to(device)

In [6]:
def predict(model, data, pad_idx):
    """Run ``model`` over every batch in ``data`` and collect plain-list outputs.

    The model is switched to evaluation mode for the duration of the call so
    that dropout (and any other train-only layers) cannot make the predictions
    stochastic; its previous train/eval state is restored afterwards.

    Args:
        model: ``nn.Module`` mapping a ``(batch, seq)`` tensor of token ids to
            ``(batch, seq, n_tags)`` scores.
        data: iterable of ``(text, output)`` batches, where ``output`` has
            shape ``(batch, 2, seq)`` holding the gold tags at index 0 and the
            mask at index 1 of the second axis.
        pad_idx: accepted for interface compatibility; not used here.

    Returns:
        ``(predicts_out, tags_out, text_out)``: three lists with one entry per
        sentence, each entry a list of ints (predicted tag ids, gold tag ids
        and token ids respectively).
    """
    predicts_out, tags_out, text_out = [], [], []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in data:
                text = batch[0]
                # (batch, 2, seq) -> (2, batch, seq); slot 0 holds the gold tags.
                tags = batch[1].transpose(0, 1)[0]
                # Scores are (batch, seq, n_tags); argmax over the tag axis.
                predictions = model(text).transpose(1, 2).argmax(dim=1)

                predicts_out.extend(predictions.tolist())
                tags_out.extend(tags.tolist())
                text_out.extend(text.tolist())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return predicts_out, tags_out, text_out

In [11]:
# Load the sentence and tag corpora from .npy files in the working directory;
# later cells index both lists with the same sentence/word positions.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# files that come from a trusted source.
base_sents = list(np.load('OHG_sents_test.npy', allow_pickle=True))
base_tags = list(np.load('OHG_tags_minimal_test.npy', allow_pickle=True))
# NOTE(review): `problems` is not referenced by any other cell visible here.
problems = []

FileNotFoundError: [Errno 2] No such file or directory: 'OHG_sents_test.npy'

In [None]:
# Gather every (word, tag) pair whose tag is 'NUM', walking the corpus by
# sentence position and then by word position.
nums = [
    (base_sents[s_pos][w_pos], base_tags[s_pos][w_pos])
    for s_pos in range(len(base_sents))
    for w_pos, label in enumerate(base_tags[s_pos])
    if label == 'NUM'
]


In [None]:
# NOTE(review): Counter is already imported in the first cell; this repeat import is redundant.
from collections import Counter
# Frequency of each distinct (word, tag) pair collected in `nums`.
num_counts = Counter(nums)

In [None]:
# Display the number of sentences in the loaded corpus.
len(base_sents)

In [None]:
# Number of sentences drawn (without replacement) for each run of the sweep.
# NOTE(review): 1683 is presumably the full corpus size -- confirm against
# len(base_sents); random.sample raises ValueError for any larger value.
sample_sizes = [250, 500, 750, 1000, 1500, 1683]


In [None]:
# Accumulator for the sweep: one row per (tag, accuracy in percent, sample size).
results_df = pd.DataFrame(columns=["tag", "accuracy", "sample_size"])

In [None]:
iteration = 0
while iteration < 25:
  print(iteration)
  for size in sample_sizes:
    random_indices = random.sample(range(len(base_sents)), size)
    sents = []
    tags = []
    for index in random_indices:
      sents.append(base_sents[index])
      tags.append(base_tags[index])
    for idx, sent in enumerate(sents):
      sent.insert(0, 'start')
      sent.append('stop')
      sents[idx] = sent
    for idx, sent in enumerate(tags):
      sent.insert(0, 'start')
      sent.append('stop')
      tags[idx] = sent
    rawwords = []
    for sent in sents:
      for word in sent:
        rawwords.append(word)

    rawtags = []
    for sequence in tags:
      for tag in sequence:
        rawtags.append(tag)

    allwords = list(set(rawwords))
    alltags = list(set(rawtags))

    word_tokenizer = {word: idx+1 for idx, word in enumerate(allwords)}
    word_decoder = {idx+1: word for idx, word in enumerate(allwords)}
    tag_tokenizer = {tag: idx+1 for idx, tag in enumerate(alltags)}
    tag_decoder = {idx+1: tag for idx, tag in enumerate(alltags)}

    def tokenize(sentences, tokenizer):
      indexed_sentences = []
      for sentence in sentences:
        indexed_sentence = [tokenizer[word] for word in sentence]
        indexed_sentences.append(indexed_sentence)
      return indexed_sentences

    encsents = tokenize(sents, word_tokenizer)
    enctags = tokenize(tags, tag_tokenizer)

    padsents, padtags = [], []

    maxlen = max(len(sublist) for sublist in encsents)
    for sublist in encsents:
      while len(sublist) < maxlen:
        sublist = sublist + [0]
        if len(sublist) == maxlen:
          break
        sublist = [0] + sublist
      padsents.append(sublist)
    allenctags = []
    maxlen = max(len(sublist) for sublist in enctags)
    for sublist in enctags:
      for i in sublist:
        allenctags.append(i)
      while len(sublist) < maxlen:
        sublist = sublist + [0]
        if len(sublist) == maxlen:
          break
        sublist = [0] + sublist
      padtags.append(sublist)

    tag_mask = []
    for seq in padtags:
      mask = [1]*len(seq)
      for idx, tag in enumerate(seq):
        if 'XX' in list(tag_tokenizer.keys()):
          if tag == tag_tokenizer['XX']:
            mask[idx] = 0
        if tag == tag_tokenizer['start']:
          mask[idx] = 0
        if tag == tag_tokenizer['stop']:
          mask[idx] = 0
        if tag == tag_tokenizer['PUNCT']:
          mask[idx] = 0
      tag_mask.append(mask)
    for i in range(len(tag_mask)):
      if len(tag_mask[i]) != len(padsents[i]):
        print(i)
    pad_tags_mask = []
    for i in range(len(padtags)):
      pad_tags_mask.append([padtags[i], tag_mask[i]])
    X_test, X_train, y_test, y_train = train_test_split(padsents, pad_tags_mask, test_size=0.8)
    X_train_tensor = torch.tensor(X_train, dtype=torch.long).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.long).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    class_weights = calculate_class_weights(allenctags)

    class Model(nn.Module):
      def __init__(self):
        super().__init__()

      def categorical_accuracy(self, preds, y, mask, tag_pad_idx = 0):
        max_preds = preds.argmax(dim = 1, keepdim = False)
        max_preds = max_preds*mask
        y = y*mask
        max_preds = torch.flatten(max_preds)
        y = torch.flatten(y)
        non_pad_elements = y.nonzero()
        correct = max_preds[non_pad_elements].eq(y[non_pad_elements])
        return correct.sum() / y[non_pad_elements].shape[0]

      def early_stop(self, validation_loss, patience = 3, min_delta = 0):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.patiencecount = 0
        elif validation_loss > (self.min_validation_loss + min_delta):
            self.patiencecount += 1
            #print(f"Early stopping counter: {self.patiencecount} out of {patience}")
            if self.patiencecount >= patience:
                return True
        return False

      def fit(self, train_dl, val_dl, epochs, pad_idx = 0):
        optimizer = torch.optim.Adam(self.parameters(), lr = 0.001)

        self.patiencecount = 0
        self.min_validation_loss = float('inf')

        self.trainlosses = []
        self.vallosses = []

        self.train_accs = []
        self.val_accs = []

        counter = 0
        for epoch in range(epochs):
          for batch in train_dl:
            counter += 1
            text = batch[0]

            output = batch[1]
            output = output.transpose(0,1)
            tags = output[0]
            mask = output[1]
            optimizer.zero_grad()
            predictions = self.forward(text)
            predictions = predictions.transpose(1,2)
            loss = self.loss_fn(predictions, tags)
            loss = (loss * mask).sum() / mask.sum()
            loss.backward()
            optimizer.step()
          train_loss, train_acc = self.evaluate(train_dl)
          val_loss, val_acc = self.evaluate(val_dl)
          self.trainlosses.append(train_loss)
          self.vallosses.append(val_loss)
          self.train_accs.append(train_acc)
          self.val_accs.append(val_acc)
          if self.early_stop(val_loss):
            break
          train_acc = train_acc*100
          val_acc = val_acc*100
          #print(f"Epoch [{epoch + 1}/{epochs}] - TrainLoss: {train_loss:.4f}, ValLoss: {val_loss:.4f}, TrainAcc: {train_acc:.2f},% ValAcc: {val_acc:.2f}%")

      def evaluate(self, val_dl, pad_idx = 0):
        losses = []
        accuracies = []
        with torch.no_grad():
          for batch in val_dl:
              text = batch[0]
              output = batch[1]
              output = output.transpose(0,1)
              tags = output[0]
              mask = output[1]
              predictions = self.forward(text)
              predictions = predictions.transpose(1,2)
              loss = self.loss_fn(predictions, tags)
              loss = (loss * mask).sum() / mask.sum()
              losses.append(loss)
              acc = self.categorical_accuracy(predictions, tags, mask, pad_idx)
              accuracies.append(acc)
        return torch.Tensor(losses).mean(), torch.Tensor(accuracies).mean()


    class OHGTagger(Model):
        def __init__(self, vocab, embeds, hidden, tagset, n_layers, dropout, criterion, pad_idx = 0):
            super().__init__()

            self.embedding = nn.Embedding(vocab, embeds, padding_idx = pad_idx, scale_grad_by_freq = True)

            self.lstm = nn.LSTM(embeds, hidden, num_layers = n_layers, bidirectional = True,
                                dropout = dropout if n_layers > 1 else 0)

            self.lin = nn.Linear(hidden * 2, tagset)

            self.dropout = nn.Dropout(dropout)

            self.loss_fn = criterion

        def forward(self, sent):
            x = self.embedding(sent)
            x, (hidden, cell) = self.lstm(x)
            x = self.lin(self.dropout(x))
            return x

    vocab = len(allwords)+1
    n_emb = 300
    n_hidden = 60
    n_tags = len(alltags)+1
    n_layers = 3
    dropout = .2
    pad_idx = 0
    criterion = nn.CrossEntropyLoss(reduction = 'none', weight = class_weights, ignore_index = pad_idx).to(device)

    network_OHGTagger = OHGTagger(vocab, n_emb, n_hidden, n_tags, n_layers, dropout, criterion, pad_idx).to(device)

    %time network_OHGTagger.fit(train_loader, test_loader, 100)

    preds, tags, texts = predict(network_OHGTagger, test_loader, pad_idx)

    for i in range(len(preds)):
      for j in range(len(preds[i])):
        if tags[i][j] == 0:
          preds[i][j] = 0
      preds[i] = [ele for ele in preds[i] if ele != 0]
      tags[i] = [ele for ele in tags[i] if ele != 0]
      texts[i] = [ele for ele in texts[i] if ele != 0]
    decpreds, dectags, dectext, declangs = [], [], [], []
    decpreds.append(tokenize(preds, tag_decoder))
    dectags.append(tokenize(tags, tag_decoder))
    dectext.append(tokenize(texts, word_decoder))
    decpreds = decpreds[0]
    dectags = dectags[0]
    dectext = dectext[0]

    in_set_vocab = []
    for batch in train_loader:
      texts = batch[0]
      for sent in texts:
        sent = sent.tolist()
        sent = [ele for ele in sent if ele != 0]
        x = []
        for i in sent:
          x.append([i])
        x = tokenize(x, word_decoder)
        for word in x:
          in_set_vocab.append(word)
    in_set_vocab = set([x for y in in_set_vocab for x in y])

    corrects, incorrects, totals = [], [], []
    uk_corrects, uk_incorrects, uk_totals = [], [], []
    for i in range(len(decpreds)):
      for j in range(len(decpreds[i])):
        totals.append(dectags[i][j])
        if decpreds[i][j] == dectags[i][j]:
          corrects.append(decpreds[i][j])
        else:
          incorrects.append(decpreds[i][j])
        if dectext[i][j] not in in_set_vocab:
          uk_totals.append(dectags[i][j])
          if decpreds[i][j] == dectags[i][j]:
            uk_corrects.append(decpreds[i][j])
          else:
            uk_incorrects.append(decpreds[i][j])

    c_freqs = Counter(corrects)
    ic_freqs = Counter(incorrects)
    t_freqs = Counter(totals)
    uk_c_freqs = Counter(uk_corrects)
    uk_ic_freqs = Counter(uk_incorrects)
    uk_t_freqs = Counter(uk_totals)

    data = []
    labels = []
    outs = []
    any_c = [x[0] for x in c_freqs.most_common()]
    for i in t_freqs.most_common():
      if i[0] not in any_c:
        data.append(0)
        labels.append(i[0])
        outs.append((i[0], 0))
        results_df.loc[len(results_df)] = [i[0], 0, size]
      else:
        for ele in c_freqs.most_common():
          if ele[0] == i[0]:
            num_c = ele[1]
        class_acc = num_c/i[1]
        data.append(class_acc*100)
        labels.append(i[0])
        outs.append((i[0], class_acc*100))
        results_df.loc[len(results_df)] = [i[0], class_acc*100, size]

  results_df.to_csv('resultsOHG_test.csv', index=False)
  with torch.no_grad():
      torch.cuda.empty_cache()
  iteration = iteration + 1

0
CPU times: total: 2.08 s
Wall time: 2.92 s
CPU times: total: 953 ms
Wall time: 1 s
CPU times: total: 1.09 s
Wall time: 1.17 s
CPU times: total: 1.11 s
Wall time: 1.2 s
CPU times: total: 1.38 s
Wall time: 1.52 s
CPU times: total: 1.64 s
Wall time: 1.73 s
1
CPU times: total: 734 ms
Wall time: 754 ms
CPU times: total: 969 ms
Wall time: 1.1 s
CPU times: total: 1.09 s
Wall time: 1.21 s
CPU times: total: 1.28 s
Wall time: 1.33 s
CPU times: total: 1.25 s
Wall time: 1.3 s
CPU times: total: 1.59 s
Wall time: 1.63 s
2
CPU times: total: 656 ms
Wall time: 706 ms
CPU times: total: 953 ms
Wall time: 970 ms
CPU times: total: 1.17 s
Wall time: 1.3 s
CPU times: total: 1.23 s
Wall time: 1.27 s
CPU times: total: 1.66 s
Wall time: 1.76 s
CPU times: total: 1.58 s
Wall time: 1.69 s
3
CPU times: total: 734 ms
Wall time: 758 ms
CPU times: total: 922 ms
Wall time: 914 ms
CPU times: total: 1.08 s
Wall time: 1.15 s
CPU times: total: 1.23 s
Wall time: 1.3 s
CPU times: total: 1.44 s
Wall time: 1.6 s
CPU times: t

In [None]:
# Write the accumulated results to CSV.
# NOTE(review): the training loop writes 'resultsOHG_test.csv'; the name here
# ('resultsOHG_tes.csv') differs by one letter -- confirm whether a separate
# file is intended or this is a typo.
results_df.to_csv('resultsOHG_tes.csv', index=False)