In [1]:
from segmentation_dataset import SegmentationDataset
from model import Model
import io
import sys
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.autograd import Variable
from tqdm import tqdm

In [2]:
def pk(ref: np.array, hyp: np.array, k: int = None, boundary: int = 1):
    """
    Compute the Pk metric for a pair of segmentations A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'
    """

    if k is None:
        k = int(round(ref.shape[0] / (np.count_nonzero(ref == boundary) * 2.0)))

    err = 0.0
    for i in range(len(ref) - k + 1):
        r = np.count_nonzero(ref[i : i + k] == boundary) > 0
        h = np.count_nonzero(hyp[i : i + k] == boundary) > 0
        if r != h:
            err += 1
    return err / (ref.shape[0] - k + 1.0)

In [3]:
def windowdiff(ref: np.array, hyp: np.array, k: int = None, boundary: int = 1, weighted: bool = False):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

        >>> s1 = "000100000010"
        >>> s2 = "000010000100"
        >>> s3 = "100000010000"
        >>> '%.2f' % windowdiff(s1, s1, 3)
        '0.00'
        >>> '%.2f' % windowdiff(s1, s2, 3)
        '0.30'
        >>> '%.2f' % windowdiff(s2, s3, 3)
        '0.80'
    """
    if k is None:
        k = int(round(ref.shape[0] / (np.count_nonzero(ref == boundary) * 2.0)))b

    if ref.shape[0] != hyp.shape[0]:
        raise ValueError("Segmentations have unequal length")
    if k > ref.shape[0]:
        raise ValueError(
            "Window width k should be smaller or equal than segmentation lengths"
        )
    wd = 0.0
    for i in range(ref.shape[0] - k + 1):
        ndiff = abs(np.count_nonzero(ref[i : i + k] == boundary) - np.count_nonzero(hyp[i : i + k] == boundary))
        if weighted:
            wd += ndiff
        else:
            wd += min(1, ndiff)
    return wd / (ref.shape[0] - k + 1.0)

In [87]:
s1 = np.array([int(ch) for ch in "000000000100"])
s2 = np.array([int(ch) for ch in "000000100100"])

In [88]:
pk(s1, s2)

0.42857142857142855

In [89]:
windowdiff(s1, s2)

0.8571428571428571

In [4]:
def validate(model, dataset):
    model.eval()
    total_pk = 0.0
    total_windowdiff = 0.0
    with tqdm(desc='Validating', total=len(dataset)) as pbar:
        for data in dataset:
            pbar.update()
            target = data['target']
            target = target.long()
            output = model(data['sentences'])
            output_softmax = F.softmax(output, 1)
            output_argmax = torch.argmax(output_softmax, dim=1)
            total_pk += pk(target.detach().numpy(), output_argmax.detach().numpy())
            total_windowdiff += windowdiff(target.detach().numpy(), output_softmax.detach().numpy())
    return total_pk / len(dataset), total_windowdiff / len(dataset)

In [5]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(tokens[1:]).astype(np.float)
    return data

In [6]:
word2vecModel = load_vectors('wiki-news-300d-1M-subword.vec')

In [7]:
val_path = 'val_data'
val_dataset = SegmentationDataset(val_path, word2vecModel)

In [8]:
model_save_path = 'saved_model'
model = torch.load(model_save_path)

In [9]:
total_pk, total_windowdiff = validate(model, val_dataset)

Validating: 100%|██████████| 50/50 [06:02<00:00,  7.25s/it]


In [61]:
total_pk

0.11489407921555729

In [62]:
total_windowdiff

0.11489407921555729

In [90]:
class Baseline():
    def __init__(self, data, threshold):
        self.dataset = data
        self.data_len = len(data)
        self.threshold = threshold
        self.all_labels = []
        self.getBaselineLabels()
        
    def getBaselineLabels(self):
        for data in self.dataset:
            sentence_labels = [0 for i in range(data['target'].shape[0])]
            sentences = data['sentences']
            for i in range(1, sentences.shape[0]):
                firstSentenceTensor = sentences[i - 1, :, :]
                secondSentenceTensor = sentences[i, :, :]
                if torch.norm(secondSentenceTensor - firstSentenceTensor).item() > self.threshold:
                    sentence_labels[i - 1] = 1
            self.all_labels.append(sentence_labels)
    
    def evaluate(self):
        total_pk = 0.0
        total_windowdiff = 0.0
        with tqdm(desc='Validating Baseline', total=self.data_len) as pbar:
            for i in range(self.data_len):
                data = self.dataset[i]
                pbar.update()
                target = data['target']
                target = target.long()
                pred = self.all_labels[i]
                total_pk += pk(target.detach().numpy(), np.array(pred))
                total_windowdiff += windowdiff(target.detach().numpy(), np.array(pred))
        return total_pk / self.data_len, total_windowdiff / self.data_len

In [91]:
b = Baseline(val_dataset, 5)

In [92]:
base_pk, base_windowdiff = b.evaluate()

Validating Baseline: 100%|██████████| 50/50 [00:11<00:00,  4.18it/s]


In [93]:
base_pk

0.04796617029867993

In [94]:
base_windowdiff

0.11583660303029159