In [None]:
import itertools
import os
import re
import string
from os.path import join

import matplotlib.pyplot as plt
import nltk
import numpy as np
import ruptures as rpt  # our package
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Data downloaded from:
#   http://web.archive.org/web/20010422042459/http://www.cs.man.ac.uk/~choif/software/C99-1.2-release.tgz
# Link found in the paper by Alemi & Ginsparg: https://arxiv.org/pdf/1503.05543.pdf
#
# The dataset location is machine-specific: set the C99_DATA_PATH environment
# variable to point at your own copy. The fallback is the original author's
# local directory, kept so that existing setups keep working unchanged.
data_path = os.environ.get(
    "C99_DATA_PATH", "/Users/OBoulant/Downloads/naacl00Exp/data/1/9-11/"
)


def read_datafile(path: str, filename: str):
    """Read a segmented text file and preprocess each of its sentences.

    Every line of the file is either a separator (ten ``=`` characters) or a
    sentence. Sentences are stripped of ``'s`` and of punctuation, tokenized,
    filtered against the NLTK stop-word lists and finally stemmed.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the file inside ``path``.

    Returns
    -------
    original : list of str
        Every raw line of the file, separator lines included.
    bkps : list of int
        For each separator line except one that is the very first line of the
        file, the number of sentences read before it. When the file ends with
        a separator, the last entry therefore equals the number of sentences.
    preprocessed : list of list of str
        One list of stemmed, stop-word-free tokens per sentence line.
    """
    separator = "=========="
    # Built once: the original re-read the stop-word lists for every token and
    # re-created the stemmer for every line.
    # NOTE(review): `stopwords.words()` without a language presumably returns
    # the lists of every language shipped with NLTK; kept as in the original,
    # confirm whether English only is wanted.
    stop_words = set(stopwords.words())
    stemmer = PorterStemmer()
    punctuation_table = str.maketrans("", "", string.punctuation)

    original = []
    preprocessed = []
    bkps = []
    with open(join(path, filename)) as f:
        for line in f:
            original.append(line)
            # Compare without surrounding whitespace so that a final separator
            # lacking its trailing newline is still recognised.
            if line.strip() == separator:
                # A separator on the very first line only opens the first
                # segment; it is not a breakpoint.
                if len(original) > 1:
                    bkps.append(len(preprocessed))
                continue
            text = re.sub("'s", "", line)  # Remove "'s" strings
            text = text.translate(punctuation_table)  # Remove punctuation
            tokens = word_tokenize(text)
            # Drop stop words BEFORE stemming: stems such as "thi" or "wa"
            # no longer match the stop-word lists.
            kept = [token for token in tokens if token.lower() not in stop_words]
            preprocessed.append([stemmer.stem(token) for token in kept])
    return original, bkps, preprocessed

In [None]:
# Load one reference document: raw lines, true breakpoints and token lists.
original, bkps, preprocessed = read_datafile(data_path, "0.ref")
n_sentences = len(preprocessed)
# Kept as the last expression of the cell so the notebook actually displays it
# (a bare expression in the middle of a cell produces no output).
bkps

In [None]:
# Dimension of the word embeddings; later cells reuse it to size their arrays.
emb_dims = 20
# NOTE(review): `size=` is presumably the gensim < 4.0 keyword for the embedding
# dimension (renamed `vector_size` in 4.0) — confirm the pinned gensim version.
word2vec = Word2Vec(sentences=preprocessed, min_count=2, size=emb_dims, window=10)
# NOTE(review): the constructor presumably already trains the model when
# `sentences` is supplied, which would make this call a second training pass —
# confirm that this is intended.
word2vec.train(
    preprocessed, total_examples=word2vec.corpus_count, epochs=word2vec.epochs
)  # train word vectors

In [None]:
# Mean embedding of the first sentence, ignoring out-of-vocabulary tokens.
vocab_keys = word2vec.wv.vocab.keys()
in_vocab = [token for token in preprocessed[0] if token in vocab_keys]
# Accumulate from a (1, emb_dims) zero row, then divide by the token count.
mean = sum(
    (word2vec.wv[token].reshape(1, -1) for token in in_vocab),
    np.zeros((1, emb_dims)),
) / len(in_vocab)

In [None]:
# Display the mean embedding computed for the first sentence.
mean

In [None]:
def get_embeddings_mean(word2vec_model, sentence):
    """Average the word vectors of the in-vocabulary tokens of a sentence.

    Parameters
    ----------
    word2vec_model
        Trained model whose ``wv`` attribute supports membership tests
        (``token in wv``), item lookup (``wv[token]``) and exposes
        ``vector_size``.
    sentence : iterable of str
        Tokens of the sentence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, vector_size)`` holding the mean vector, or a
        row of NaN when none of the tokens is in the vocabulary.
    """
    # Use the model that was passed in: the original ignored this argument and
    # silently read the notebook-level `word2vec` and `emb_dims` instead.
    vectors = word2vec_model.wv
    dim = vectors.vector_size
    known = [vectors[token] for token in sentence if token in vectors]
    if not known:
        return np.full((1, dim), np.nan)
    # Average in float64, as the original float64 accumulator did.
    return np.mean(known, axis=0, dtype=np.float64).reshape(1, -1)

In [None]:
# One row per sentence, filled with the value returned by get_embeddings_mean.
res = np.zeros((n_sentences, emb_dims))
for row_idx in range(n_sentences):
    res[row_idx] = get_embeddings_mean(word2vec, preprocessed[row_idx])

In [None]:
# Render the sentence-embedding matrix as an image on the current axes.
current_axes = plt.gca()
current_axes.imshow(res)
plt.show()

In [None]:
# Number of change points requested from the detector.
n_bkps = 9
# Linear-kernel change point detection on the embedding rows (written in C).
algo = rpt.KernelCPD(kernel="linear", min_size=1, jump=1)
algo.fit(res)
result = algo.predict(n_bkps=n_bkps)

In [None]:
# Predicted breakpoints on the first line, reference breakpoints on the second.
print(result, bkps, sep="\n")

In [None]:
# Pass the embedding matrix, the reference breakpoints (`bkps`) and the
# predicted ones (`result`) to ruptures' display helper; the two returned
# objects are bound to `fig` and `ax_array`.
# NOTE(review): presumably the helper contrasts true segments with predicted
# breakpoints on one subplot per signal dimension — confirm in the ruptures docs.
fig, ax_array = rpt.display(res, bkps, result)