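Replicate the block-similarity text segmentation approach of [Hearst](https://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=D4FEFBD122DADCCB8F551299C6E99D29?doi=10.1.1.35.4491&rep=rep1&type=pdf), using `ruptures` to locate the segment boundaries.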

In [None]:
import numpy as np
import ruptures as rpt  # our package
from os.path import join
import matplotlib.pyplot as plt
import itertools

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Dataset downloaded from: http://web.archive.org/web/20010422042459/http://www.cs.man.ac.uk/~choif/software/C99-1.2-release.tgz
# (link found in the paper by Alemi & Ginsparg: https://arxiv.org/pdf/1503.05543.pdf)
# NOTE: absolute path to a local copy of the data; adapt it to your machine.
data_path = "/Users/OBoulant/Downloads/naacl00Exp/data/1/9-11/"


def read_datafile(path: str, filename: str) -> tuple:
    """Read a segmentation reference file and preprocess its sentences.

    The file is read line by line. A line made of ten "=" characters is a
    segment separator; every other line is treated as one sentence.

    Each sentence is preprocessed as follows: "'s" is removed, punctuation is
    stripped, the text is tokenized, every token is stemmed (Porter stemmer)
    and tokens found in the NLTK stop-word lists are dropped.

    Args:
        path: directory containing the file.
        filename: name of the file inside ``path``.

    Returns:
        A tuple ``(original, bkps, preprocessed)``:

        * ``original``: the raw lines of the file, separators included;
        * ``bkps``: for every separator that closes a segment, the number
          of sentences read so far (separator lines are not counted);
        * ``preprocessed``: one list of tokens per sentence.
    """
    separator = "=========="
    # Built once: the original rebuilt the stemmer and the punctuation table
    # for every line, and re-read the stop-word lists for every single token.
    stemmer = PorterStemmer()
    # NOTE(review): called without a language, exactly as before; confirm
    # whether restricting to one language (e.g. "english") is intended.
    stop_words = set(stopwords.words())
    punctuation_table = str.maketrans("", "", string.punctuation)

    original = []
    preprocessed = []
    bkps = []
    with open(join(path, filename)) as f:
        for line in f:
            original.append(line)
            # Compare without the newline so that a separator on the last
            # line of a file lacking a trailing newline is still recognised.
            if line.rstrip("\n") == separator:
                # The sentence count is the breakpoint index. It does not
                # depend on the file starting with a separator, and a
                # separator seen before any sentence closes nothing.
                if preprocessed:
                    bkps.append(len(preprocessed))
                continue
            line = re.sub("'s", "", line)  # Remove "'s" strings
            line = line.translate(punctuation_table)  # Remove punctuation
            stems = [stemmer.stem(word) for word in word_tokenize(line)]
            # NOTE(review): stop words are filtered after stemming (order
            # kept from the original), so a stop word whose stem differs
            # from its surface form is not removed -- confirm this is wanted.
            preprocessed.append([word for word in stems if word not in stop_words])
    return original, bkps, preprocessed

In [None]:
# Load one reference file: raw lines, true breakpoints and tokenized sentences.
original, bkps, preprocessed = read_datafile(data_path, "0.ref")

In [None]:
# Display the reference breakpoints returned by read_datafile.
bkps

In [None]:
def cosine_similarity_sk(s1: list, s2: list):
    """Cosine similarity between the bag-of-words counts of two token lists.

    Both token lists are joined with spaces and vectorized together with a
    ``CountVectorizer``, so the two count vectors share one vocabulary.

    Args:
        s1: tokens of the first text.
        s2: tokens of the second text.

    Returns:
        The cosine of the angle between the two count vectors, or ``0.0``
        when at least one of them is all zeros (the original code divided
        by zero in that case and returned ``nan`` with a runtime warning).

    Raises:
        ValueError: propagated from ``CountVectorizer`` when neither text
            yields any vocabulary term (behaviour unchanged).
    """
    corpus = [" ".join(s1), " ".join(s2)]
    counts = CountVectorizer().fit_transform(corpus).toarray()
    vec1, vec2 = counts[0, :], counts[1, :]
    # Product of the two Euclidean norms.
    norm_product = (np.sum(vec1 * vec1) * np.sum(vec2 * vec2)) ** 0.5
    if norm_product == 0:
        # One text has no counted token: define the similarity as zero.
        return 0.0
    return np.sum(vec1 * vec2) / norm_product

In [None]:
# Sanity check: the similarity of a sentence with itself.
bla = cosine_similarity_sk(preprocessed[-1], preprocessed[-1])

In [None]:
# Display the result of the sanity check above.
bla

In [None]:
n_sentences = len(preprocessed)  # `preprocessed` holds one token list per sentence
blocksize = 6  # number of sentences in each of the two blocks being compared

In [None]:
# Similarity score for every position that has a full block of `blocksize`
# sentences on each side; the other entries of `res` stay at zero.
res = np.zeros(n_sentences)
for i in range(blocksize - 1, n_sentences - blocksize):
    gap = i + 1  # index of the first sentence of the right-hand block
    left_block = preprocessed[gap - blocksize : gap]
    right_block = preprocessed[gap : gap + blocksize]
    # Flatten each block of sentences into a single list of tokens.
    left_tokens = list(itertools.chain.from_iterable(left_block))
    right_tokens = list(itertools.chain.from_iterable(right_block))
    res[i] = cosine_similarity_sk(left_tokens, right_tokens)

In [None]:
n_bkps = 9  # number of change points requested from the detector
# Fit only on the entries of `res` that hold a score, i.e. indexes
# blocksize - 1 to len(res) - blocksize - 1 inclusive. The previous upper
# bound (-blocksize - 1) was off by one and dropped the last score.
algo = rpt.KernelCPD(kernel="linear", min_size=blocksize, jump=1).fit(
    res[blocksize - 1 : len(res) - blocksize]
)  # written in C
result = algo.predict(n_bkps=n_bkps)

In [None]:
# The detector ran on a slice of `res` starting at index blocksize - 1, so
# shift its breakpoints back to indexes of the full array.
# NOTE: this rebinds `result`; running the cell twice shifts it twice.
result = [bkp + blocksize - 1 for bkp in result]
print(result)  # predicted
print(bkps)  # reference

In [None]:
# Number of reference breakpoints, then number of predicted ones.
print(len(bkps), len(result), sep="\n")

In [None]:
# Plot the similarity scores together with the reference breakpoints (`bkps`)
# and the predicted ones (`result`).
fig, ax_array = rpt.display(res, bkps, result)