# Distances

## Adjacent sentences

* accross a paragraph boundary (last of one paragraph, first of next)
* paragraph-internal
* first two sentences
* last two sentences

In [1]:
%load_ext rpy2.ipython

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from copy import deepcopy

In [35]:
n_articles = 100
input_matrix_file = "data/wiki/wiki_skipthoughts_%d.npy" %(n_articles)
input_data_file = "data/wiki/wiki_reformatted_%d.csv" %(n_articles)
# load data
df = pd.read_csv(input_data_file)
# load embeddings
embeddings = np.load(input_matrix_file)

In [36]:
# all pairwise distances
D = 1 - cosine_similarity(embeddings, embeddings)

In [37]:
# label all pairwise distances
distance_list = []
single_distance_data = {
    "distance": -1,
    "same_paragraph": None,
    "sentence_offset": -1,
    "same_document": None,
    "label": "IGNORE",
    "paragraph_bookends": False,
    "adjacent": None
}
n_sentences = df.shape[0]
# n_sentences = 10
for i in range(n_sentences):
    for j in range(i+1, n_sentences):
        single_distance_data["distance"] = D[i,j]
        same_paragraph = (df.par_id[i] == df.par_id[j])
        single_distance_data["same_paragraph"] = same_paragraph
        single_distance_data["same_document"] = (df.doc_id[i] == df.doc_id[j])
        sentence_offset = (df.s_id[j] - df.s_id[i])
        single_distance_data["sentence_offset"] = sentence_offset
        single_distance_data["i"] = i
        single_distance_data["j"] = j
        j_is_last = (j == (df.shape[0]-1)) or (df.par_id[j] != df.par_id[j+1])
        i_is_first = (df.s_in_par[i] == 0)
        single_distance_data["paragraph_bookends"] = (j_is_last and i_is_first and same_paragraph)
        adjacent = (sentence_offset == 1)
        single_distance_data["adjacent"] = adjacent
        # first sentence of doc: IGNORE
        # second sentence of paragraph: first
        # third to second-to-last sentences of paragraph : internal
        # last sentence of paragraph: last
        # first sentence of paragraph: crossing
        # ...
        # last sentence of document: last
        if (adjacent):
            # j always comes after i
            if (df.s_in_par[j] == 0):
                # if we're not at the start of a document
                # but we are at the start of a paragraph,
                # then this and its previous sentence cross
                # a paragraph boundary
                label = "Across Paragraphs"
            elif (df.s_in_par[j] == 1):
                # compare second sentence of a paragraph to the first
                label = "Within (First)"
            elif (j_is_last):
                label = "Within (Last)"
            else:
                label = "Within (Internal)"
        else:
            label = "IGNORE"
        single_distance_data["label"] = label
        distance_list.append(deepcopy(single_distance_data))
                
dist_df = pd.DataFrame(distance_list)

KeyboardInterrupt: 

In [None]:
adjacent = dist_df[dist_df.label != "IGNORE"]
adjacent

Within vs. Across document distances for any sentences:

## R visualization

In [None]:
%%R
library(tidyverse)
library(Hmisc)
library(ggthemes)

In [None]:
%%R -i dist_df
names(dist_df)

In [None]:
%%R -w 300 -h 200 -u px
dist_df %>%
mutate(same_document = ifelse(same_document,
                              "Same Document",
                              "Different Document")) %>%
ggplot(., aes(x = same_document, y = distance)) + 
stat_summary(fun.y = mean, geom = "bar", fill="gray", colour="black") +
stat_summary(fun.data = mean_cl_boot,
             geom = "errorbar",
             colour="black",
             lwd=1,
             width=0.1) +
ylab("Cosine Distance") +
xlab("Sentence Pair") +
theme_few(12)
ggsave("within_vs_across_documents.png")

In [None]:
%%R
doc_distances = dist_df %>%
group_by(same_document) %>%
summarise(dist = mean(distance)) %>%
as.data.frame

In [None]:
%%R -w 200 -h 200 -u px
adjacent %>%
mutate(crossing = ifelse(label=="Across Paragraphs",
                         "Across Paragraphs",
                         "Within Paragraph")) %>%
ggplot(., aes(x = crossing, y = distance)) +
geom_hline(yintercept = doc_distances$dist[!doc_distances$same_document], colour="gray") +
geom_hline(yintercept = doc_distances$dist[doc_distances$same_document], colour="gray") +
stat_summary(fun.y = mean, geom = "bar", fill="gray", colour="black") +
stat_summary(fun.data = mean_cl_boot,
             geom = "errorbar",
             colour="black",
             width=0.1) +
ylab("Cosine Distance") +
xlab("Sentence Pair") +
theme_few(10)
ggsave("adjacent_sentences_within_and_across_paragraphs.png")

In [None]:
%%R -w 400 -h 200 -u px
dist_df %>%
filter(paragraph_bookends | label!="IGNORE") %>%
mutate(label = ifelse(paragraph_bookends,
                      "Bookends",
                      as.character(label))) %>%
ggplot(., aes(x = label, y = distance)) +
geom_hline(yintercept = doc_distances$dist[!doc_distances$same_document], colour="gray") +
geom_hline(yintercept = doc_distances$dist[doc_distances$same_document], colour="gray") +
stat_summary(fun.y = mean, geom = "bar", fill="gray", colour="black") +
stat_summary(fun.data = mean_cl_boot,
             geom = "errorbar",
             colour="black",
             width=0.1) +
ylab("Cosine Distance") +
xlab("Sentence Pair") +
theme_few(10)
ggsave("adjacent_sentences_more_details.png")