# Simple Segmentation ReFree metric example

In [1]:
#Imports
from sentence_transformers import SentenceTransformer
import segeval
import numpy as np

#Define embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [2]:
#Define necessary functions

#L2 distance
def l2dist(point1,listofpoints2):
    if len(listofpoints2.shape) > 1:
    
        dists = np.zeros((len(listofpoints2),))
        for idx,point2 in enumerate(listofpoints2):
            dists[idx] = np.sqrt(np.sum(np.square(point2-point1)))
    else:
        dists = np.sqrt(np.sum(np.square(listofpoints2-point1)))
    return dists

#Segmentation metric code
def segmentation_re_free(X, labels, dist = l2dist):

    dist_method = dist
    n_labels = len(np.unique(labels))
    intra_dists = np.zeros((n_labels,)) 
    centroids = np.zeros((n_labels, len(X[0])), dtype=float)
    length1segments = 0
    for k in range(n_labels):
        cluster_k = X[np.where(labels == k),:][0]
        centroid = cluster_k.mean(axis=0)
        centroids[k] = centroid
        if len(cluster_k) > 1:
            correction_factor = 1 -1/np.sqrt(len(cluster_k))
        else:
            correction_factor =  0.25 #ends up not mattering, but prevents nan
            print('Warning, segment of length 1 detected')
            length1segments += 1
                
        intra_dists[k] = np.average(dist_method(centroid, cluster_k))/correction_factor

    if length1segments > 0:
        print('There are ' + str(length1segments) + ' segments of length 1 found in this segmentation. This may create an artificially low score.')
    centroid_distances = np.zeros((n_labels,3))
    centroid_distances[0,2] = dist_method(centroids[0],centroids[1])
    centroid_distances[-1,0] = dist_method(centroids[-1],centroids[-2])

    for k in np.arange(1,n_labels-1):
        centroid_distances[k,:] = dist_method(centroids[k],centroids[k-1:k+2])

    centroid_distances = np.delete(centroid_distances,1,1)

    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    centroid_distances[centroid_distances == 0] = np.inf


    combined_intra_dists = np.zeros((n_labels,2)) 
    combined_intra_dists[0,1] = intra_dists[0]+intra_dists[1]
    combined_intra_dists[-1,0] = intra_dists[-1] + intra_dists[-2]
    for k in np.arange(1,n_labels-1):
        
        combined_intra_dists[k,0] = intra_dists[k] + intra_dists[k-1]
        combined_intra_dists[k,1] = intra_dists[k] + intra_dists[k+1]


    scores = np.max(combined_intra_dists / centroid_distances, axis=1)
    return np.mean(scores)

#Helper function for segeval code
def formatSegments(segs):
    x_arrstr = np.char.mod('%i', segs)
    formattedSegs = "".join(x_arrstr)
    return formattedSegs

In [3]:
#Create some made up text about a segmentation example
sampleText = '''
This block of text is segmented into sets of sentences that relate to one another. 
The relationship of adjacent sentences in this segmentation is based on semantic meaning. 

The way that we have chosen to mark these segmentations is both through highlighting and font changes. 
An alternate (poor) segmentation is marked with hash marks. The reason we have chosen both font and 
color changes is to make this more easily readable for those that are colorblind. The highlighting 
colors have also been chosen with different luminosity in order to aid differentiation by those that 
are colorblind.

This work focuses on segmentation at the sentence boundary, although some segmentation work 
happens at the sub-sentence level. Segmentation at the level done in this work is often used as a 
way to split apart large texts into relevant parts as a pre-processing step before other operations 
such as summarization. This can be necessary because some algorithms are not able to handle rather 
long inputs, or it can improve quality of results by grouping similar content.'''

In [4]:
#Create sentence embeddings
sentences = sampleText.split('.')
embeddings = embedding_model.encode(sentences[:-1])#ignore last part of split list

#Define made up boundary sets, with 0 being topically good and 1 being shifted.
boundarySet0 = np.asarray([0,0,1,0,0,0,1,0,0])
boundarySet1 = np.asarray([0,0,0,1,0,0,0,1,0])

#Create labels for clustering
segmentationLabels0 = np.cumsum(boundarySet0)
segmentationLabels1 = np.cumsum(boundarySet1)

#Convert boundary format from boundary set to mass format
reference_topic_segments = segeval.convert_nltk_to_masses(formatSegments(boundarySet0))
candidate_topic_segments = segeval.convert_nltk_to_masses(formatSegments(boundarySet1))

#Generate classic metrics
seg_pk = segeval.pk(reference_topic_segments, candidate_topic_segments)
seg_wd = segeval.window_diff(reference_topic_segments, candidate_topic_segments)
seg_s = segeval.segmentation_similarity(reference_topic_segments, candidate_topic_segments)
seg_b = segeval.boundary_similarity(reference_topic_segments, candidate_topic_segments)

In [5]:
print('The Segmentation ReFree score for the first segmenation of the text is ' + str(segmentation_re_free(embeddings, segmentationLabels0)))
print('The Segmentation ReFree score for the second segmenation of the text is ' + str(segmentation_re_free(embeddings, segmentationLabels1)))
print('For reference, the P_k score is ' + str(seg_pk) + ', the WindowDiff score is ' + str(seg_wd) + ', the Segmentation Similarity is ' + str(seg_s) + ', and the Boundary Similarity is '+ str(seg_b))

The Segmentation ReFree score for the first segmenation of the text is 3.0498791797046145
The Segmentation ReFree score for the second segmenation of the text is 4.561183315917675
For reference, the P_k score is 0.5, the WindowDiff score is 0.5, the Segmentation Similarity is 0.8888888888888888888888888889, and the Boundary Similarity is 0.5
