# Modeling The Line Break Problem

{Adapted from Kleinberg & Tardos, p 318}(10) When a product like Microsoft Word formats documents, it tries to keep the right margins of the text relatively even, in spite of different lengths in terms of the number of characters, counting blanks, in each line.

We can model this line break problem as a shortest path problem in the following way. Consider an input of text with $n$ total words and any type of spacing. The input must also contain a value $L$ denoting the target length of each line. We can model this as a shortest path problem by considering $n + 1$ nodes: one for each word, plus a start node representing the zeroth (empty) word. An arc from node $i$ to node $j$ (with $i < j$) represents a single line containing words $i+1$ through $j$, and its cost measures how far the length of that line is from $L$. Each node visited on the shortest path from the start node to node $n$ therefore represents a word at the end of a line (i.e., where to break the line).

Consider an input including a long string contained only on one line, and the desired length per line, $L$.
<br><br>
We can now use the code developed to solve the shortest path problem to split our text input into lines whose lengths are as close as possible to $L$. Let's begin with a desired line length of 20 characters.

In [None]:
import math
import numpy as np
import pandas as pd

from tkinter import *
from tkinter import font
from time import sleep

In [None]:
# inputs
text = "Call me Ishmael. Some years ago, never mind how long precisely, having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world."
L = 20  # desired line length, in characters

# construction
# Node 0 is an empty placeholder word acting as the start node, so for a text
# of n words len(words) == n + 1.
# split() with no separator treats any run of spaces, tabs or newlines as one
# break, so irregular spacing never produces empty "words".
words = [''] + text.split()
c = [len(word) for word in words]  # number of characters in each word
A = np.zeros((len(words), len(words)))  # (n+1) x (n+1) arc-cost matrix, filled in by the objective function
isArc = [[False] * len(words) for _ in words]  # isArc[i][j] is set to True once arc i -> j exists
iterations_list = []  # dijkstras() appends one table snapshot per iteration
string_progress_list = []  # dijkstras() appends the formatted text per iteration

# objective function
def obj_func():
    """Fill the global arc structures using the squared-deviation objective.

    For every pair ``i < j`` the arc ``i -> j`` stands for one line holding
    words ``i+1 .. j``. The arc is flagged in ``isArc[i][j]`` and its cost
    ``(L - line_length) ** 2`` is written to ``A[i, j]``. The line length
    counts every character of those words plus one blank between neighbours.

    Reads the globals ``c`` and ``L``; mutates the globals ``A`` and ``isArc``
    in place. Entries with ``i >= j`` are left untouched.
    """
    n = len(A)
    for i in range(n):
        # Grow the line one word at a time instead of re-summing it for every
        # (i, j) pair. Starting at -1 cancels the blank that would otherwise
        # be counted in front of the first word of the line.
        line_length = -1
        for j in range(i + 1, n):
            line_length += c[j] + 1
            isArc[i][j] = True
            # third type of objective function from HW 2; the goal is to
            # minimize the sum of these squared deviations
            A[i, j] = (L - line_length) ** 2

# set up for Dijkstra's algorithm table display
def draw_iteration(S, d, p, words):
    """Build a table describing one state of Dijkstra's algorithm.

    Parameters:
        S: nodes settled so far; each one is marked with ``*`` in the table.
        d: tentative distance of every node; infinity is shown as ``--``.
        p: predecessor of every node; NaN (no predecessor) is shown as ``--``.
        words: the word attached to every node.

    Returns:
        A pandas DataFrame with the string columns "Nodes", "Word",
        "Distance" and "prev", one row per node.
    """
    node_labels = [str(i) for i in range(len(words))]
    for node in S:
        node_labels[node] += "*"

    distances = ["--" if dist == float('inf') else str(dist) for dist in d]

    # NaN compares unequal to everything, itself included, so a `!=` test can
    # never detect it; math.isnan is needed to recognise "no predecessor".
    prevs = ["--" if math.isnan(prev) else str(prev) for prev in p]

    return pd.DataFrame({"Nodes": pd.Series(node_labels),
                         "Word": pd.Series(words),
                         "Distance": pd.Series(distances),
                         "prev": pd.Series(prevs)})
        
                
# dijkstras algorithm
def dijkstras(A, s=0):
    """Run Dijkstra's algorithm on cost matrix ``A`` starting at node ``s``.

    Arc existence is read from the global ``isArc``. After every settled node
    a table snapshot from ``draw_iteration`` is appended to the global
    ``iterations_list`` and the text layout from ``formatOutput`` to the
    global ``string_progress_list``.

    Returns the predecessor list ``p``; nodes without a predecessor hold NaN.
    """
    node_count = len(A)
    d = [float('inf')] * node_count  # best known path cost to each node
    p = [float('nan')] * node_count  # previous node on that path
    S = []   # settled nodes
    F = [s]  # frontier nodes
    d[s] = 0

    while F:
        # Sort by decreasing distance so the closest frontier node sits last.
        F.sort(key=lambda node: d[node], reverse=True)
        f = F.pop()
        S.append(f)  # settle f
        for w in range(node_count):
            if not isArc[f][w]:
                continue  # no arc f -> w
            through_f = d[f] + A[f][w]
            if w in S or w in F:
                # w was reached before (settled or on the frontier): keep the
                # route through f only if it is strictly shorter.
                if through_f < d[w]:
                    d[w] = through_f
                    p[w] = f
            else:
                # first time w is reached: record the route and queue it
                d[w] = through_f
                p[w] = f
                F.append(w)

        iterations_list.append(draw_iteration(S, d, p, words))  # table for this iteration
        string_progress_list.append(formatOutput(p))  # text layout for this iteration

    return p

# return string output
def formatOutput(p):
    """Return the text of the global ``words`` broken into lines.

    Parameters:
        p: predecessor list as produced by ``dijkstras``; ``p[j]`` is the node
           preceding ``j`` on the path and NaN marks a node without one.

    Returns:
        The lines joined with "\\n" (no trailing newline). If the last node
        has no predecessor, so that no line breaks are known, an empty string
        is returned instead of raising IndexError.
    """
    # Follow the predecessors back from the last word. Appending and reversing
    # once avoids the repeated front insertion into the list.
    breaks = []
    j = len(p) - 1
    while not math.isnan(p[j]):
        j = p[j]
        breaks.append(j)
    if not breaks:
        return ""
    breaks.reverse()

    # Each consecutive pair (a, b) of break nodes is one line: words a+1 .. b.
    lines = [' '.join(words[a + 1:b + 1]) for a, b in zip(breaks, breaks[1:])]
    # The final line runs from the last break to the end of the text.
    lines.append(' '.join(words[breaks[-1] + 1:]))
    return "\n".join(lines)

# print output
def printOutput(p):
    """Print a 20-character ruler followed by the text broken into lines.

    Parameters:
        p: predecessor list as produced by ``dijkstras``; ``p[j]`` is the node
           preceding ``j`` on the path and NaN marks a node without one.

    The words come from the global ``words``. If the last node has no
    predecessor, only the ruler is printed instead of raising IndexError.
    """
    print("12345678901234567890")

    # Use the predecessors to recover the path; every node on it is the last
    # word of a line. Append and reverse once rather than inserting at the
    # front of the list on every step.
    breaks = []
    j = len(p) - 1
    while not math.isnan(p[j]):
        j = p[j]
        breaks.append(j)
    if not breaks:
        return
    breaks.reverse()

    # print the words one line at a time, stopping at the last word of each line
    for a, b in zip(breaks, breaks[1:]):
        print(' '.join(words[a + 1:b + 1]))
    print(' '.join(words[breaks[-1] + 1:]))

In [None]:
# Fill the arc costs in A for the current text and L, then print the line
# breaks given by the predecessor list that dijkstras returns from node 0.
obj_func()
printOutput(dijkstras(A,0))

You can run the cell below to see the steps taken to process the string.

In [None]:
# GUI: window showing the recorded Dijkstra iterations
dijkstra_visualization = Tk()
dijkstra_visualization.geometry("1600x900")
dijkstra_visualization.title("Visualizing the Dijkstra Iterations")

# slider: selects which recorded iteration is displayed
check = 0  # slider value that is currently rendered
slider = Scale(
    dijkstra_visualization,
    from_=0,
    to=len(iterations_list) - 1,
    orient=HORIZONTAL,
    label="Iteration To Display",
    length=1500,
    sliderlength=100,
)
slider.set(0)
slider.pack()

# text: the algorithm table on the left ...
table_textVar = StringVar(value=iterations_list[0].to_string())
table_label = Label(
    dijkstra_visualization,
    textvariable=table_textVar,
    font="Courier 14",
    justify="left",
    padx=100,
)
table_label.pack(side="left")

# ... and next to it the text layout under a 20-character ruler
output_textVar = StringVar(value="12345678901234567890\n" + string_progress_list[0])
output_label = Label(
    dijkstra_visualization,
    textvariable=output_textVar,
    font="Courier 20",
    justify="left",
    anchor="w",
    padx=100,
)
output_label.pack(side="left")


# mainloop
# The window is driven by polling: process pending Tk events, then redraw the
# table and the text whenever the slider has moved to another iteration.
windowOpen = True
while windowOpen:
    try:
        dijkstra_visualization.update_idletasks()
        dijkstra_visualization.update()
        if slider.get() != check:
            check = slider.get()
            index_to_show = check
            iters_text = iterations_list[index_to_show].to_string()
            table_textVar.set(iters_text)
            out_text = "12345678901234567890\n" + string_progress_list[index_to_show]
            output_textVar.set(out_text)
    except TclError:
        # raised by the Tk calls once the window has been closed
        windowOpen = False
        print("window successfully closed")
    else:
        # Pause briefly between polls so the loop does not spin a CPU core at
        # 100% while the window is idle (sleep is imported with the other
        # imports at the top of the notebook).
        sleep(0.01)

The following cells illustrate the use of Dijkstra's algorithm to solve the line break problem with a large text input and varying $L$.

In [None]:
# new objective function
option = 2  # objective currently selected (codes are listed in obj_func_option)
def obj_func_option(option):
    """Fill the global arc structures using the selected objective.

    For every pair ``i < j`` the arc ``i -> j`` stands for one line holding
    words ``i+1 .. j``; it is flagged in ``isArc[i][j]`` and its cost is
    written to ``A[i, j]``. With ``length`` being the number of characters on
    that line (one blank between neighbouring words) the cost is:

        option == 0:      length - L
        option == 1:      | length - L |
        anything else:    ( length - L ) ** 2

    Reads the globals ``c`` and ``L``; mutates the globals ``A`` and ``isArc``
    in place (no ``global`` statement is needed because nothing is rebound).

    NOTE(review): option 0 gives negative costs for lines shorter than L,
    which Dijkstra's algorithm does not handle in general - confirm that
    this objective is meant for demonstration only.
    """
    n = len(A)
    for i in range(n):
        # Grow the line one word at a time instead of re-summing it for every
        # (i, j) pair. Starting at -1 cancels the blank that would otherwise
        # be counted in front of the first word of the line.
        line_length = -1
        for j in range(i + 1, n):
            line_length += c[j] + 1
            isArc[i][j] = True
            if option == 0:
                A[i, j] = line_length - L
            elif option == 1:
                A[i, j] = abs(L - line_length)
            else:
                A[i, j] = (L - line_length) ** 2


# new input
text = "Edsger Wybe Dijkstra; 11 May 1930 - 6 August 2002 was a Dutch computer scientist, programmer, software engineer, systems scientist, science essayist, and pioneer in computing science. A theoretical physicist by training, he worked as a programmer at the Mathematisch Centrum (Amsterdam) from 1952 to 1962. A university professor for much of his life, Dijkstra held the Schlumberger Centennial Chair in Computer Sciences at the University of Texas at Austin from 1984 until his retirement in 1999. He was a professor of mathematics at the Eindhoven University of Technology (1962-1984) and a research fellow at the Burroughs Corporation (1973-1984). In 1972, he became the first person to win the Turing Award who was neither American nor British. Via Wikepedia.com"
L = 35  # initial desired line length, in characters
# Node 0 is an empty placeholder word acting as the start node, so for a text
# of n words len(words) == n + 1.
# split() with no separator treats any run of spaces, tabs or newlines as one
# break, so irregular spacing never produces empty "words".
words = [''] + text.split()
c = [len(word) for word in words]  # number of characters in each word
A = np.zeros((len(words), len(words)))  # (n+1) x (n+1) arc-cost matrix, filled in by the objective function
isArc = [[False] * len(words) for _ in words]  # isArc[i][j] is set to True once arc i -> j exists
obj_func_option(2)  # start with the squared-deviation objective

# GUI: main window of the interactive line-break demo
master = Tk()
master.geometry("1600x700")
master.title("The Line Break Problem")

# slider: chooses the desired number of characters per line
check = L  # slider value the displayed layout was computed for
slider = Scale(
    master,
    from_=5,
    to=100,
    orient=HORIZONTAL,
    font="Courier 14",
    label="Desired Characters per Line",
    length=1500,
    sliderlength=100,
)
slider.set(L)
slider.grid(row=0, column=0, columnspan=12)

# change alignment
just = "left"  # alignment currently applied to the text columns
def justify(str):
    """Switch the five text columns to the alignment named by ``str``.

    ``str`` is passed on as the ``justify`` option of the labels (the buttons
    in this notebook pass "left", "center" or "right"). The value is stored
    in the global ``just`` and every label in the global ``labels`` is
    destroyed and rebuilt on the same grid cell with the new alignment.

    Note: the parameter name shadows the builtin ``str`` inside this function.
    """
    global labels, just
    just = str
    for col in range(5):
        labels[col].destroy()
        rebuilt = Label(master, textvariable=textVars[col], font="Courier 20", justify=just)
        labels[col] = rebuilt
        rebuilt.grid(row=6, column=2 * (col + 1))

def swap(choice):
    """Select objective ``choice`` and refresh the displayed text.

    Stores ``choice`` in the global ``option``, recomputes the arc costs with
    ``obj_func_option``, reruns ``dijkstras`` from node 0 and writes the five
    column strings from ``splitOutput`` into the global ``textVars``.
    """
    global option, textVars
    option = choice
    obj_func_option(choice)
    columns = splitOutput(dijkstras(A, 0))
    for col in range(5):
        textVars[col].set(columns[col])
    
# menu heading above the two button columns
menuLabel = Label(master, text="Menu", font="Courier 20", justify="left")
menuLabel.grid(row=2, column=0, columnspan=2)

# buttons, column 0: text alignment
justifyL = Button(master, text="Left-align",
                  command=lambda: justify("left"))
justifyM = Button(master, text="Center-align",
                  command=lambda: justify("center"))
justifyR = Button(master, text="Right-align",
                  command=lambda: justify("right"))
justifyL.grid(row=3, column=0)
justifyM.grid(row=4, column=0)
justifyR.grid(row=5, column=0)

# buttons, column 1: objective function (codes 0, 1, 2 of obj_func_option)
obj1 = Button(master, text=" length - L ",
              command=lambda: swap(0))
obj2 = Button(master, text="| length - L |",
              command=lambda: swap(1))
obj3 = Button(master, text="( length - L )^2",
              command=lambda: swap(2))
obj1.grid(row=3, column=1)
obj2.grid(row=4, column=1)
obj3.grid(row=5, column=1)

# formats output for columns
colLength = 22  # number of lines placed in a full column
def splitOutput(p):
    """Break the global ``words`` into lines and spread them over 5 columns.

    Parameters:
        p: predecessor list as produced by ``dijkstras``; ``p[j]`` is the node
           preceding ``j`` on the path and NaN marks a node without one.

    Returns:
        A list of five strings, one per display column. A full column holds
        ``colLength`` lines, each ending in "\\n". The column that receives
        the final line holds everything that is left, joined with "\\n" and
        without a trailing newline, so it can be one line longer. Unused
        columns are empty strings. Lines that do not fit into five full
        columns are dropped. If the last node has no predecessor, all five
        strings are empty.
    """
    strs = ["", "", "", "", ""]

    # Follow the predecessors back from the last word to get the break nodes.
    breaks = []
    j = len(p) - 1
    while not math.isnan(p[j]):
        j = p[j]
        breaks.append(j)
    if not breaks:
        return strs
    breaks.reverse()

    # Each consecutive pair (a, b) of break nodes is one line: words a+1 .. b;
    # the final line runs from the last break to the end of the text.
    lines = [' '.join(words[a + 1:b + 1]) for a, b in zip(breaks, breaks[1:])]
    lines.append(' '.join(words[breaks[-1] + 1:]))

    first = 0  # index of the first line not yet placed in a column
    for col in range(len(strs)):
        if len(lines) - 1 - first > colLength:
            # more than a column's worth of lines precedes the final line
            strs[col] = ''.join(line + "\n" for line in lines[first:first + colLength])
            first += colLength
        else:
            # Everything left, final line included, goes into this column.
            # This also covers a text that fits on one single line, which
            # previously produced five empty columns.
            strs[col] = "\n".join(lines[first:])
            break
    return strs

# text: five side-by-side labels, each bound to its own StringVar
start = splitOutput(dijkstras(A, 0))  # layout for the initial L and objective
textVars = [None] * 5
labels = [None] * 5
for i in range(5):
    textVars[i] = StringVar(value=start[i])
    labels[i] = Label(
        master,
        textvariable=textVars[i],
        font="Courier 20",
        justify=just,
    )
    labels[i].grid(row=6, column=2 * (i + 1))
    
# mainloop
# The window is driven by polling: process pending Tk events, then recompute
# the line breaks whenever the slider reports a new desired line length.
# NOTE: every dijkstras() call also appends its per-iteration snapshots to the
# global iterations_list / string_progress_list, so those lists keep growing
# while the slider is being moved.
windowOpen = True
while windowOpen:
    try:
        master.update_idletasks()
        master.update()
        if slider.get() != check:
            check = slider.get()
            L = check
            obj_func_option(option)
            strs = splitOutput(dijkstras(A, 0))
            for i in range(5):
                textVars[i].set(strs[i])
    except TclError:
        # raised by the Tk calls once the window has been closed
        windowOpen = False
        print("window successfully closed")
    else:
        # Pause briefly between polls so the loop does not spin a CPU core at
        # 100% while the window is idle (sleep is imported with the other
        # imports at the top of the notebook).
        sleep(0.01)