**Imports**

In [34]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import collections
import copy

1. Loading the data

In [35]:
# Read the entire text that will be segmented into memory as one string.
with open ('data/data_small.txt', 'r') as f:
    text_to_segment = f.read()

# Character count of the loaded text, printed as a quick sanity check.
char_num = len(text_to_segment)
print('The number of characters in the text is: ', char_num)

The number of characters in the text is:  652297


Initializing the s distribution

In [36]:
# Initialize the boundary vector s: one random 0/1 flag per character.
# segment_text() interprets s[i] == 1 as "a word ends after character i".
# NOTE: no random seed is set, so the values printed below differ between runs.
s = np.random.randint(2, size=char_num)
# Number of flags that are set to 1.
print('The number of 1s in the s distribution is: ', np.sum(s))
# Peek at the first ten flags.
print(s[:10])

The number of 1s in the s distribution is:  326126
[1 1 0 1 1 1 0 0 1 0]


Base functions for CRP

In [37]:
def segment_text(text_to_segment, s = None):
    """Split ``text_to_segment`` into words according to a boundary vector.

    Parameters
    ----------
    text_to_segment : str
        The text to split.
    s : sequence of int, optional
        One flag per character; ``s[i] == 1`` means a word ends right after
        character ``i``. When omitted, a random 0/1 vector with one entry per
        character is drawn with ``np.random.randint``.

    Returns
    -------
    tuple
        ``(words, text_len, unique_chars, s)`` where ``words`` is the list of
        segmented words, ``text_len`` the number of characters in the text,
        ``unique_chars`` the number of distinct characters, and ``s`` the
        boundary vector that was used.
    """
    text_len = len(text_to_segment)
    unique_chars = len(set(text_to_segment))
    if s is None:
        s = np.random.randint(2, size=text_len)

    words = []
    current = []  # characters of the word being built
    for i in range(len(s)):
        current.append(text_to_segment[i])
        if s[i] == 1:
            words.append(''.join(current))
            current = []
    # Keep the unterminated tail, but never emit an empty word: when the last
    # flag is 1 (or the text is empty) nothing is left over, and appending ''
    # would add a bogus entry to any word count built from this list.
    if current:
        words.append(''.join(current))
    return words, text_len, unique_chars, s

def find_previous_word(index, text, s):
    """Return the stretch of ``text`` that ends at ``index`` (inclusive).

    The stretch starts right after the nearest earlier position whose flag in
    ``s`` is non-zero (or at the start of the text when there is none). The
    flag at ``index`` itself does not limit how far back the scan goes.
    """
    # The very first character has nothing before it.
    if index == 0:
        return text[index]
    # Boundaries on both sides of the character: it is a word on its own.
    if s[index] == 1 and s[index - 1] == 1:
        return text[index]

    # Pick where the backward scan starts.
    if s[index] == 1 and s[index - 1] == 0:
        cursor = index - 1
    else:
        cursor = index

    # Walk left over flag-0 positions until a boundary or the text start.
    while cursor >= 0 and s[cursor] == 0:
        cursor -= 1

    word_start = max(cursor + 1, 0)
    return text[word_start:index + 1]

def find_next_word(index, text, s):
    """Return the stretch of ``text`` that starts right after ``index``.

    The stretch runs from ``index + 1`` up to and including the first position
    whose flag in ``s`` is non-zero; when no such position exists it runs to
    the end of the text.
    """
    first = index + 1
    n_flags = len(s)

    # Advance over flag-0 positions until a boundary or the end of s.
    last = first
    while last < n_flags:
        if s[last] != 0:
            break
        last += 1

    if last >= n_flags:
        # Ran off the end without meeting a boundary.
        return text[first:]
    return text[first:last + 1]

def calculate_word_probability(word, likelihood, unique_chars):
    """Base probability of ``word`` as a product of three factors.

    * ``(1 / unique_chars) ** len(word)`` - each character drawn uniformly;
    * ``likelihood ** (len(word) - 1)`` - one factor per step from a
      character to the next one;
    * ``1 - likelihood`` - a single factor for ending the word.
    """
    length = len(word)
    char_term = (1.0 / float(unique_chars)) ** length
    keep_going_term = likelihood ** (length - 1)
    stop_term = 1 - likelihood

    return char_term * keep_going_term * stop_term

Simple functions

In [38]:
def update_word_count(word, word_counts, increment):
    """Add ``increment`` to the count of ``word``, never letting it go below 0.

    ``word_counts`` must return a number for unseen words (as a
    ``collections.Counter`` does); the entry is written back even when the
    resulting count is 0.
    """
    shifted = word_counts[word] + increment
    word_counts[word] = shifted if shifted > 0 else 0

def calculate_p0(alpha, p_c, word, count_word, total_count, unique_chars):
    """Score for keeping ``word`` as a single, unsplit word.

    Computes ``(alpha * P0(word) + count_word) / (alpha + total_count)``,
    where ``P0`` is ``calculate_word_probability`` evaluated with ``p_c`` and
    ``unique_chars``.
    """
    base_prob = calculate_word_probability(word, p_c, unique_chars)
    numerator = alpha * base_prob + count_word
    denominator = alpha + total_count
    return numerator / denominator

def calculate_p1(alpha, p_c, p_cont, prev_word, next_word, word_counts, total_count, unique_chars):
    """Score for splitting into ``prev_word`` followed by ``next_word``.

    Multiplies the score of ``prev_word`` (denominator ``alpha + total_count``)
    by the score of ``next_word`` (denominator ``alpha + total_count + 1``)
    and by ``p_cont``. Each word's numerator is
    ``alpha * P0(word) + word_counts[word]``.
    """
    prev_numerator = alpha * calculate_word_probability(prev_word, p_c, unique_chars) + word_counts[prev_word]
    next_numerator = alpha * calculate_word_probability(next_word, p_c, unique_chars) + word_counts[next_word]

    # Apply the factors strictly left to right, exactly as a single chained
    # expression would, so floating-point results are unchanged.
    score = prev_numerator / (alpha + total_count)
    score = score * next_numerator
    score = score / (alpha + total_count + 1)
    return score * p_cont

def apply_annealing(p_0, p_1, T):
    """Raise both scores to the power ``1 / T`` and return them as a pair.

    ``T`` must be non-zero.
    """
    exponent = 1 / T
    return p_0 ** exponent, p_1 ** exponent

def apply_normalization(p_0, p_1):
    """Rescale the two scores so they sum to 1.

    When both scores add up to exactly 0 there is nothing to rescale, so an
    even ``(0.5, 0.5)`` split is returned instead.
    """
    norm = p_0 + p_1
    if norm != 0:
        return p_0 / norm, p_1 / norm
    return 0.5, 0.5

**Implementation**

In [39]:
def CRP(text, alpha, p_c, p_cont, iterations, T=1, cool_down=1):
    """Segment ``text`` into words by repeatedly resampling boundary flags.

    Starts from a random 0/1 boundary vector (drawn inside ``segment_text``),
    then performs ``iterations`` sweeps. In each sweep every position except
    the last one is visited once in random order and its flag is resampled
    from the normalized, temperature-adjusted scores ``p_0`` (no boundary)
    and ``p_1`` (boundary).

    Parameters
    ----------
    text : str
        Text to segment.
    alpha, p_c, p_cont
        Passed through to ``calculate_p0`` / ``calculate_p1``.
    iterations : int
        Number of full sweeps over the text.
    T : float
        Starting temperature; scores are raised to ``1 / T`` before sampling.
    cool_down : float
        Factor applied to ``T`` after every sweep (1 keeps it constant).

    Returns
    -------
    str
        The segmented words joined by single spaces.
    """
    # Create initial word segmentation
    words, text_length, unique_chars, s = segment_text(text)
    word_counts = collections.Counter(words)
    total_word_count = sum(word_counts.values())
    for _ in range(iterations):
        # Iterate randomly through text positions
        # (the last position, text_length - 1, is excluded, so its flag is
        # never resampled)
        for position in np.random.permutation(range(0, text_length - 1)):
            # Text on each side of the candidate boundary after `position`.
            previous_word = find_previous_word(position, text, s)
            next_word = find_next_word(position, text, s)
            # Take the word(s) currently touching this position out of the
            # counts, so the scores below are computed without them.
            if s[position] == 0:
                update_word_count(previous_word + next_word, word_counts, -1)
                total_word_count -= 1
            else:
                update_word_count(previous_word, word_counts, -1)
                update_word_count(next_word, word_counts, -1)
                total_word_count -= 2
            # p_0: score of one merged word; p_1: score of two separate words.
            p_0 = calculate_p0(alpha, p_c, previous_word + next_word, word_counts[previous_word + next_word], total_word_count, unique_chars)
            p_1 = calculate_p1(alpha, p_c, p_cont, previous_word, next_word, word_counts, total_word_count, unique_chars)
            # Annealing
            p_0, p_1 = apply_annealing(p_0, p_1, T)
            # Normalization
            p_0, p_1 = apply_normalization(p_0, p_1)
            # Randomly assign segmentation marker based on probabilities
            s[position] = np.random.choice([0, 1], p=[p_0, p_1])
            # Put the word(s) implied by the newly sampled flag back into the counts.
            if s[position] == 0:
                update_word_count(previous_word + next_word, word_counts, 1)
                total_word_count += 1
            else:
                update_word_count(previous_word, word_counts, 1)
                update_word_count(next_word, word_counts, 1)
                total_word_count += 2
        # Decrease temperature for annealing
        T = T * cool_down
    # Generate final segmented text
    updated_words = segment_text(text, s)[0]
    return " ".join(updated_words)


**Execution**

In [40]:

# Reuse the cached baseline CRP segmentation when it exists; otherwise run the
# sampler and cache its output.
try:
    with open('data/initial_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 100
    p_c = 0.5
    p_cont = 0.99
    T=1
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 100, T)
    with open('data/initial_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

Evaluation

In [41]:
def _read_boundaries(path):
    """Read a space-segmented file; return ``(boundary_flags, word_count)``.

    The flag list starts with a leading 1 and then holds, for every word,
    ``len(word) - 1`` zeros followed by a 1. Empty tokens are skipped.
    """
    spaces = [1]
    word_count = 0
    with open(path) as f:
        for line in f:
            for word in line.strip().split(" "):
                if word == '':
                    continue
                spaces.extend([0] * (len(word) - 1))
                spaces.append(1)
                word_count += 1
    return spaces, word_count


def eval(gold_file, result_file):
    """Score a segmentation file against a gold segmentation file.

    NOTE: the name shadows Python's built-in ``eval``; it is kept because the
    rest of the notebook calls it under this name.

    A word counts as correct when both of its boundaries are present in both
    files and the two files agree at every position in between.

    Parameters
    ----------
    gold_file : str
        Path to the reference segmentation (words separated by spaces).
    result_file : str
        Path to the segmentation to evaluate (same format).

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. A ratio whose denominator is zero is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    gold_spaces, gold_wc = _read_boundaries(gold_file)
    test_spaces, test_wc = _read_boundaries(result_file)

    if len(test_spaces) != len(gold_spaces):
        print("WARNING: Different sizes of test and gold files: TEST:", len(test_spaces), "GOLD:", len(gold_spaces))

    begin_ok = 0
    correct_count = 0
    # zip() stops at the shorter list, so a result file that is shorter than
    # the gold file is scored on the overlapping part instead of raising
    # IndexError right after the warning above.
    for gold_flag, test_flag in zip(gold_spaces, test_spaces):
        if gold_flag == 1 and test_flag == 1:
            # Shared boundary: it closes a correct word only if the previous
            # shared boundary was not followed by any disagreement.
            if begin_ok == 1:
                correct_count += 1
            begin_ok = 1
        elif gold_flag != test_flag:
            begin_ok = 0

    precision = correct_count / test_wc if test_wc else 0.0
    recall = correct_count / gold_wc if gold_wc else 0.0
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1


In [42]:
# Score the segmentation stored in data/initial_results.txt against the gold file.
initial_precision, initial_recall, initial_f1 = eval('data/data_small_gold.txt', 'data/initial_results.txt')
print('Initial precision: ', initial_precision, 'Initial recall: ', initial_recall, 'Initial f1: ', initial_f1)

Initial precision:  0.1774104501205638 Initial recall:  0.2960906613744113 Initial f1:  0.22187731448012535


We can see above our initial results.

Trying out with high alpha and pc values

In [43]:
# Reuse the cached high-alpha / high-p_c CRP segmentation when it exists;
# otherwise run the sampler and cache its output.
try:
    with open('data/high_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 10000
    p_c = 0.99
    p_cont = 0.99
    T=1
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 100, T)
    with open('data/high_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [44]:
# Score the segmentation stored in data/high_results.txt against the gold file.
high_precision, high_recall, high_f1 = eval('data/data_small_gold.txt', 'data/high_results.txt')
print('High precision: ', high_precision, 'High recall: ', high_recall, 'High f1: ', high_f1)

High precision:  0.17553939393939394 High recall:  0.2851281324323899 High f1:  0.21729866637857564


Trying out with annealing

In [45]:
# Reuse the cached annealed CRP segmentation (T=1, cool_down=0.99) when it
# exists; otherwise run the sampler and cache its output.
try:
    with open('data/annealing_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 10000
    p_c = 0.99
    p_cont = 0.99
    T=1
    cool_down = 0.99
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 100, T, cool_down)
    with open('data/annealing_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [46]:
# Score the segmentation stored in data/annealing_results.txt against the gold file.
annealing_precision, annealing_recall, annealing_f1 = eval('data/data_small_gold.txt', 'data/annealing_results.txt')
print('Annealing precision: ', annealing_precision, 'Annealing recall: ', annealing_recall, 'Annealing f1: ', annealing_f1)

Annealing precision:  0.14821982188345412 Annealing recall:  0.23645040873222134 Annealing f1:  0.18221653082156447


Annealing from T=1 did not work well (F1 dropped from about 0.217 to 0.182), so let's try increasing the initial temperature

In [47]:
# Reuse the cached annealed CRP segmentation started at T=5 when it exists;
# otherwise run the sampler and cache its output.
try:
    with open('data/temp_annealing_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 10000
    p_c = 0.99
    p_cont = 0.99
    T=5
    cool_down = 0.99
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 100, T, cool_down)
    with open('data/temp_annealing_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [48]:
# Score the segmentation stored in data/temp_annealing_results.txt against the gold file.
temp_annealing_precision, temp_annealing_recall, temp_annealing_f1 = eval('data/data_small_gold.txt', 'data/temp_annealing_results.txt')
print('Temperature annealing precision: ', temp_annealing_precision, 'Temperature annealing recall: ', temp_annealing_recall, 'Temperature annealing f1: ', temp_annealing_f1)

Temperature annealing precision:  0.18599802930496506 Temperature annealing recall:  0.3211028682133913 Temperature annealing f1:  0.23555273115919


Pitman-Yor Process

In [49]:
#base functions for Pitman-Yor process
def update_word_count_PitmanYor(word, word_counts, increment, K):
    """Adjust the count of ``word`` and return the updated number of word types.

    The count is clamped at 0 and always written back to ``word_counts``.
    ``K`` is the number of distinct words with a non-zero count: it goes up by
    one when ``word`` moves from 0 to a positive count and down by one when it
    drops back to 0.

    ``K`` is a plain integer, so rebinding it inside this function cannot
    change the caller's variable; the new value is therefore returned and the
    caller has to store it (``K = update_word_count_PitmanYor(...)``).

    Parameters
    ----------
    word : str
        Word whose count changes.
    word_counts : mapping
        Word -> count mapping, modified in place.
    increment : int
        Amount to add (may be negative).
    K : int
        Current number of distinct words with a non-zero count.

    Returns
    -------
    int
        The updated ``K``.
    """
    before = word_counts.get(word, 0)
    after = max(0, before + increment)
    word_counts[word] = after
    if before == 0 and after > 0:
        K += 1
    elif before > 0 and after == 0:
        K -= 1
    return K

def calculate_word_probability_PitmanYor(word, counts, discount, alpha, word_num, unique_chars, p_c, K):
    """Discount-adjusted probability of ``word``.

    Both cases share the denominator ``alpha + word_num - 1``:

    * ``counts[word] > 0``: ``(counts[word] - discount) / denominator``;
    * otherwise: ``(alpha + discount * K) / denominator`` multiplied by the
      base probability from ``calculate_word_probability``.
    """
    denominator = alpha + word_num - 1
    if not counts[word] > 0:
        # Word currently has no count: fall back to the base distribution.
        fallback_mass = (alpha + discount * K) / denominator
        return fallback_mass * calculate_word_probability(word, p_c, unique_chars)
    return (counts[word] - discount) / denominator
    
def calculate_p0_PitmanYor(alpha, p_c, word, counts, total_count, unique_chars, discount, word_num, K):
    """Score for keeping ``word`` as a single, unsplit word.

    Computes ``(alpha * P(word) + counts[word]) / (alpha + total_count)``,
    where ``P`` is ``calculate_word_probability_PitmanYor``.
    """
    word_prob = calculate_word_probability_PitmanYor(word, counts, discount, alpha, word_num, unique_chars, p_c, K)
    numerator = alpha * word_prob + counts[word]
    denominator = alpha + total_count
    return numerator / denominator

def calculate_p1_PitmanYor(alpha, p_c, p_cont, prev_word, next_word, word_counts, total_count, unique_chars, discount, word_num, K):
    """Score for splitting into ``prev_word`` followed by ``next_word``.

    Multiplies the score of ``prev_word`` (denominator ``alpha + total_count``)
    by the score of ``next_word`` (denominator ``alpha + total_count + 1``)
    and by ``p_cont``. Each word's numerator is
    ``alpha * P(word) + word_counts[word]`` with ``P`` being
    ``calculate_word_probability_PitmanYor``.
    """
    prev_numerator = alpha * calculate_word_probability_PitmanYor(prev_word, word_counts, discount, alpha, word_num, unique_chars, p_c, K) + word_counts[prev_word]
    next_numerator = alpha * calculate_word_probability_PitmanYor(next_word, word_counts, discount, alpha, word_num, unique_chars, p_c, K) + word_counts[next_word]

    # Apply the factors strictly left to right, exactly as a single chained
    # expression would, so floating-point results are unchanged.
    score = prev_numerator / (alpha + total_count)
    score = score * next_numerator
    score = score / (alpha + total_count + 1)
    return score * p_cont

In [50]:
def PitmanYor(text, alpha, discount, iterations, p_c, p_cont, T=1, cool_down=1):
    """Segment ``text`` by resampling boundary flags with discounted scores.

    Starts from a random 0/1 boundary vector (drawn inside ``segment_text``),
    then performs ``iterations`` sweeps. In each sweep every position except
    the last one is visited once in random order and its flag is resampled
    from the normalized, temperature-adjusted scores ``p_0`` (no boundary)
    and ``p_1`` (boundary).

    ``K``, the number of distinct words with a non-zero count, is kept up to
    date after every count change. It is derived here from the count before
    and after each update because an integer passed to
    ``update_word_count_PitmanYor`` cannot be modified in place by it.

    Parameters
    ----------
    text : str
        Text to segment.
    alpha, discount, p_c, p_cont
        Passed through to ``calculate_p0_PitmanYor`` / ``calculate_p1_PitmanYor``.
    iterations : int
        Number of full sweeps over the text.
    T : float
        Starting temperature; scores are raised to ``1 / T`` before sampling.
    cool_down : float
        Factor applied to ``T`` after every sweep (1 keeps it constant).

    Returns
    -------
    str
        The segmented words joined by single spaces.
    """
    # Create initial word segmentation
    words, text_length, unique_chars, s = segment_text(text)
    word_counts = collections.Counter(words)
    K = len(word_counts)
    total_word_count = sum(word_counts.values())

    def _shift_count(word, increment, K):
        """Apply ``increment`` to ``word``'s count and return the updated K."""
        before = word_counts[word]  # Counter lookup: 0 for unseen, no insertion
        update_word_count_PitmanYor(word, word_counts, increment, K)
        after = word_counts[word]
        if before == 0 and after > 0:
            return K + 1
        if before > 0 and after == 0:
            return K - 1
        return K

    for _ in range(iterations):
        # Visit every position except the last one, in random order
        for position in np.random.permutation(range(0, text_length - 1)):
            previous_word = find_previous_word(position, text, s)
            next_word = find_next_word(position, text, s)
            merged_word = previous_word + next_word
            # Take the word(s) currently touching this position out of the
            # counts, so the scores below are computed without them.
            if s[position] == 0:
                K = _shift_count(merged_word, -1, K)
                total_word_count -= 1
            else:
                K = _shift_count(previous_word, -1, K)
                K = _shift_count(next_word, -1, K)
                total_word_count -= 2
            # p_0: score of one merged word; p_1: score of two separate words.
            p_0 = calculate_p0_PitmanYor(alpha, p_c, merged_word, word_counts, total_word_count, unique_chars, discount, total_word_count, K)
            p_1 = calculate_p1_PitmanYor(alpha, p_c, p_cont, previous_word, next_word, word_counts, total_word_count, unique_chars, discount, total_word_count, K)
            # Annealing
            p_0, p_1 = apply_annealing(p_0, p_1, T)
            # Normalization
            p_0, p_1 = apply_normalization(p_0, p_1)
            # Randomly assign segmentation marker based on probabilities
            s[position] = np.random.choice([0, 1], p=[p_0, p_1])
            # Put the word(s) implied by the newly sampled flag back into the counts.
            if s[position] == 0:
                K = _shift_count(merged_word, 1, K)
                total_word_count += 1
            else:
                K = _shift_count(previous_word, 1, K)
                K = _shift_count(next_word, 1, K)
                total_word_count += 2
        # Decrease temperature for annealing
        T = T * cool_down
    # Generate final segmented text
    updated_words = segment_text(text, s)[0]
    return " ".join(updated_words)

Trying out the same parameters as the initial CRP run (alpha=100, p_c=0.5, p_cont=0.99, T=1) so that we can compare

In [51]:
# Reuse the cached Pitman-Yor segmentation (discount=0.7) when it exists;
# otherwise run the sampler and cache its output.
try:
    with open('data/py_7_initial_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 100
    p_c = 0.5
    p_cont = 0.99
    T=1
    discount = 0.7
    segmented_text = PitmanYor(text_to_segment, alpha, discount, 100, p_c, p_cont, T)
    with open('data/py_7_initial_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

Evaluation and comparison

In [53]:
# Score the segmentation stored in data/py_7_initial_results.txt against the gold file.
py_initial_precision, py_initial_recall, py_initial_f1 = eval('data/data_small_gold.txt', 'data/py_7_initial_results.txt')
print('Pitman-Yor initial precision: ', py_initial_precision, 'Pitman-Yor initial recall: ', py_initial_recall, 'Pitman-Yor initial f1: ', py_initial_f1)

Pitman-Yor initial precision:  0.1773841859415972 Pitman-Yor initial recall:  0.297271968372474 Pitman-Yor initial f1:  0.2221875588623093


In [54]:
# Reuse the cached Pitman-Yor segmentation (discount=0.5) when it exists;
# otherwise run the sampler and cache its output.
try:
    with open('data/py_5_initial_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 100
    p_c = 0.5
    p_cont = 0.99
    T=1
    discount = 0.5
    segmented_text = PitmanYor(text_to_segment, alpha, discount, 100, p_c, p_cont, T)
    with open('data/py_5_initial_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [55]:
# Score the segmentation stored in data/py_5_initial_results.txt against the gold file.
py_initial_5_precision, py_initial_5_recall, py_initial_5_f1 = eval('data/data_small_gold.txt', 'data/py_5_initial_results.txt')
print("Pitmam-Yor initial d=0.5 precision: ", py_initial_5_precision, "Pitmam-Yor initial d=0.5 recall: ", py_initial_5_recall, "Pitmam-Yor initial d=0.5 f1: ", py_initial_5_f1)

Pitmam-Yor initial d=0.5 precision:  0.17773175495987387 Pitmam-Yor initial d=0.5 recall:  0.2973743483123061 Pitmam-Yor initial d=0.5 f1:  0.22248867965106517


In [57]:
# Reuse the cached annealed Pitman-Yor segmentation (alpha=10000, T=5) when it
# exists; otherwise run the sampler and cache its output.
try:
    with open('data/py_temp_annealing_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 10000
    p_c = 0.5
    p_cont = 0.99
    T=5
    cool_down = 0.99
    discount = 0.5
    segmented_text = PitmanYor(text_to_segment, alpha, discount, 100, p_c, p_cont, T, cool_down)
    with open('data/py_temp_annealing_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [58]:
# Score the segmentation stored in data/py_temp_annealing_results.txt against the gold file.
py_temp_annealing_precision, py_temp_annealing_recall, py_temp_annealing_f1 = eval('data/data_small_gold.txt', 'data/py_temp_annealing_results.txt')
print('Pitman-Yor temperature annealing precision: ', py_temp_annealing_precision, 'Pitman-Yor temperature annealing recall: ', py_temp_annealing_recall, 'Pitman-Yor temperature annealing f1: ', py_temp_annealing_f1)

Pitman-Yor temperature annealing precision:  0.17221563280973035 Pitman-Yor temperature annealing recall:  0.30988045173179607 Pitman-Yor temperature annealing f1:  0.22139262193389392


In [59]:
# Reuse the cached annealed Pitman-Yor segmentation (alpha=1000, T=5) when it
# exists; otherwise run the sampler and cache its output.
try:
    with open('data/py_temp_annealing_results_1000.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) recomputation; any other
    # error, or an interrupt, is allowed to surface instead of being swallowed.
    alpha = 1000
    p_c = 0.5
    p_cont = 0.99
    T=5
    cool_down = 0.99
    discount = 0.5
    segmented_text = PitmanYor(text_to_segment, alpha, discount, 100, p_c, p_cont, T, cool_down)
    with open('data/py_temp_annealing_results_1000.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [60]:
# Score the segmentation stored in data/py_temp_annealing_results_1000.txt against the gold file.
py_temp_annealing_1000_precision, py_temp_annealing_1000_recall, py_temp_annealing_1000_f1 = eval('data/data_small_gold.txt', 'data/py_temp_annealing_results_1000.txt')
print('Pitman-Yor temperature annealing precision with alpha=1000: ', py_temp_annealing_1000_precision, 'Pitman-Yor temperature annealing recall with alpha=1000: ', py_temp_annealing_1000_recall, 'Pitman-Yor temperature annealing f1 with alpha=1000: ', py_temp_annealing_1000_f1)

Pitman-Yor temperature annealing precision with alpha=1000:  0.15877510290632407 Pitman-Yor temperature annealing recall with alpha=1000:  0.3007371355667911 Pitman-Yor temperature annealing f1 with alpha=1000:  0.2078271943573668


Let's build a dataframe to get a better view of each metric

In [64]:
# Gather every run's metrics in one table so the configurations can be
# compared side by side. pandas is already imported (as pd) in the imports
# cell at the top of the notebook, so it is not re-imported here.
# The three lists and the index below must stay in the same run order.
data = {
    'Precision': [initial_precision, high_precision, annealing_precision, temp_annealing_precision,
                  py_initial_precision, py_initial_5_precision, py_temp_annealing_precision, py_temp_annealing_1000_precision],
    'Recall': [initial_recall, high_recall, annealing_recall, temp_annealing_recall,
               py_initial_recall, py_initial_5_recall, py_temp_annealing_recall, py_temp_annealing_1000_recall],
    'F1': [initial_f1, high_f1, annealing_f1, temp_annealing_f1,
           py_initial_f1, py_initial_5_f1, py_temp_annealing_f1, py_temp_annealing_1000_f1],
}
dataframe = pd.DataFrame(
    data,
    index=['Initial', 'High', 'Annealing', 'Temperature Annealing',
           'Pitman-Yor Initial d=0.7', 'Pitman-Yor Initial d=0.5',
           'Pitman-Yor Temperature Annealing a=1e4', 'Pitman-Yor Temperature Annealing a=1e3'],
)
print(dataframe)

                                        Precision    Recall        F1
Initial                                  0.177410  0.296091  0.221877
High                                     0.175539  0.285128  0.217299
Annealing                                0.148220  0.236450  0.182217
Temperature Annealing                    0.185998  0.321103  0.235553
Pitman-Yor Initial d=0.7                 0.177384  0.297272  0.222188
Pitman-Yor Initial d=0.5                 0.177732  0.297374  0.222489
Pitman-Yor Temperature Annealing a=1e4   0.172216  0.309880  0.221393
Pitman-Yor Temperature Annealing a=1e3   0.158775  0.300737  0.207827


We can see here that our best model was the Chinese Restaurant Process with annealing from a higher initial temperature (T=5), which reached the highest precision, recall and F1