**Imports**

In [2]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import collections
import copy

1. Loading the data

In [3]:
# Read the entire input file into a single string.
with open ('data/data_small.txt', 'r') as f:
    text_to_segment = f.read()

# Character count of the loaded text; a later cell uses it to size the marker vector s.
char_num = len(text_to_segment)
print('The number of characters in the text is: ', char_num)

The number of characters in the text is:  652297


Initializing the boundary marker vector s (s[i] = 1 ends a word right after character i)

In [4]:
# Draw one random 0/1 marker per character of the text.
# NOTE(review): no random seed is set, so this draw differs between kernel runs.
# NOTE(review): this module-level s is not passed to CRP in the cells below;
# CRP calls segment_text(text) without s, which draws its own markers.
s = np.random.randint(2, size=char_num)
print('The number of 1s in the s distribution is: ', np.sum(s))
print(s[:10])

The number of 1s in the s distribution is:  326087
[0 0 1 1 0 0 1 1 1 0]


Base functions for CRP

In [5]:
def segment_text(text_to_segment, s = None):
    """Split a string into words according to per-character boundary markers.

    ``s[i] == 1`` closes the current word right after character ``i``; any
    other value lets the word keep growing.

    Parameters
    ----------
    text_to_segment : str
        The text to split.
    s : sequence of int, optional
        One marker per character. When omitted, markers are drawn with
        ``np.random.randint(2, size=len(text_to_segment))``.

    Returns
    -------
    tuple
        ``(words, text_len, unique_chars, s)``: the list of words, the number
        of characters in the text, the number of distinct characters, and the
        marker sequence that was used (the same object that was passed in, if
        one was).
    """
    text_len = len(text_to_segment)
    unique_chars = len(set(text_to_segment))
    if s is None:
        s = np.random.randint(2, size=text_len)
    words = []
    word = ''
    for i in range(len(s)):
        word += text_to_segment[i]
        if s[i] == 1:
            words.append(word)
            word = ''
    # Only keep the trailing fragment when it is non-empty. Appending it
    # unconditionally produced a spurious '' word whenever the last marker
    # was 1 (or the text was empty), which skewed word counts and left a
    # trailing separator in the joined output.
    if word:
        words.append(word)
    return words, text_len, unique_chars, s

def find_previous_word(index, text, s):
    """Return the stretch of ``text`` that ends at ``index`` (inclusive).

    The stretch starts right after the closest marker equal to 1 that lies
    strictly before ``index`` (or at the start of the text if there is none).
    """
    if index == 0:
        return text[index]

    ends_here = s[index] == 1
    if ends_here and s[index - 1] == 1:
        # Boundaries on both sides: the word is the single character at index.
        return text[index]

    # Start scanning one step to the left when index itself carries a marker,
    # so that marker is not mistaken for the word's left edge.
    if ends_here and s[index - 1] == 0:
        cursor = index - 1
    else:
        cursor = index

    # Walk left over the run of zero markers.
    while cursor >= 0 and s[cursor] == 0:
        cursor -= 1

    word_start = max(cursor + 1, 0)
    return text[word_start:index + 1]

def find_next_word(index, text, s):
    """Return the stretch of ``text`` that begins right after ``index``.

    The stretch runs up to and including the first position at or after
    ``index + 1`` whose marker is non-zero; if no such marker exists, it runs
    to the end of the text.
    """
    first = index + 1
    for stop in range(first, len(s)):
        if s[stop] != 0:
            # Found the closing marker: include that character in the word.
            return text[first:stop + 1]
    # No closing marker before the end of s.
    return text[first:]

def calculate_word_probability(word, likelihood, unique_chars):
    """Base probability of ``word`` from its length alone.

    The value is the product of three factors:
    ``(1 / unique_chars) ** len(word)`` (each character equally likely),
    ``likelihood ** (len(word) - 1)`` (continuing after every character but
    the last), and ``1 - likelihood`` (stopping after the last character).
    """
    n_chars = len(word)
    per_char = 1.0 / float(unique_chars)

    spelling = per_char ** n_chars
    keep_going = likelihood ** (n_chars - 1)
    stop = 1 - likelihood

    return spelling * keep_going * stop

Simple functions

In [6]:
def update_word_count(word, word_counts, increment):
    """Add ``increment`` to the stored count of ``word``, never going below 0."""
    adjusted = word_counts[word] + increment
    # Clamp so a decrement on an absent or exhausted word leaves the count at zero.
    word_counts[word] = adjusted if adjusted > 0 else 0

def calculate_p0(alpha, p_c, word, count_word, total_count, unique_chars):
    """Unnormalised score for treating ``word`` as a single word.

    Computes ``(alpha * base + count_word) / (alpha + total_count)`` where
    ``base`` is ``calculate_word_probability(word, p_c, unique_chars)``.
    """
    base = calculate_word_probability(word, p_c, unique_chars)
    numerator = alpha * base + count_word
    return numerator / (alpha + total_count)

def calculate_p1(alpha, p_c, p_cont, prev_word, next_word, word_counts, total_count, unique_chars):
    """Unnormalised score for splitting into ``prev_word`` followed by ``next_word``.

    The result is the product of a factor for ``prev_word`` (denominator
    ``alpha + total_count``), a factor for ``next_word`` (denominator
    ``alpha + total_count + 1``) and ``p_cont``. Counts are read from
    ``word_counts``.
    """
    prev_base = calculate_word_probability(prev_word, p_c, unique_chars)
    next_base = calculate_word_probability(next_word, p_c, unique_chars)

    prev_score = (alpha * prev_base + word_counts[prev_word]) / (alpha + total_count)
    next_numerator = alpha * next_base + word_counts[next_word]

    # NOTE(review): the second denominator is incremented by one, but the
    # second numerator is not incremented when prev_word == next_word.
    # Confirm this matches the intended model.
    return prev_score * next_numerator / (alpha + total_count + 1) * p_cont

def apply_annealing(p_0, p_1, T):
    """Raise two unnormalised scores to the power ``1 / T``.

    Both scores are first divided by the larger of the two, so the larger one
    becomes exactly 1 before exponentiation. Without this step, small scores
    combined with a low temperature underflow to 0.0 for *both* values, and a
    subsequent normalisation can no longer tell them apart. The common factor
    cancels when the pair is normalised, so the ratio between the two returned
    values is the annealed ratio of the inputs.

    Parameters
    ----------
    p_0, p_1 : float
        Unnormalised, non-negative scores.
    T : float
        Temperature. ``T == 0`` raises ``ZeroDivisionError``.

    Returns
    -------
    tuple of float
        The annealed pair, scaled so that the larger entry is 1.0 (or the
        plain powers when neither input is positive).
    """
    exponent = 1 / T
    largest = max(p_0, p_1)
    if largest <= 0:
        # Nothing positive to rescale by; fall back to the plain powers.
        return p_0 ** exponent, p_1 ** exponent
    return (p_0 / largest) ** exponent, (p_1 / largest) ** exponent

def apply_normalization(p_0, p_1):
    """Scale two scores so they sum to 1; return an even split if their sum is 0."""
    combined = p_0 + p_1
    if combined != 0:
        return p_0 / combined, p_1 / combined
    # Both scores vanished: no preference either way.
    return 0.5, 0.5

**Implementation**

In [7]:
def CRP(text, alpha, p_c, p_cont, iterations, T=1, cool_down=1):
    """Resample the word boundaries of ``text`` and return the segmented string.

    Starts from the random segmentation produced by ``segment_text(text)``.
    Each sweep visits every position except the last one in random order,
    removes the word(s) touching that position from the counts, scores the
    "no boundary" and "boundary" alternatives, samples a new marker, and adds
    the resulting word(s) back.

    Parameters
    ----------
    text : str
        Text to segment.
    alpha : float
        Weight of the base word probability in both scores.
    p_c : float
        Passed to ``calculate_word_probability`` as its ``likelihood``.
    p_cont : float
        Extra multiplier applied to the "boundary" score.
    iterations : int
        Number of full sweeps over the text.
    T : float, optional
        Starting temperature handed to ``apply_annealing``.
    cool_down : float, optional
        Factor ``T`` is multiplied by after each sweep.

    Returns
    -------
    str
        The words of the final segmentation joined by single spaces.
    """
    # Random starting segmentation and the word statistics it induces.
    initial_words, n_chars, alphabet_size, s = segment_text(text)
    word_counts = collections.Counter(initial_words)
    total_word_count = sum(word_counts.values())

    for _ in range(iterations):
        # A fresh random visiting order for every sweep.
        for position in np.random.permutation(range(0, n_chars - 1)):
            left = find_previous_word(position, text, s)
            right = find_next_word(position, text, s)
            merged = left + right

            # Take the word(s) currently explained by this position out of the counts.
            if s[position] == 0:
                in_play = (merged,)
            else:
                in_play = (left, right)
            for token in in_play:
                update_word_count(token, word_counts, -1)
            total_word_count -= len(in_play)

            # Score both alternatives against the remaining counts.
            p_0 = calculate_p0(alpha, p_c, merged, word_counts[merged], total_word_count, alphabet_size)
            p_1 = calculate_p1(alpha, p_c, p_cont, left, right, word_counts, total_word_count, alphabet_size)

            # Temper, then normalise into a proper two-way distribution.
            p_0, p_1 = apply_normalization(*apply_annealing(p_0, p_1, T))

            # Sample the new marker for this position.
            s[position] = np.random.choice([0, 1], p=[p_0, p_1])

            # Put the word(s) implied by the sampled marker back into the counts.
            if s[position] == 0:
                in_play = (merged,)
            else:
                in_play = (left, right)
            for token in in_play:
                update_word_count(token, word_counts, 1)
            total_word_count += len(in_play)

        # Cool the temperature before the next sweep.
        T = T * cool_down

    # Re-split the text with the final markers and join the words with spaces.
    return " ".join(segment_text(text, s)[0])


**Execution**

In [8]:

# Reuse the cached segmentation if it exists; otherwise run the sampler and cache its output.
try:
    with open('data/initial_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Cache miss only. Any other error (or a keyboard interrupt) now surfaces
    # instead of silently starting a long recomputation.
    alpha = 100
    p_c = 0.5
    p_cont = 0.99
    T=1
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 1000, T)
    with open('data/initial_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

Evaluation

In [9]:
def _read_boundaries(path):
    """Read a space-separated segmentation file into boundary markers.

    Returns ``(spaces, word_count)``. ``spaces`` starts with a 1 and then
    holds, for every character of every word in reading order, 1 if the
    character is the last one of its word and 0 otherwise.
    """
    spaces = [1]
    word_count = 0
    with open(path) as f:
        for line in f:
            for word in line.strip().split(" "):
                if word == '':
                    continue
                spaces.extend([0] * (len(word) - 1))
                spaces.append(1)
                word_count += 1
    return spaces, word_count


def eval(gold_file, result_file):
    """Score a segmentation file against a gold segmentation file.

    A word counts as correct when the two files agree on the boundary that
    opens it, on the boundary that closes it, and on every position in
    between.

    Parameters
    ----------
    gold_file : str
        Path to the reference segmentation (words separated by spaces).
    result_file : str
        Path to the segmentation being evaluated, in the same format.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. A value is 0.0 when its denominator
        would be zero.

    Notes
    -----
    If the two files yield marker sequences of different lengths, a warning
    is printed and only the common prefix is compared.
    The function name shadows the built-in ``eval``; it is kept because other
    cells call it by this name.
    """
    gold_spaces, gold_wc = _read_boundaries(gold_file)
    test_spaces, test_wc = _read_boundaries(result_file)

    if len(test_spaces) != len(gold_spaces):
        print("WARNING: Different sizes of test and gold files: TEST:", len(test_spaces), "GOLD:", len(gold_spaces))

    begin_ok = 0
    correct_count = 0
    # zip stops at the shorter sequence, so a short result file no longer
    # raises IndexError right after the warning above.
    for gold_mark, test_mark in zip(gold_spaces, test_spaces):
        if gold_mark == 1 and test_mark == 1:
            # Shared boundary: it closes a correct word only if the two
            # sequences have agreed since the previous shared boundary.
            if begin_ok == 1:
                correct_count += 1
            begin_ok = 1
        elif gold_mark != test_mark:
            begin_ok = 0

    # Guard every division so empty files or zero correct words give 0.0
    # instead of ZeroDivisionError.
    precision = correct_count / test_wc if test_wc else 0.0
    recall = correct_count / gold_wc if gold_wc else 0.0
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1


In [10]:
# Score the cached initial segmentation file against the gold segmentation file.
initial_precision, initial_recall, initial_f1 = eval('data/data_small_gold.txt', 'data/initial_results.txt')
print('Initial precision: ', initial_precision, 'Initial recall: ', initial_recall, 'Initial f1: ', initial_f1)

Initial precision:  0.19022270556978568 Initial recall:  0.31407015388492493 Initial f1:  0.23693880764169242


Trying out higher alpha and p_c values

In [11]:
# Reuse the cached high-alpha / high-p_c segmentation if it exists; otherwise run the sampler and cache its output.
try:
    with open('data/high_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Cache miss only. Any other error (or a keyboard interrupt) now surfaces
    # instead of silently starting a long recomputation.
    alpha = 10000
    p_c = 0.99
    p_cont = 0.99
    T=1
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 1000, T)
    with open('data/high_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [12]:
# Score the cached high-alpha / high-p_c segmentation file against the gold segmentation file.
high_precision, high_recall, high_f1 = eval('data/data_small_gold.txt', 'data/high_results.txt')
print('High precision: ', high_precision, 'High recall: ', high_recall, 'High f1: ', high_f1)

High precision:  0.20540769404976417 High recall:  0.32857660382113435 High f1:  0.25278706800445927


Trying out with annealing

In [13]:
# Reuse the cached annealed segmentation if it exists; otherwise run the sampler with cooling and cache its output.
try:
    with open('data/annealing_results.txt', 'r') as output_file:
        segmented_text = output_file.read()
except FileNotFoundError:
    # Cache miss only. Any other error (or a keyboard interrupt) now surfaces
    # instead of silently starting a long recomputation.
    alpha = 10000
    p_c = 0.99
    p_cont = 0.99
    T=1
    # The temperature is multiplied by this factor after every sweep.
    cool_down = 0.99
    segmented_text = CRP(text_to_segment, alpha, p_c, p_cont, 1000, T, cool_down)
    with open('data/annealing_results.txt', 'w') as output_file:
        output_file.write(segmented_text)

In [None]:
# Score the cached annealed segmentation file against the gold segmentation file.
annealing_precision, annealing_recall, annealing_f1 = eval('data/data_small_gold.txt', 'data/annealing_results.txt')