In [2]:
## Gale & Church Algorithm (Length-based)
# https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.gale_church

import numpy as np
import jieba
import nltk
import jieba
import multiprocessing 

from nltk.translate import gale_church

In [3]:
# Fetch the NLTK resources this notebook asks for (skipped by NLTK when the
# package is already up to date), then expose the stopword corpus reader.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Input corpora: one sentence/line per row, blocks separated by blank lines.
en_file = "shinpakusuu_en.txt"                   # English text
zh_file = "shinpakusuu_zh.txt"                   # Chinese text
# NOTE(review): presumably English variants with shuffled / merged lines,
# used to stress the aligner -- confirm against the files themselves.
en_mess_file = "shinpakusuu_en_messed.txt"
en_reorder_file = "shinpakusuu_en_reordered.txt"

def read_len_f(x_file):
    """Read a UTF-8 text file and return per-line character lengths grouped by block.

    A block ends at every line that consists of a newline only. Each non-blank
    line contributes the length of the line with trailing whitespace removed.

    Args:
        x_file: Path of the text file to read.

    Returns:
        A list of blocks, each block being a list of int line lengths.
        Consecutive blank lines produce empty blocks; a trailing block that is
        not followed by a blank line is included only when it is non-empty.
    """
    len_list = []
    len_sub_list = []
    with open(x_file, 'r', encoding='utf-8') as file:
        for line in file:
            if line == "\n":
                # Blank line: close the current block (even when it is empty).
                len_list.append(len_sub_list)
                len_sub_list = []
            else:
                # Length in characters, trailing whitespace/newline excluded.
                len_sub_list.append(len(line.rstrip()))

    # Flush the last block when the file does not end with a blank line.
    if len_sub_list:
        len_list.append(len_sub_list)
    return len_list

def tokenize_length(text, lang='en'):
    """Return the number of tokens in ``text`` after stripping trailing whitespace.

    For ``lang == 'zh'`` the text is segmented with ``jieba.lcut``; for any
    other value it is split on whitespace.
    """
    stripped = text.rstrip()
    tokens = jieba.lcut(stripped) if lang == 'zh' else stripped.split()
    return len(tokens)

def read_word_count_f(x_file, lang='en'):
    """Read a UTF-8 text file and return per-line token counts grouped by block.

    A line consisting of a newline only closes the current block; every other
    line contributes ``tokenize_length(line, lang)``. A trailing block that is
    not followed by a blank line is included only when it is non-empty.
    """
    blocks = []
    current_block = []
    with open(x_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if raw_line != "\n":
                # Token count for this line (words, not characters).
                current_block.append(tokenize_length(raw_line, lang))
                continue
            # Blank line: the block collected so far is complete.
            blocks.append(current_block)
            current_block = []

    if current_block:
        blocks.append(current_block)
    return blocks
    
# Sentence lengths for every input file, grouped by blank-line-separated block:
#   *_list   -> character lengths (read_len_f)
#   *_w_list -> token counts (read_word_count_f; jieba segmentation for 'zh')
en_list, en_w_list = read_len_f(en_file), read_word_count_f(en_file)
zh_list, zh_w_list = read_len_f(zh_file), read_word_count_f(zh_file, 'zh')
en_mess_list, en_mess_w_list = read_len_f(en_mess_file), read_word_count_f(en_mess_file)
en_reorder_list, en_reorder_w_list = read_len_f(en_reorder_file), read_word_count_f(en_reorder_file)

In [6]:
# Whitespace-token counts per line of the English text, one inner list per block.
en_w_list

[[6, 13, 9, 11],
 [12, 9, 10, 7],
 [5, 7, 8, 7],
 [6, 6, 8, 9],
 [12, 9, 9, 7],
 [6, 9, 9, 5],
 [6, 7, 9, 8],
 [5, 7, 8, 5]]

In [7]:
# jieba token counts per line of the Chinese text, one inner list per block.
zh_w_list

[[9, 15, 11, 9],
 [11, 9, 12, 6],
 [7, 7, 5, 8],
 [8, 12, 11, 8],
 [11, 9, 12, 6],
 [6, 5, 5, 9],
 [6, 11, 9, 8],
 [7, 7, 5, 4]]

In [8]:
# Whitespace-token counts per line of the "messed" English file, one inner list per block.
# NOTE(review): presumably the same text with lines shuffled inside some blocks -- confirm.
en_mess_w_list

[[9, 11, 13, 6],
 [7, 9, 12, 10],
 [5, 8, 7, 7],
 [6, 6, 8, 9],
 [12, 9, 9, 7],
 [6, 9, 9, 5],
 [6, 7, 9, 8],
 [5, 7, 8, 5]]

In [39]:
# Whitespace-token counts per line of the "reordered" English file, one inner list per block.
# NOTE(review): presumably the same text with some lines merged or split -- confirm.
en_reorder_w_list

[[19, 11, 9],
 [12, 9, 10],
 [12, 15],
 [6, 6, 15, 2],
 [12, 9, 9, 7],
 [9, 6, 9, 5],
 [6, 7, 9, 8],
 [5, 7, 5, 8]]

In [17]:
# Align each Chinese block (source) with the matching reordered English block (target).
# NOTE(review): zh_list holds character lengths while en_reorder_w_list holds
# word counts, so the two sides use different units -- confirm this is intended.
gale_church.align_texts(source_blocks=zh_list, target_blocks=en_reorder_w_list)

[[(0, 0), (1, 0), (2, 1), (3, 2)],
 [(0, 0), (1, 0), (2, 1), (3, 2)],
 [(0, 0), (1, 0), (2, 1), (3, 1)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)]]

In [13]:
# Same pairing as the previous alignment, with source and target swapped.
# NOTE(review): en_reorder_w_list holds word counts while zh_list holds
# character lengths, so the two sides use different units -- confirm this is intended.
gale_church.align_texts(source_blocks=en_reorder_w_list, target_blocks=zh_list)

[[(0, 0), (0, 1), (1, 2), (2, 3)],
 [(0, 0), (0, 1), (1, 2), (2, 3)],
 [(0, 0), (0, 1), (1, 2), (1, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)]]

In [14]:
# Reordered English (source) against Chinese (target); both sides are character lengths.
gale_church.align_texts(source_blocks=en_reorder_list, target_blocks=zh_list)

[[(0, 0), (0, 1), (1, 2), (2, 3)],
 [(0, 0), (0, 1), (1, 2), (2, 3)],
 [(0, 0), (0, 1), (1, 2), (1, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)]]

In [15]:
# Reordered English (source) against the original English (target); both sides
# are character lengths.
gale_church.align_texts(source_blocks=en_reorder_list, target_blocks=en_list)

[[(0, 0), (0, 1), (1, 2), (2, 3)],
 [(0, 0), (1, 1), (2, 2), (2, 3)],
 [(0, 0), (0, 1), (1, 2), (1, 3)],
 [(0, 0), (1, 1), (2, 2), (2, 3), (3, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)]]

In [16]:
# Reordered English (source) against the original English (target).
# NOTE(review): en_reorder_w_list holds word counts while en_list holds
# character lengths, so the two sides use different units -- confirm this is intended.
gale_church.align_texts(source_blocks=en_reorder_w_list, target_blocks=en_list)

[[(0, 0), (0, 1), (1, 2), (2, 3)],
 [(0, 0), (1, 1), (2, 2), (2, 3)],
 [(0, 0), (0, 1), (1, 2), (1, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)],
 [(0, 0), (1, 1), (2, 2), (3, 3)]]

In [33]:
# Show the first block of the reordered and original English texts with every
# character length multiplied by 10.
for block_lengths in (en_reorder_list[0], en_list[0]):
    print([10 * length for length in block_lengths])

[870, 490, 510]
[250, 610, 510, 490]


In [34]:
# Align the first block only, with both sides' character lengths multiplied by 10.
gale_church.align_blocks(
    source_sents_lens=[10 * length for length in en_reorder_list[0]],
    target_sents_lens=[10 * length for length in en_list[0]],
)

[(0, 0), (0, 1), (1, 2), (2, 3)]

In [55]:
# Score every 1-to-1 pairing (sentence k of an English block against sentence k
# of the Chinese block at the same position) using character lengths.
#
# gale_church.align_log_prob(i, j, source, target, alignment, params):
#   * works on one block (one list of sentence lengths per side), not the full text;
#   * i and j are *end* offsets: for a (1, 1) alignment it reads source[i - 1]
#     and target[j - 1], so sentence k has to be addressed as k + 1 (passing k
#     scores the previous sentence, and 0 wraps around to the last one);
#   * (1, 1) means one source sentence aligned with one target sentence.
# The result is a negative log probability: 0 would mean probability 1, so
# values closer to 0 indicate a more plausible pairing. int() truncates the
# score to keep the table compact.
log_list = []

for block_idx, en_block in enumerate(en_list):
    zh_block = zh_list[block_idx]
    log_list.append([
        int(gale_church.align_log_prob(
            sent_idx + 1, sent_idx + 1, en_block, zh_block, (1, 1), gale_church.LanguageIndependent
        ))
        for sent_idx in range(len(en_block))
    ])

log_list

[[1, 3, 4, 4],
 [5, 2, 2, 3],
 [2, 4, 3, 2],
 [1, 2, 1, 3],
 [5, 2, 3, 3],
 [3, 4, 3, 2],
 [3, 2, 4, 3],
 [2, 4, 4, 3]]

In [58]:
# Score every 1-to-1 pairing (sentence k of an English block against sentence k
# of the Chinese block at the same position) using token counts.
#
# gale_church.align_log_prob(i, j, source, target, alignment, params):
#   * works on one block (one list of sentence lengths per side), not the full text;
#   * i and j are *end* offsets: for a (1, 1) alignment it reads source[i - 1]
#     and target[j - 1], so sentence k has to be addressed as k + 1 (passing k
#     scores the previous sentence, and 0 wraps around to the last one);
#   * (1, 1) means one source sentence aligned with one target sentence.
# The result is a negative log probability: 0 would mean probability 1, so
# values closer to 0 indicate a more plausible pairing. Scores are multiplied
# by 10 before int() truncation so that small values remain distinguishable.
log_list = []

for block_idx, en_block in enumerate(en_w_list):
    zh_block = zh_w_list[block_idx]
    log_list.append([
        int(gale_church.align_log_prob(
            sent_idx + 1, sent_idx + 1, en_block, zh_block, (1, 1), gale_church.LanguageIndependent
        ) * 10)
        for sent_idx in range(len(en_block))
    ])

log_list

[[5, 2, 3, 3],
 [2, 1, 3, 2],
 [3, 1, 5, 2],
 [3, 9, 4, 2],
 [2, 1, 4, 2],
 [1, 6, 6, 6],
 [1, 6, 1, 1],
 [3, 1, 5, 2]]

In [17]:
# Dump the raw English text so the block structure can be checked by eye.
with open(en_file, mode='r', encoding='utf-8') as en_handle:
    en_contents = en_handle.read()
    print(en_contents)

My heart – when it stops,
I’m sure that this world – I think I’ll have fully enjoyed it
As for what I’ll leave behind, pretty much nothing;
At your side, I think I just want to keep smiling

Through the pounding in my chest, I still want to protect you
As a reason to live, that’s fine by me
One more, one more – I count the same tears,
And once again, we know each other

My throbbing pulse conveys them,
The recurring sounds and my running thoughts
Let us promise to be apart no longer,
So that you should never be lonely

My heart – in one minute,
Seventy times, it shouts “I live”
But when I’m with you, it runs fast,
And one hundred ten times, it shouts “I love”

Through the pounding in my chest, I still want to protect you
As a reason to live, that’s fine by me
Once more, once more – the same heart repeats,
And once again, we know each other

The meetings between you and I:
If there had to be some reason for them,
While I don’t know if it would be fate,
Their sheer happiness is unchangin

In [19]:
# Dump the raw Chinese text so the block structure can be checked by eye.
with open(zh_file, mode='r', encoding='utf-8') as zh_handle:
    zh_contents = zh_handle.read()
    print(zh_contents)

在我的心脏，停下的时候呢
我一定是觉得已经，充分享受过这个世界才结束的吧
彷佛没做完的事，几乎都没有般
希望能在你身旁，一直笑着

仍然想在这颗心跳动的时间内守护你
只要以那件事为生存意义就好了
「再一个、再一个」的数着相同的眼泪
我们又再度了解了彼此

巨大的跳动声传达来的
重叠的声响与流泄的思念
约定再也不要分开吧
希望无论何时都不要让你寂寞

我的心脏，在一分钟内呢
会喊出70次的，「我正活着」
但是和你在一起时，就会稍微加快脚步
喊出110次的，「我爱你」

仍然想在这颗心跳动的时间内守护你
只要以那件事为生存意义就好了
「再一次、再一次」的重迭相同的心意
我们又再度了解了彼此

如果我和你的相遇
是有什么理由的话
就算不知道是不是命运
那份喜悦也是不会改变的喔

直到某天你放弃我为止
你还会说出多少次「喜欢」呢？
去感谢能身在这里的这件事吧
就为了活着这件事而感谢吧

巨大的跳动声传达来的
重叠的声响与流泄的思念
约定一直相爱下去吧
直到心跳停止为止
