In [1]:
## Word Embeddings ##
# LASER Embeddings pre-trained model

import pandas as pd
import numpy as np
import nltk
import jieba
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize  # splits by contractions which I don't like
from string import punctuation

from laserembeddings import Laser

In [2]:
# Fetch the NLTK resources used below:
#   'punkt' / 'punkt_tab' -> tokenizer models behind word_tokenize (newer NLTK
#                            releases look up 'punkt_tab' instead of 'punkt'),
#   'stopwords'           -> the English stopword list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import csv

zh_file = "zh-en.training.zh"
en_file = "zh-en.training.en"

# Each file is read as tab-separated "ID<TAB>sentence" lines with no header.
# Quoting is disabled so that a double quote inside a sentence is kept as a
# literal character; with the default quoting, a sentence that starts with '"'
# is parsed as a quoted CSV field and can swallow the following lines.
zh_df = pd.read_csv(zh_file, names=['ID_zh','Sentence_zh'], sep='\t', quoting=csv.QUOTE_NONE)
en_df = pd.read_csv(en_file, names=['ID_en','Sentence_en'], sep='\t', quoting=csv.QUOTE_NONE)

In [4]:
# Gold standard: one tab-separated (Chinese ID, English ID) pair per line.
pair_file = "zh-en.training.gold"
pair_df = pd.read_csv(pair_file, sep='\t', names=['ID_zh','ID_en'])

# Attach the sentence text for each side of every gold pair; inner joins drop
# any pair whose ID is missing from the corresponding sentence table.
with_zh = pair_df.merge(zh_df, how='inner', on='ID_zh')
new_df = with_zh.merge(en_df, how='inner', on='ID_en')
new_df

Unnamed: 0,ID_zh,ID_en,Sentence_zh,Sentence_en
0,zh-000000033,en-000005983,1989年以前，全球经济包含大约8亿到10亿人口。,"Until 1989, the global market encompassed betw..."
1,zh-000000231,en-000047360,今日全球面临的威胁是超民族的，因此也必须采取超民族的方式来应对。,The threats facing the world today are suprana...
2,zh-000000272,en-000027140,欧盟移民政策的硬伤还有一个不太显著的方面。,"There is another, less obvious, reason why the..."
3,zh-000000438,en-000065621,只有让民粹主义服务于自由主义改革，政府才能取得长久的利益。,Only if populism is put at the service of libe...
4,zh-000000639,en-000005169,但社会民主派必须理解为何示威的发展会独立于现有的有组织中左翼政治。,But social democrats must understand why the p...
...,...,...,...,...
1848,zh-000094590,en-000013258,事件发生后当局在尚未进行调查的情况下就匆匆掩埋了出事列车残骸。,The wrecked body of the ruined train was burie...
1849,zh-000094593,en-000061419,北方拥有丰富的自然资源，就连电力也是从北方输送到南方。,"Natural resources were abundant in the North, ..."
1850,zh-000094607,en-000039373,如果利率为3%，那么年税收额必须增加15亿美元。,"If it is 3%, the required increase in annual t..."
1851,zh-000094611,en-000003807,五年前，叙利亚北部边陲城镇享受着土耳其高速经济增长的红利。,"Five years ago, Syria’s northern border towns ..."


In [6]:
# English
# 1. Tokenize Sentence -> Words
# 2. Remove punctuation and stopwords
# 3. Stemming Words



import string

# "'s" is added on top of NLTK's English list; presumably because the tokenizer
# emits the possessive as its own token (the original note compares it to the
# Chinese "de", which is a stopword).
en_stopwords=stopwords.words("english")+["'s"]
stemmer=PorterStemmer()
# Build from string.punctuation instead of extending `punctuation` in place, so
# re-running this cell does not keep appending the same characters.
punctuation = string.punctuation +'–’“”'

def en_proc(sentence):
    """Turn an English sentence into a list of stemmed content tokens.

    Steps: tokenize with ``word_tokenize``, drop tokens whose lower-cased form
    is in ``en_stopwords``, drop tokens made up solely of punctuation
    characters, then lower-case and Porter-stem what is left.

    Parameters
    ----------
    sentence : str
        Raw English sentence.

    Returns
    -------
    list of str
        Stemmed, lower-cased tokens in their original order.
    """
    bow_list = []
    for token in word_tokenize(sentence):
        lowered = token.lower()
        if lowered in en_stopwords:
            continue
        # Test character by character: `token in punctuation` would be a
        # substring test against the punctuation string, which lets
        # multi-character punctuation tokens (e.g. "``", "''", "...") through.
        if all(ch in punctuation for ch in token):
            continue
        bow_list.append(stemmer.stem(lowered))
    return bow_list


#en_list_token = [word_tokenize(s) for s in en_list]
#en_list_proc = []
#for s in en_list_token:
   # en_list_proc.append([stemmer.stem(w.lower()) for w in s if w.lower() not in en_stopwords and w not in punctuation])

# Chinese
# 1. Segmentation
# 2. Remove punctuation and stopwords

import re

# Read the whole stopword file as one string.
with open('../zh_stopwords.txt','r', encoding='utf-8') as file:
    raw_stopwords = file.read()

# Lines consisting only of ASCII letters/spaces are replaced by a comma
# separator; the remaining newlines are then removed and the text is split on
# commas, discarding empty entries.
without_latin_lines = re.sub('[ A-Za-z]+\n', ',', raw_stopwords)
zh_stopwords = [entry for entry in without_latin_lines.replace('\n', '').split(',') if entry]

# Extend the punctuation string with full-width / CJK marks.
punctuation = punctuation + '，「」。！？《》【】、'


def zh_proc(sentence):
    """Segment a Chinese sentence with jieba and filter the resulting tokens.

    A token is discarded when it appears in ``zh_stopwords`` or when it is
    contained in the module-level ``punctuation`` string; all other tokens are
    returned in their original order.
    """
    kept_tokens = []
    for token in jieba.cut(sentence):
        if token in zh_stopwords or token in punctuation:
            continue
        kept_tokens.append(token)
    return kept_tokens



#zh_list_proc = []
#for s in zh_list:
   # zh_list_proc.append([w for w in jieba.cut(s) if w not in zh_stopwords and w not in punctuation])


In [11]:
# Create the LASER embedder once; get_vector() below reuses this module-level
# object for every sentence. NOTE(review): presumably this loads the
# pre-trained model files from disk - confirm they are installed beforehand.
laser = Laser()

In [48]:
def get_vector(sentence,lang="en",proc=True):
    """Embed a single sentence with the module-level ``laser`` model.

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    lang : str, default "en"
        Language code passed on to ``laser.embed_sentences``. Pre-processing
        exists only for "en" (``en_proc``) and "zh" (``zh_proc``).
    proc : bool, default True
        When true, the sentence is pre-processed and its tokens are re-joined
        with single spaces before embedding; when false, the raw sentence is
        embedded as-is.

    Returns
    -------
    Whatever ``laser.embed_sentences`` returns for a one-sentence input
    (presumably an array with one row per sentence - confirm against the
    laserembeddings documentation).
    """
    if proc:
        if lang == "en":
            s = ' '.join(en_proc(sentence))
        elif lang == "zh":
            s = ' '.join(zh_proc(sentence))
        else:
            # Previously `s` stayed unbound on this path and the return line
            # raised UnboundLocalError; embed the raw sentence instead.
            print('No proccessing method for this language.')
            s = sentence
    else:
        s = sentence
    return laser.embed_sentences(s, lang=[lang])


def cosine_similarity(v1,v2):
    """Return the cosine similarity of two vectors.

    Both inputs go through ``np.squeeze`` first, so arrays carrying singleton
    dimensions (for example shape ``(1, d)``) are reduced to 1-D before the
    dot products are taken. The result is ``v1 . v2 / (|v1| * |v2|)``.
    """
    a = np.squeeze(v1)
    b = np.squeeze(v2)

    norm_a = np.sqrt(np.dot(a, a))
    norm_b = np.sqrt(np.dot(b, b))

    return np.dot(a, b) / (norm_a * norm_b)

        
def test(n, x=0, proc=True, threshold=0.7):
    """Score gold sentence pairs in rows ``x`` .. ``n-1`` of ``new_df``.

    For each row the English and Chinese sentences are embedded with
    ``get_vector`` and compared with ``cosine_similarity``; one line is
    printed per row.

    Parameters
    ----------
    n : int
        Exclusive end row (positional index into ``new_df``).
    x : int, default 0
        Inclusive start row.
    proc : bool, default True
        Forwarded to ``get_vector``; set to False to embed the raw text.
    threshold : float, default 0.7
        A row counts as a detected pair when its similarity is strictly
        greater than this value.

    Returns
    -------
    tuple
        ``(index_of_highest_similarity, highest_similarity, success_rate_str)``.
        On ties the later row wins.

    Raises
    ------
    ValueError
        If ``n <= x`` (no rows to score).
    """
    if n <= x:
        raise ValueError('n must be greater than x so that at least one row is scored.')

    k = 0          # rows whose similarity exceeds the threshold
    most = None    # highest similarity seen so far
    s_index = x    # row index of `most`
    for i in range(x, n):
        row = new_df.iloc[i]
        this_value = cosine_similarity(
            get_vector(row['Sentence_en'], proc=proc),
            get_vector(row['Sentence_zh'], lang="zh", proc=proc),
        )

        # The first row seeds the running maximum with its own index
        # (the original hard-coded 0 here, which is wrong when x != 0).
        if most is None or most <= this_value:
            most = this_value
            s_index = i

        if this_value > threshold:
            j = 'YES'
            k += 1
        else:
            j = 'NO'

        print('Line:',i,'\tSim:',round(this_value,2), '\tPair:', j)

    return s_index,most, 'Success Rate:'+str(round(k/(n-x)*100,1))+'%'
            

    
# Score the first 50 gold pairs (rows 0-49) with the default pre-processing.
test(50)

Line: 0 	Sim: 0.84 	Pair: YES
Line: 1 	Sim: 0.7 	Pair: YES
Line: 2 	Sim: 0.61 	Pair: NO
Line: 3 	Sim: 0.76 	Pair: YES
Line: 4 	Sim: 0.82 	Pair: YES
Line: 5 	Sim: 0.81 	Pair: YES
Line: 6 	Sim: 0.77 	Pair: YES
Line: 7 	Sim: 0.73 	Pair: YES
Line: 8 	Sim: 0.74 	Pair: YES
Line: 9 	Sim: 0.58 	Pair: NO
Line: 10 	Sim: 0.71 	Pair: YES
Line: 11 	Sim: 0.71 	Pair: YES
Line: 12 	Sim: 0.69 	Pair: NO
Line: 13 	Sim: 0.72 	Pair: YES
Line: 14 	Sim: 0.75 	Pair: YES
Line: 15 	Sim: 0.76 	Pair: YES
Line: 16 	Sim: 0.68 	Pair: NO
Line: 17 	Sim: 0.78 	Pair: YES
Line: 18 	Sim: 0.78 	Pair: YES
Line: 19 	Sim: 0.79 	Pair: YES
Line: 20 	Sim: 0.79 	Pair: YES
Line: 21 	Sim: 0.79 	Pair: YES
Line: 22 	Sim: 0.69 	Pair: NO
Line: 23 	Sim: 0.69 	Pair: NO
Line: 24 	Sim: 0.77 	Pair: YES
Line: 25 	Sim: 0.7 	Pair: YES
Line: 26 	Sim: 0.77 	Pair: YES
Line: 27 	Sim: 0.72 	Pair: YES
Line: 28 	Sim: 0.75 	Pair: YES
Line: 29 	Sim: 0.73 	Pair: YES
Line: 30 	Sim: 0.71 	Pair: YES
Line: 31 	Sim: 0.84 	Pair: YES
Line: 32 	Sim: 0.79 	Pair:

(31, 0.843045, 'Success Rate:82.0%')

In [40]:
from random import randrange

# Sanity check on mismatched sentences: score `t` random (English row,
# Chinese row) combinations taken from different rows of new_df, which should
# fall below the 0.7 pairing threshold.
t = 50  # number of non-pairs to score
n = 0   # non-pairs scored so far

row_count = len(new_df)  # sample within the actual frame instead of a hard-coded size
if row_count < 2:
    raise ValueError('Need at least two rows in new_df to sample a non-pair.')

while t > n:
    x = randrange(row_count)
    y = randrange(row_count)
    if x == y:
        # Same row is a gold pair: redraw without counting this iteration.
        continue

    z = cosine_similarity( get_vector(new_df.iloc[x]['Sentence_en']), get_vector(new_df.iloc[y]['Sentence_zh'],lang="zh") )
    zz = 'YES' if z>0.7 else 'NO'

    print('Line EN:',x,'\tLine ZH:',y,'\tSim:',round(z,2), '\tPair:', zz)
    n+=1

Line EN: 1048 	Line ZH: 956 	Sim: 0.6 	Pair: NO
Line EN: 583 	Line ZH: 582 	Sim: 0.54 	Pair: NO
Line EN: 298 	Line ZH: 664 	Sim: 0.58 	Pair: NO
Line EN: 267 	Line ZH: 1709 	Sim: 0.5 	Pair: NO
Line EN: 1300 	Line ZH: 421 	Sim: 0.59 	Pair: NO
Line EN: 140 	Line ZH: 1055 	Sim: 0.48 	Pair: NO
Line EN: 1403 	Line ZH: 1333 	Sim: 0.57 	Pair: NO
Line EN: 1734 	Line ZH: 1255 	Sim: 0.52 	Pair: NO
Line EN: 990 	Line ZH: 1525 	Sim: 0.47 	Pair: NO
Line EN: 1106 	Line ZH: 491 	Sim: 0.49 	Pair: NO
Line EN: 1714 	Line ZH: 1173 	Sim: 0.5 	Pair: NO
Line EN: 1328 	Line ZH: 1212 	Sim: 0.54 	Pair: NO
Line EN: 66 	Line ZH: 179 	Sim: 0.55 	Pair: NO
Line EN: 1797 	Line ZH: 1196 	Sim: 0.48 	Pair: NO
Line EN: 166 	Line ZH: 772 	Sim: 0.57 	Pair: NO
Line EN: 1051 	Line ZH: 1598 	Sim: 0.6 	Pair: NO
Line EN: 506 	Line ZH: 1737 	Sim: 0.55 	Pair: NO
Line EN: 157 	Line ZH: 655 	Sim: 0.55 	Pair: NO
Line EN: 0 	Line ZH: 811 	Sim: 0.58 	Pair: NO
Line EN: 852 	Line ZH: 80 	Sim: 0.59 	Pair: NO
Line EN: 1306 	Line ZH: 789 	

In [36]:
# The 0.7 similarity requirement is chosen to exclude false positives.
# Row 9 is a problem case: its similarity is close to that of non-pair sentences.
row_9 = new_df.iloc[9]
print(row_9['Sentence_en'],'\t',row_9['Sentence_zh'])

Millions of people around the world will live, or die, depending on what these governments decide in December. 	 各国政府在11月的决定将影响数百万人的生死。


In [49]:
# Testing without processing the text (the raw sentences are embedded directly)

def test_no_proc(n, x=0, threshold=0.7):
    """Score gold sentence pairs in rows ``x`` .. ``n-1`` of ``new_df`` on raw text.

    Each row's English and Chinese sentences are embedded with
    ``get_vector(..., proc=False)`` (no stopword removal, stemming or
    segmentation) and compared with ``cosine_similarity``; one line is printed
    per row.

    Parameters
    ----------
    n : int
        Exclusive end row (positional index into ``new_df``).
    x : int, default 0
        Inclusive start row.
    threshold : float, default 0.7
        A row counts as a detected pair when its similarity is strictly
        greater than this value.

    Returns
    -------
    tuple
        ``(index_of_highest_similarity, highest_similarity, success_rate_str)``.
        On ties the later row wins.

    Raises
    ------
    ValueError
        If ``n <= x`` (no rows to score).
    """
    if n <= x:
        raise ValueError('n must be greater than x so that at least one row is scored.')

    k = 0          # rows whose similarity exceeds the threshold
    most = None    # highest similarity seen so far
    s_index = x    # row index of `most`
    for i in range(x, n):
        row = new_df.iloc[i]
        this_value = cosine_similarity(
            get_vector(row['Sentence_en'], proc=False),
            get_vector(row['Sentence_zh'], lang="zh", proc=False),
        )

        # The first row seeds the running maximum with its own index
        # (the original hard-coded 0 here, which is wrong when x != 0).
        if most is None or most <= this_value:
            most = this_value
            s_index = i

        if this_value > threshold:
            j = 'YES'
            k += 1
        else:
            j = 'NO'

        print('Line:',i,'\tSim:',round(this_value,2), '\tPair:', j)

    return s_index,most, 'Success Rate:'+str(round(k/(n-x)*100,1))+'%'
            

    
# Score the first 50 gold pairs (rows 0-49) on the raw, unprocessed sentences.
test_no_proc(50)

Line: 0 	Sim: 0.9 	Pair: YES
Line: 1 	Sim: 0.85 	Pair: YES
Line: 2 	Sim: 0.74 	Pair: YES
Line: 3 	Sim: 0.87 	Pair: YES
Line: 4 	Sim: 0.88 	Pair: YES
Line: 5 	Sim: 0.87 	Pair: YES
Line: 6 	Sim: 0.86 	Pair: YES
Line: 7 	Sim: 0.87 	Pair: YES
Line: 8 	Sim: 0.82 	Pair: YES
Line: 9 	Sim: 0.67 	Pair: NO
Line: 10 	Sim: 0.93 	Pair: YES
Line: 11 	Sim: 0.83 	Pair: YES
Line: 12 	Sim: 0.81 	Pair: YES
Line: 13 	Sim: 0.89 	Pair: YES
Line: 14 	Sim: 0.86 	Pair: YES
Line: 15 	Sim: 0.88 	Pair: YES
Line: 16 	Sim: 0.86 	Pair: YES
Line: 17 	Sim: 0.82 	Pair: YES
Line: 18 	Sim: 0.89 	Pair: YES
Line: 19 	Sim: 0.9 	Pair: YES
Line: 20 	Sim: 0.89 	Pair: YES
Line: 21 	Sim: 0.84 	Pair: YES
Line: 22 	Sim: 0.86 	Pair: YES
Line: 23 	Sim: 0.93 	Pair: YES
Line: 24 	Sim: 0.9 	Pair: YES
Line: 25 	Sim: 0.88 	Pair: YES
Line: 26 	Sim: 0.88 	Pair: YES
Line: 27 	Sim: 0.83 	Pair: YES
Line: 28 	Sim: 0.92 	Pair: YES
Line: 29 	Sim: 0.91 	Pair: YES
Line: 30 	Sim: 0.84 	Pair: YES
Line: 31 	Sim: 0.92 	Pair: YES
Line: 32 	Sim: 0.86 	P

(36, 0.9389751, 'Success Rate:98.0%')

In [46]:
from random import randrange

# Sanity check on mismatched sentences without pre-processing: score `t`
# random (English row, Chinese row) combinations taken from different rows of
# new_df, which should fall below the 0.7 pairing threshold.
t = 50  # number of non-pairs to score
n = 0   # non-pairs scored so far

row_count = len(new_df)  # sample within the actual frame instead of a hard-coded size
if row_count < 2:
    raise ValueError('Need at least two rows in new_df to sample a non-pair.')

while t > n:
    x = randrange(row_count)
    y = randrange(row_count)
    if x == y:
        # Same row is a gold pair: redraw without counting this iteration.
        continue

    z = cosine_similarity( get_vector(new_df.iloc[x]['Sentence_en'], proc=False), get_vector(new_df.iloc[y]['Sentence_zh'],lang="zh", proc=False) )
    zz = 'YES' if z>0.7 else 'NO'

    print('Line EN:',x,'\tLine ZH:',y,'\tSim:',round(z,2), '\tPair:', zz)
    n+=1

Line EN: 463 	Line ZH: 724 	Sim: 0.54 	Pair: NO
Line EN: 749 	Line ZH: 1125 	Sim: 0.52 	Pair: NO
Line EN: 996 	Line ZH: 1843 	Sim: 0.51 	Pair: NO
Line EN: 503 	Line ZH: 495 	Sim: 0.5 	Pair: NO
Line EN: 1527 	Line ZH: 963 	Sim: 0.59 	Pair: NO
Line EN: 311 	Line ZH: 186 	Sim: 0.5 	Pair: NO
Line EN: 519 	Line ZH: 997 	Sim: 0.49 	Pair: NO
Line EN: 361 	Line ZH: 1120 	Sim: 0.54 	Pair: NO
Line EN: 273 	Line ZH: 1662 	Sim: 0.53 	Pair: NO
Line EN: 624 	Line ZH: 1826 	Sim: 0.54 	Pair: NO
Line EN: 916 	Line ZH: 1138 	Sim: 0.5 	Pair: NO
Line EN: 1612 	Line ZH: 1274 	Sim: 0.52 	Pair: NO
Line EN: 712 	Line ZH: 1277 	Sim: 0.5 	Pair: NO
Line EN: 529 	Line ZH: 18 	Sim: 0.52 	Pair: NO
Line EN: 643 	Line ZH: 604 	Sim: 0.52 	Pair: NO
Line EN: 98 	Line ZH: 1669 	Sim: 0.45 	Pair: NO
Line EN: 727 	Line ZH: 773 	Sim: 0.49 	Pair: NO
Line EN: 434 	Line ZH: 648 	Sim: 0.52 	Pair: NO
Line EN: 423 	Line ZH: 1614 	Sim: 0.52 	Pair: NO
Line EN: 455 	Line ZH: 1688 	Sim: 0.55 	Pair: NO
Line EN: 1385 	Line ZH: 1040 	Sim

In [None]:
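## Conclusion: embedding the raw sentences works better than embedding the pre-processed ones (98.0% vs 82.0% of the first 50 gold pairs score above 0.7), since keeping stopwords gives the encoder more information.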