In [21]:
## Word Embeddings ##
# Word2Vec self-trained model
# might not be enough data to train a word2vec model

import pandas as pd
import numpy as np
import nltk
import jieba
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize  # splits by contractions which I don't like
from string import punctuation
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile

In [16]:
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
zh_file = "zh-en.training.zh"
en_file = "zh-en.training.en"

zh_df = pd.read_csv(zh_file, names=['ID_zh','Sentence_zh'], sep='\t')
en_df = pd.read_csv(en_file, names=['ID_en','Sentence_en'], sep='\t')

In [4]:
en_df.loc[en_df['ID_en'] == "en-000005983"]

Unnamed: 0,ID_en,Sentence_en
5955,en-000005983,"Until 1989, the global market encompassed betw..."


In [5]:
zh_df.loc[zh_df['ID_zh'] == "zh-000000231"]

Unnamed: 0,ID_zh,Sentence_zh
230,zh-000000231,今日全球面临的威胁是超民族的，因此也必须采取超民族的方式来应对。


In [6]:
pair_file = "zh-en.training.gold"

pair_df = pd.read_csv(pair_file, names=['ID_zh','ID_en'], sep='\t')

pair_df

Unnamed: 0,ID_zh,ID_en
0,zh-000000033,en-000005983
1,zh-000000231,en-000047360
2,zh-000000272,en-000027140
3,zh-000000438,en-000065621
4,zh-000000639,en-000005169
...,...,...
1894,zh-000094590,en-000013258
1895,zh-000094593,en-000061419
1896,zh-000094607,en-000039373
1897,zh-000094611,en-000003807


In [7]:
new_df = pair_df.merge(zh_df, 'inner', 'ID_zh')
new_df = new_df.merge(en_df, 'inner', 'ID_en')
new_df

Unnamed: 0,ID_zh,ID_en,Sentence_zh,Sentence_en
0,zh-000000033,en-000005983,1989年以前，全球经济包含大约8亿到10亿人口。,"Until 1989, the global market encompassed betw..."
1,zh-000000231,en-000047360,今日全球面临的威胁是超民族的，因此也必须采取超民族的方式来应对。,The threats facing the world today are suprana...
2,zh-000000272,en-000027140,欧盟移民政策的硬伤还有一个不太显著的方面。,"There is another, less obvious, reason why the..."
3,zh-000000438,en-000065621,只有让民粹主义服务于自由主义改革，政府才能取得长久的利益。,Only if populism is put at the service of libe...
4,zh-000000639,en-000005169,但社会民主派必须理解为何示威的发展会独立于现有的有组织中左翼政治。,But social democrats must understand why the p...
...,...,...,...,...
1848,zh-000094590,en-000013258,事件发生后当局在尚未进行调查的情况下就匆匆掩埋了出事列车残骸。,The wrecked body of the ruined train was burie...
1849,zh-000094593,en-000061419,北方拥有丰富的自然资源，就连电力也是从北方输送到南方。,"Natural resources were abundant in the North, ..."
1850,zh-000094607,en-000039373,如果利率为3%，那么年税收额必须增加15亿美元。,"If it is 3%, the required increase in annual t..."
1851,zh-000094611,en-000003807,五年前，叙利亚北部边陲城镇享受着土耳其高速经济增长的红利。,"Five years ago, Syria’s northern border towns ..."


In [8]:
en_df.iloc[0]['Sentence_en']

"As disclosure of emission related data as CDP's primary activity, the quality of the data reported to CDP is key."

In [10]:
# English
# 1. Tokenize Sentence -> Words
# 2. Remove punctuation and stopwords
# 3. Stemming Words



en_stopwords=stopwords.words("english")+["'s"]  #chinese de is stopword
stemmer=PorterStemmer()
punctuation = punctuation +'–’“”'

def en_proc(sentence):
    word_list = word_tokenize(sentence)
#    word_list = [w for w in word_list if w.lower() not in en_stopwords]
#    word_list = [w.translate(str.maketrans('', '', punctuation)) for w in word_list]
    bow_list = [stemmer.stem(w.lower()) for w in word_list if w.lower() not in en_stopwords and w not in punctuation]
    
    return bow_list


#en_list_token = [word_tokenize(s) for s in en_list]
#en_list_proc = []
#for s in en_list_token:
   # en_list_proc.append([stemmer.stem(w.lower()) for w in s if w.lower() not in en_stopwords and w not in punctuation])

# Chinese
# 1. Segmentation
# 2. Remove punctuation and stopwords

import re
with open('../zh_stopwords.txt','r', encoding='utf-8') as file:
    zh_stopwords = file.read()
zh_stopwords = re.sub('[ A-Za-z]+\n', ',', zh_stopwords)
zh_stopwords = zh_stopwords.translate(str.maketrans('', '', '\n')).split(',') 
zh_stopwords = list(filter(None, zh_stopwords))
punctuation = punctuation + '，「」。！？《》【】、'


def zh_proc(sentence):
    bow_list = [w for w in jieba.cut(sentence) if w not in zh_stopwords and w not in punctuation]
    return bow_list



#zh_list_proc = []
#for s in zh_list:
   # zh_list_proc.append([w for w in jieba.cut(s) if w not in zh_stopwords and w not in punctuation])


In [11]:
text = "1989年以前，全球经济包含大约8亿到10亿人口。"
zh_proc(text)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\gabri\AppData\Local\Temp\jieba.cache
Loading model cost 0.911 seconds.
Prefix dict has been built successfully.


['1989', '年', '以前', '全球', '经济', '包含', '大约', '8', '亿到', '10', '亿', '人口']

In [12]:
def gen_sentence(df,column,proc=en_proc,l=None):
    i=0
    output = []
    l= len(df.index) if not l else int(l)
    while i < l:
        output.append(proc(df.iloc[i][column]))
        i+=1
    return output

## Generator cannot pass to word2vec

In [33]:
zh_model = Word2Vec(gen_sentence(zh_df,'Sentence_zh',zh_proc), min_count=1, size=300)
zh_model.save('word2vec/zh_model.bin')

In [34]:
zh_model = Word2Vec.load('word2vec/zh_model.bin')

In [45]:
zh_vectors = zh_model.wv
zh_vec_file = get_tmpfile("zh_vectors.kv")
#zh_vectors.save(zh_vec_file)
zh_vectors = KeyedVectors.load(zh_vec_file)

zh_vectors.init_sims(replace=True) #normalize all
#zh_vectors.word_vec("法律", use_norm=True) # if didn't replace with normed vectors

In [46]:
#en_model = Word2Vec.load('word2vec/en_model.bin')
en_model = Word2Vec(gen_sentence(en_df,'Sentence_en',en_proc), min_count=1, size=300)
en_model.save('word2vec/en_model.bin')

In [51]:
en_vectors = en_model.wv
en_vec_file = get_tmpfile("en_vectors.kv")
#en_vectors.save(en_vec_file)
en_vectors = KeyedVectors.load(en_vec_file)
en_vectors.init_sims(replace=True)

In [None]:
# Using normalized vectors not much difference, still negative 
# -> might not be enough data to train a word2vec model

In [55]:
cosine_similarity(.5*(zh_vectors.word_vec("面临")+zh_vectors.word_vec("威胁")),.5*(en_vectors.word_vec("face")+en_vectors.word_vec("threat")))

-0.044774767

In [20]:
print(gen_sentence(en_df,'Sentence_en',en_proc,5))


[['disclosur', 'emiss', 'relat', 'data', 'cdp', 'primari', 'activ', 'qualiti', 'data', 'report', 'cdp', 'key'], ['2014', 'nearli', '2000', 'busi', 'report', 'climat', 'chang', 'data', 'cdp'], ['valu', 'cdp', 'report', 'investor', 'ngo', 'contest'], ['furthermor', 'qualiti', 'data', 'cdp', 'report', 'premis', 'question'], ['carbon', 'disclosur', 'project', 'cdp', 'attempt', 'sidestep', 'nation', 'interest', 'focus', 'individu', 'compani', 'rather', 'nation']]


In [82]:
en_model.wv.most_similar(positive=["data"])

[('2014,', 0.11501407623291016),
 ('premis', 0.09995618462562561),
 ('chang', 0.0796932578086853),
 ('primari', 0.07935003191232681),
 ('relat', 0.06664074957370758),
 ('valu', 0.05988314002752304),
 ('cdp.', 0.04614570736885071),
 ('compani', 0.03485778719186783),
 ('nation', 0.029505286365747452),
 ('activity,', 0.028874967247247696)]

In [17]:
## Feat: No. of counts a word appears in the document
# Eg. appear once in sentence, appear once in aligned sentence of other language


In [94]:
stemmer.stem('natural')

'natur'

In [95]:
en_model.wv.similarity('resourc', 'natur') 

0.9862863

In [87]:
zh_model.wv.most_similar(positive=['自由'])

[('法律', 0.9858250617980957),
 ('制度', 0.9773937463760376),
 ('保障', 0.9773855209350586),
 ('公民', 0.977051854133606),
 ('税金', 0.9757312536239624),
 ('恐怖活动', 0.9750471115112305),
 ('是否', 0.9735466837882996),
 ('限制', 0.9723238945007324),
 ('作出', 0.9718997478485107),
 ('人人平等', 0.9718680381774902)]

In [99]:
zh_proc(new_df.iloc[0]['Sentence_zh'])

['1989', '年', '以前', '全球', '经济', '包含', '大约', '8', '亿到', '10', '亿', '人口']

In [56]:
def sum_vector(model,sentence_list):
    i=0
    for word in sentence_list:
        if i==0:
            vector = model.wv[word]
        else:
            vector=vector+model.wv[word]
        i+=1
    return vector/(i+1)

def cosine_similarity(v1,v2):
    '''cosine_similarity(transformed_docs[2], transformed_docs[2])'''
    ## Idk why need to np.squeeze (1,148) into (148,) shape to dot product [error: shapes not aligned]
    ## toarray() [error: dimension mismatch]
    
    #print('Calculating Cosine Similarity...')
    
    #v1 = np.squeeze(v1.toarray())
    #v2 = np.squeeze(v2.toarray())
    return np.dot(v1,v2) / ( np.sqrt(np.dot(v1,v1)) * np.sqrt(np.dot(v2,v2)) )

        
def oof(n):
    k=0
    j=''
    for i in range(n):
        this_value = cosine_similarity( sum_vector(en_model,en_proc(new_df.iloc[i]['Sentence_en'])), sum_vector(zh_model,zh_proc(new_df.iloc[i]['Sentence_zh'])) )
        if i == 0:
            most = this_value
            s_index = 0
        elif most <= this_value:
            most = this_value
            s_index = i
        
        if this_value>0.5:
            j='YES'
            k+=1
        else:
            j='NO'
        
        print('Line:',i,'\tSim:',round(this_value,2), '\tPair:', j)
        
    return s_index,most, 'Success Rate:'+str(round(k/n*100,1))+'%'
            

    
oof(30)

Line: 0 	Sim: -0.06 	Pair: NO
Line: 1 	Sim: -0.04 	Pair: NO
Line: 2 	Sim: -0.03 	Pair: NO
Line: 3 	Sim: -0.05 	Pair: NO
Line: 4 	Sim: -0.05 	Pair: NO
Line: 5 	Sim: -0.04 	Pair: NO
Line: 6 	Sim: -0.1 	Pair: NO
Line: 7 	Sim: 0.01 	Pair: NO
Line: 8 	Sim: -0.02 	Pair: NO
Line: 9 	Sim: -0.04 	Pair: NO
Line: 10 	Sim: -0.05 	Pair: NO
Line: 11 	Sim: -0.04 	Pair: NO
Line: 12 	Sim: -0.04 	Pair: NO
Line: 13 	Sim: -0.04 	Pair: NO
Line: 14 	Sim: -0.07 	Pair: NO
Line: 15 	Sim: -0.04 	Pair: NO
Line: 16 	Sim: -0.05 	Pair: NO
Line: 17 	Sim: -0.05 	Pair: NO
Line: 18 	Sim: -0.07 	Pair: NO
Line: 19 	Sim: -0.07 	Pair: NO
Line: 20 	Sim: -0.03 	Pair: NO
Line: 21 	Sim: -0.05 	Pair: NO
Line: 22 	Sim: -0.03 	Pair: NO
Line: 23 	Sim: -0.08 	Pair: NO
Line: 24 	Sim: -0.04 	Pair: NO
Line: 25 	Sim: -0.04 	Pair: NO
Line: 26 	Sim: -0.07 	Pair: NO
Line: 27 	Sim: -0.05 	Pair: NO
Line: 28 	Sim: -0.03 	Pair: NO
Line: 29 	Sim: -0.05 	Pair: NO


(7, 0.006314113, 'Success Rate:0.0%')

In [103]:
print(en_proc(new_df.iloc[1]['Sentence_en']) , zh_proc(new_df.iloc[1]['Sentence_zh']))
cosine_similarity( sum_vector(en_model,en_proc(new_df.iloc[1]['Sentence_en'])), sum_vector(zh_model,zh_proc(new_df.iloc[1]['Sentence_zh'])) )

['threat', 'face', 'world', 'today', 'supranational,', 'counteract', 'must', 'supranational,', 'too.'] ['今日', '全球', '面临', '威胁', '超', '民族', '必须', '采取', '超', '民族', '方式', '应对']


-0.004908868

In [105]:
en_model.wv.similarity('threat', 'face') 

0.99819654

In [107]:
zh_model.wv.similarity('威胁', '面临') 

0.98876095

In [117]:
zh1 = zh_model.wv['今日']+zh_model.wv['全球']+zh_model.wv['面临']+zh_model.wv['威胁']+zh_model.wv['超']+zh_model.wv['民族']+zh_model.wv['必须']+zh_model.wv['采取']+zh_model.wv['超']+zh_model.wv['民族']+zh_model.wv['方式']+zh_model.wv['应对']
en1 = en_model.wv['threat']+en_model.wv['face']+en_model.wv['world']+en_model.wv['today']+en_model.wv['supranational,']+en_model.wv['counteract']+en_model.wv['must']+en_model.wv['supranational,']+en_model.wv['too.']

In [57]:
from scipy import spatial

vector1 = [1, 2, 3]
vector2 = [3, 2, 1]

cs1 = 1 - spatial.distance.cosine(vector1, vector2)
print(cs1)
cs2 = cosine_similarity(vector1,vector2)
print(cs2)

0.7142857142857143
0.7142857142857143


In [58]:
cosine_similarity(zh1,en1)

NameError: name 'zh1' is not defined

In [62]:
cosine_similarity(en_model.wv['threat']+en_model.wv['face'],zh_model.wv['面临']+zh_model.wv['威胁'])

-0.04475601

In [60]:
en_proc("I can't don't Andrew's")

['ca', "n't", "n't", 'andrew']