In [12]:
## Word2Vec on directly translated lyrics
## https://stackoverflow.com/questions/22129943/how-to-calculate-the-sentence-similarity-using-word2vec-model-of-gensim-with-pyt
## https://datascience.stackexchange.com/questions/23969/sentence-similarity-prediction

import pandas as pd
import numpy as np
import nltk
import jieba
import multiprocessing
from gensim.models import Word2Vec
#from gensim.models import KeyedVectors



In [45]:
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [96]:
# Input lyric files, one per language (zh = Chinese text, en = English text).
zh_file = 'shinpakusuu_zh.txt'
en_file = 'shinpakusuu_en.txt'

def read_f(x_file):
    """Return the non-empty lines of a UTF-8 text file, in file order.

    Line terminators are stripped. Lines that are completely empty are
    dropped; whitespace-only lines are kept as they are.
    """
    with open(x_file, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return [line for line in content.splitlines() if line]
    
# Non-empty lines of each lyric file, as returned by read_f.
en_list = read_f(en_file)

zh_list = read_f(zh_file)

In [132]:
# English
# 1. Tokenize Sentence -> Words
# 2. Remove punctuation and stopwords
# 3. Stemming Words

import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize  # splits by contractions which I don't like
from string import punctuation


def _is_punctuation(token, punct_chars):
    """Return True when every character of `token` is in `punct_chars`.

    A plain `token in punctuation_string` is a substring test: a
    multi-character token is only matched when it happens to appear
    contiguously in that string, so a token such as '...' would slip through.
    Checking character by character drops every pure-punctuation token
    (and, like the substring test, the empty string).
    """
    return all(ch in punct_chars for ch in token)


en_stopwords = stopwords.words("english")
stemmer = PorterStemmer()
# ASCII punctuation plus the typographic dash/quotes added here.
punctuation = punctuation + '–’“”'

# Sets give constant-time membership tests inside the per-token loops below.
en_stopword_set = set(en_stopwords)
en_punct_chars = set(punctuation)

en_list_token = [word_tokenize(s) for s in en_list]
# Stopwords are matched on the lower-cased token; survivors are lower-cased and stemmed.
en_list_proc = [
    [
        stemmer.stem(w.lower())
        for w in s
        if w.lower() not in en_stopword_set and not _is_punctuation(w, en_punct_chars)
    ]
    for s in en_list_token
]

# Chinese
# 1. Segmentation
# 2. Remove punctuation and stopwords

with open('zh_stopwords.txt', 'r', encoding='utf-8') as file:
    zh_stopwords_raw = file.read()
# Replace each run of ASCII letters/spaces that ends a line with a comma, drop
# the remaining newlines, then split on commas and discard empty entries.
# NOTE(review): presumably this strips English labels from the stopword file;
# confirm against the layout of zh_stopwords.txt.
zh_stopwords_raw = re.sub('[ A-Za-z]+\n', ',', zh_stopwords_raw)
zh_stopwords = [w for w in zh_stopwords_raw.replace('\n', '').split(',') if w]
# Extend the punctuation characters with full-width / CJK marks for the Chinese pass.
punctuation = punctuation + '，「」。！？《》【】、'

zh_stopword_set = set(zh_stopwords)
zh_punct_chars = set(punctuation)

zh_list_proc = [
    [
        w
        for w in jieba.cut(s)
        if w not in zh_stopword_set and not _is_punctuation(w, zh_punct_chars)
    ]
    for s in zh_list
]


In [150]:
zh_list_proc

[['心脏', '停下'],
 ['一定', '觉得', '已经', '充分', '享受', '世界', '结束'],
 ['彷佛', '没', '做', '完', '事', '几乎', '没有', '般'],
 ['希望', '身旁', '一直', '笑'],
 ['仍然', '想', '这颗', '心', '跳动', '时间', '守护'],
 ['那件事', '生存', '意义'],
 ['一个', '一个', '数着', '相同', '眼泪'],
 ['再度', '了解', '彼此'],
 ['巨大', '跳动', '声', '传达'],
 ['重叠', '声响', '流泄', '思念'],
 ['约定', '再也', '不要', '分开'],
 ['希望', '不要', '寂寞'],
 ['心脏', '一分钟'],
 ['会', '喊', '出', '70', '次', '正', '活着'],
 ['一起', '时', '会', '稍微', '加快脚步'],
 ['喊出', '110', '次', '我爱你'],
 ['仍然', '想', '这颗', '心', '跳动', '时间', '守护'],
 ['那件事', '生存', '意义'],
 ['一次', '一次', '重迭', '相同', '心意'],
 ['再度', '了解', '彼此'],
 ['相遇'],
 ['理由'],
 ['不', '知道', '是不是', '命运'],
 ['份', '喜悦', '不会', '改变', '喔'],
 ['某天', '放弃'],
 ['会', '说出', '次', '喜欢'],
 ['感谢', '能身', '这件', '事'],
 ['活着', '这件', '事', '感谢'],
 ['巨大', '跳动', '声', '传达'],
 ['重叠', '声响', '流泄', '思念'],
 ['约定', '一直', '相爱', '下去'],
 ['心跳', '停止']]

In [133]:
en_list_proc

[['heart', 'stop'],
 ['sure', 'world', 'think', 'fulli', 'enjoy'],
 ['leav', 'behind', 'pretti', 'much', 'noth'],
 ['side', 'think', 'want', 'keep', 'smile'],
 ['pound', 'chest', 'still', 'want', 'protect'],
 ['reason', 'live', 'fine'],
 ['one', 'one', 'count', 'tear'],
 ['know'],
 ['throb', 'puls', 'convey'],
 ['recur', 'sound', 'run', 'thought'],
 ['let', 'us', 'promis', 'apart', 'longer'],
 ['never', 'lone'],
 ['heart', 'one', 'minut'],
 ['seventi', 'time', 'shout', 'live'],
 ['run', 'fast'],
 ['one', 'hundr', 'ten', 'time', 'shout', 'love'],
 ['pound', 'chest', 'still', 'want', 'protect'],
 ['reason', 'live', 'fine'],
 ['heart', 'repeat'],
 ['know'],
 ['meet'],
 ['reason'],
 ['know', 'would', 'fate'],
 ['sheer', 'happi', 'unchang'],
 ['someday', 'end'],
 ['mani', 'love', 'utter'],
 ['offer', 'gratitud'],
 ['simpli', 'aliv', 'thank'],
 ['throb', 'puls', 'convey'],
 ['recur', 'sound', 'run', 'thought'],
 ['let', 'us', 'promis', 'keep', 'love'],
 ['heartbeat', 'stop']]

In [176]:
# Train a 2-dimensional Word2Vec model on the preprocessed English sentences.
# min_count=1 keeps every token, including those that occur only once.
# seed + workers=1: gensim documents that multi-threaded training is not
# reproducible from run to run; a single worker with a fixed seed is (given a
# fixed PYTHONHASHSEED). seed=1 is gensim's default, so only the thread count
# actually changes.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it `vector_size`.
en_model = Word2Vec(en_list_proc, min_count=1, size=2, seed=1, workers=1)

In [177]:
en_model.wv.most_similar(positive=["heart"])

[('keep', 0.9999463558197021),
 ('protect', 0.9996652007102966),
 ('leav', 0.9882749319076538),
 ('simpli', 0.9769353866577148),
 ('know', 0.9744161367416382),
 ('tear', 0.9601825475692749),
 ('longer', 0.9531025886535645),
 ('lone', 0.9482792019844055),
 ('chest', 0.9168407917022705),
 ('think', 0.8970745801925659)]

In [178]:
# Train a 2-dimensional Word2Vec model on the preprocessed Chinese sentences.
# min_count=1 keeps every token, including those that occur only once.
# seed + workers=1: gensim documents that multi-threaded training is not
# reproducible from run to run; a single worker with a fixed seed is (given a
# fixed PYTHONHASHSEED). seed=1 is gensim's default, so only the thread count
# actually changes.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it `vector_size`.
zh_model = Word2Vec(zh_list_proc, min_count=1, size=2, seed=1, workers=1)

In [179]:
zh_model.wv.most_similar(positive=["心脏"])

[('觉得', 0.9999840259552002),
 ('再度', 0.9999455213546753),
 ('出', 0.9998087882995605),
 ('放弃', 0.9988545775413513),
 ('一个', 0.9917483329772949),
 ('做', 0.9864947199821472),
 ('思念', 0.9840846061706543),
 ('寂寞', 0.9760681390762329),
 ('说出', 0.9745391607284546),
 ('一次', 0.9745294451713562)]

In [None]:
## Feature idea: number of times a word appears in the document
# E.g. a word appears once in a sentence and once in the aligned sentence of the other language


In [181]:
en_model.wv.similarity('heartbeat', 'heart') 

0.39296654

In [193]:
en_model.wv['heart']+en_model.wv['stop']

array([ 0.13860422, -0.20425732], dtype=float32)

In [188]:
zh_model.wv['心脏']+zh_model.wv['停下']

array([0.16470802, 0.07603731], dtype=float32)

In [198]:
en_list_proc[0]

['heart', 'stop']

In [241]:
def sum_vector(model,sentence_list):
    """Return the element-wise sum of the word vectors of one tokenised sentence.

    Parameters
    ----------
    model
        Object exposing `model.wv[word]` vector lookups and
        `model.wv.vector_size` (the notebook passes trained Word2Vec models).
    sentence_list
        Iterable of tokens belonging to one sentence.

    Returns
    -------
    numpy.ndarray
        Sum of the looked-up vectors. For an empty sentence a float32 zero
        vector of length `model.wv.vector_size` is returned; previously this
        case failed with UnboundLocalError.

    Any exception raised by `model.wv[word]` for an unknown token propagates
    unchanged.
    """
    vectors = [model.wv[word] for word in sentence_list]
    if not vectors:
        # Preprocessing can strip every token from a line; the sum of nothing
        # is the zero vector.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.sum(vectors, axis=0)

def cosine_similarity(v1,v2):
    """Return the cosine of the angle between two 1-D vectors.

    Computed as dot(v1, v2) / (||v1|| * ||v2||). When either vector has zero
    length the cosine is undefined; 0.0 is returned in that case instead of
    dividing by zero, which would produce nan and a RuntimeWarning.
    """
    norm_product = np.sqrt(np.dot(v1, v1)) * np.sqrt(np.dot(v2, v2))
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

        
def oof(n, threshold=0.5):
    """Score the first `n` aligned English/Chinese lines and print one row per line.

    For each index i in range(n) the summed word vectors of `en_list_proc[i]`
    (looked up in `en_model`) and `zh_list_proc[i]` (looked up in `zh_model`)
    are compared with `cosine_similarity`. A line counts as a matched pair
    when its similarity is strictly greater than `threshold`.

    Parameters
    ----------
    n : int
        Number of leading line pairs to evaluate.
    threshold : float, optional
        Similarity above which a line is reported as 'YES' (default 0.5).

    Returns
    -------
    tuple
        (index of the most similar line, its similarity,
        'Success Rate:<pct>%') where <pct> is the percentage of the n lines
        above the threshold. For n <= 0 nothing is evaluated and
        (None, None, 'Success Rate:0.0%') is returned; previously this case
        failed with UnboundLocalError.

    NOTE(review): `en_model` and `zh_model` are trained separately on their own
    corpora, so their vector spaces are not aligned with each other; the
    cross-language cosine computed here may not be a meaningful similarity.
    """
    if n <= 0:
        return None, None, 'Success Rate:0.0%'

    matched = 0
    most = None
    s_index = None
    for i in range(n):
        this_value = cosine_similarity(
            sum_vector(en_model, en_list_proc[i]),
            sum_vector(zh_model, zh_list_proc[i]),
        )
        # `<=` means that on a tie the later line wins.
        if most is None or most <= this_value:
            most = this_value
            s_index = i

        if this_value > threshold:
            verdict = 'YES'
            matched += 1
        else:
            verdict = 'NO'

        print('Line:',i,'\tSim:',round(this_value,2), '\tPair:', verdict)

    return s_index,most, 'Success Rate:'+str(round(matched/n*100,1))+'%'
            

    
oof(30)

Line: 0 	Sim: 0.16 	Pair: NO
Line: 1 	Sim: -0.96 	Pair: NO
Line: 2 	Sim: 0.27 	Pair: NO
Line: 3 	Sim: -0.27 	Pair: NO
Line: 4 	Sim: 0.98 	Pair: YES
Line: 5 	Sim: 0.98 	Pair: YES
Line: 6 	Sim: -0.42 	Pair: NO
Line: 7 	Sim: -0.05 	Pair: NO
Line: 8 	Sim: 0.57 	Pair: YES
Line: 9 	Sim: 0.48 	Pair: NO
Line: 10 	Sim: -0.33 	Pair: NO
Line: 11 	Sim: -0.68 	Pair: NO
Line: 12 	Sim: -0.21 	Pair: NO
Line: 13 	Sim: 0.97 	Pair: YES
Line: 14 	Sim: 0.99 	Pair: YES
Line: 15 	Sim: 0.22 	Pair: NO
Line: 16 	Sim: 0.98 	Pair: YES
Line: 17 	Sim: 0.98 	Pair: YES
Line: 18 	Sim: -0.16 	Pair: NO
Line: 19 	Sim: -0.05 	Pair: NO
Line: 20 	Sim: 0.99 	Pair: YES
Line: 21 	Sim: 0.82 	Pair: YES
Line: 22 	Sim: -0.9 	Pair: NO
Line: 23 	Sim: -1.0 	Pair: NO
Line: 24 	Sim: -0.99 	Pair: NO
Line: 25 	Sim: -0.29 	Pair: NO
Line: 26 	Sim: -0.99 	Pair: NO
Line: 27 	Sim: -0.98 	Pair: NO
Line: 28 	Sim: 0.57 	Pair: YES
Line: 29 	Sim: 0.48 	Pair: NO


(20, 0.9901083, 'Success Rate:33.3%')