In [1]:
import numpy as np
import jieba
import re
import codecs
import time
import pandas as pd
from gensim.models import Word2Vec,KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize



In [2]:
#!pip install jieba
#!pip install gensim
# Reference: https://github.com/Lipairui/Text-similarity-centroid-of-the-word-vectors

In [3]:
# NOTE(review): machine-specific absolute path; point this at your local copy of the
# vectors before running on another machine.
model_path = 'C:\\Temp\\wordvectors\\GoogleNews-vectors-negative300.bin'
# Load the binary word2vec-format vectors; undecodable bytes in tokens are ignored.
model = KeyedVectors.load_word2vec_format(model_path,binary=True,unicode_errors='ignore')
# Stopword list read by doc_sim (one word per line, resolved against the working directory).
stopwords_path = 'english-stopwords.txt'
# L2-normalize all vectors in place. This replaces the gensim-3 call
# init_sims(replace=True), which is deprecated in gensim 4.
model.unit_normalize_all()

  after removing the cwd from sys.path.


In [64]:
def tokenize_doc(stopwords,doc):
    '''
    Function: preprocess a document: lowercase it, tokenize it with nltk's
        word_tokenize, then drop stopwords and non-alphabetic tokens
        (no Chinese word segmentation is performed here)
    Input:
        stopwords: iterable of stopword strings; tokens are lowercased
            before being compared against it
        doc: document string
    Output: list of words
    '''
    # Build the lookup set once; rebuilding it per token is quadratic and
    # silently breaks when stopwords is a one-shot iterator.
    stopword_set = set(stopwords)
    tokens = word_tokenize(doc.lower())
    return [tok for tok in tokens if tok not in stopword_set and tok.isalpha()]

In [65]:
def doc_vector(model,doc):
    '''
    Function:
        compute the mean of word vectors
    Input:
        model: gensim word2vec model (must expose key_to_index and
            support indexing with a list of words)
        doc: list of words
    Output:
        doc vector: element-wise mean of the vectors of the words of doc
        that are in the model vocabulary
    Raises:
        ValueError: if doc contains no in-vocabulary word
    '''
    # remove out-of-vocab words
    known = [word for word in doc if word in model.key_to_index]
    if not known:
        # Fail with a clear message instead of averaging nothing.
        raise ValueError('doc_vector: document has no in-vocabulary words')
    return np.mean(model[known],axis=0)

In [66]:
def calculate_similarity(model,doc1,doc2):
    '''
    Function:
        calculate cosine similarity of document pair
    Input: 
        model: gensim word2vec model
        doc1: list of words of document1
        doc2: list of words of document2
    Output:
        similarity of doc1 and doc2: (float)
            value ranges from 0 to 1 (1 - angle/pi, where angle is the
            angle between the two mean word vectors);
            -1 means error: at least one document has no word in the
            model vocabulary, so it cannot be embedded
    '''
    # Return the documented error sentinel instead of crashing when a
    # document has nothing to average.
    for doc in (doc1, doc2):
        if not any(word in model.key_to_index for word in doc):
            return -1.0
    vec1 = np.array(doc_vector(model,doc1)).reshape(1,-1)
    vec2 = np.array(doc_vector(model,doc2)).reshape(1,-1)
    cos = cosine_similarity(vec1,vec2)[0][0]
    # regularize value of cos to [-1,1]: rounding can push it slightly
    # outside the domain of arccos
    cos = np.clip(cos,-1.0,1.0)
    # map the angle between the vectors from [0, pi] onto a similarity in [1, 0]
    return 1-np.arccos(cos)/np.pi


In [67]:
def regularize_sim(sims):
    '''
    Function: replace illegal similarity value -1 with mean value
    Input: list of similarity of document pairs
    Output: regularized list of similarity: same length and order as the
        input, with every -1 replaced by the mean of the non -1 values
        (NaN when there is no valid value to average)
    '''
    valid = [sim for sim in sims if sim!=-1]
    # np.mean([]) emits a RuntimeWarning; with nothing valid to average the
    # fill value is NaN either way, so produce it without the warning.
    fill = np.mean(valid) if valid else np.nan
    return [fill if sim==-1 else sim for sim in sims]

In [68]:
def doc_sim(lang,docs1,docs2):
    '''
    Function:
        calculate similarity of document pairs 
    Input: 
        lang: text language-Chinese for 'cn'/ English for 'en'
            (only validated; both values go through the same
            tokenize_doc preprocessing)
        docs1:  document strings list1
        docs2: document strings list2 (same length as docs1; pairs are
            compared position by position)
    Output:
        similarity list of docs1 and docs2 pairs:
            value ranges from 0 to 1;
            -1 values reported by calculate_similarity are replaced by
            regularize_sim before returning
    Notes:
        reads the module-level `model` and `stopwords_path`
    '''
    assert lang=='cn' or lang=='en', 'Language setting is wrong'
    # check if the number of documents matched
    assert len(docs1)==len(docs2), 'Documents number is not matched!'

    # preprocess data
    # `with` guarantees the stopword file is closed even if reading fails
    with codecs.open(stopwords_path, 'r',encoding='utf-8') as stopword_file:
        stopwords = [w.strip() for w in stopword_file.readlines()]
    sims = []
    for text1, text2 in zip(docs1, docs2):
        p1 = tokenize_doc(stopwords,text1)
        p2 = tokenize_doc(stopwords,text2)
        # calculate similarity
        sims.append(calculate_similarity(model,p1,p2))
    # regularize sims
    return regularize_sim(sims)

In [69]:
# English text example: sentences are compared pairwise, position by position
reference = [
    'a speaker presents some products',
    'vegetable is being sliced.',
    'man sitting using tool at a table in his home.',
]
translation = [
    'the speaker is introducing the new products on a fair.',
    'someone is slicing a tomato with a knife on a cutting board.',
    'The president comes to China',
]

# Last expression of the cell: the list of similarities is displayed as output
doc_sim('en',reference,translation)

[0.7321356712144432, 0.6884806689162883, 0.5777315231179934]

In [21]:
# Drop a leftover `LogInfo` name if an earlier session defined it. A bare
# `del LogInfo` raises NameError on a fresh kernel, where the name does not exist.
globals().pop('LogInfo', None)

In [72]:
from baseline import Baseline

In [73]:
# NOTE(review): this call fails as written. The recorded output is a TypeError
# reporting that Baseline.__init__ requires the positional arguments 'df',
# 'language', 'model' and 'stopwords'; supply them before running this cell.
baseline = Baseline()

TypeError: __init__() missing 4 required positional arguments: 'df', 'language', 'model', and 'stopwords'