In [1]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
import string

from nltk.translate.chrf_score import sentence_chrf
from sklearn import preprocessing 
import torch
import warnings
from scipy.stats import kendalltau, pearsonr, spearmanr

In [2]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk.translate.gleu_score as gleu
from nltk.translate.meteor_score import meteor_score
from nltk.translate.nist_score import sentence_nist, corpus_nist
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
from nltk import RegexpTokenizer

In [3]:
list_of_names = ['cs_en', 'de_en', 'en_fi', 'en_zh', 'ru_en', 'zh_en', 'en_fi', 'en_zh']

In [4]:
cs_en = pd.read_csv("corpus/cs-en/scores.csv")
de_en = pd.read_csv("corpus/de-en/scores.csv")
en_fi = pd.read_csv("corpus/en-fi/scores.csv")
en_zh = pd.read_csv("corpus/en-zh/scores.csv")
ru_en = pd.read_csv("corpus/ru-en/scores.csv")
zh_en = pd.read_csv("corpus/zh-en/scores.csv")

data_to_eng = [cs_en, de_en, ru_en, zh_en]
data_from_eng = [en_fi, en_zh]

In [5]:
cs_en = pd.DataFrame(data=cs_en,columns=['reference','translation','z-score'])
de_en = pd.DataFrame(data=de_en,columns=['reference','translation','z-score'])
en_fi = pd.DataFrame(data=en_fi,columns=['reference','translation','z-score'])
en_zh = pd.DataFrame(data=en_zh,columns=['reference','translation','z-score'])
ru_en = pd.DataFrame(data=ru_en,columns=['reference','translation','z-score'])
zh_en = pd.DataFrame(data=zh_en,columns=['reference','translation','z-score'])

In [6]:
def load_dataset():
    # dataframes provided with the corresponding language translations

    # assign dataset names
    list_of_names = ['cs-en', 'de-en', 'ru-en', 'zh-en', 'en-fi', 'en-zh']

    # create empty list
    dataframes_list = []
  
    # append datasets into teh list
    for i in range(len(list_of_names)):

        temp_df = pd.read_csv("corpus/"+ list_of_names[i]+"/scores.csv")

        dataframes_list.append(temp_df)

    return dataframes_list

In [7]:
def remove_empty(df):
    df = df.replace(r'^\s*$', np.NaN, regex=True)
    df = df.dropna()
    return df

## Preprocessing

In [8]:
stop_en = set(stopwords.words('english'))
stop_fi = set(stopwords.words('finnish'))
# stop_zh = chinese library needed 
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer('english')

def clean(text_list, lemmatize=False, stemmer=False, punctuation = True, stop_words=False, stop = stop_en):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        text = text.lower()
        
        #REMOVE NUMERICAL DATA AND PUNCTUATION
        if punctuation:
            text = re.sub("[^a-zA-Z]", ' ', text)
        
        #REMOVE TAGS (HTML)
        text = BeautifulSoup(text).get_text()
        
        #REMOVE STOP WORDS - not needed 
        if stop_words:
            text = " ".join([word for word in text.split() if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def update_df(dataframe, list_updated, column):
    dataframe.update(pd.DataFrame({column: list_updated}))
    
#updates = clean(df["translation"], lemmatize = False, stemmer = False)
#update_df(df, updates, "translation")

def number_token(text):
    """
    Function that receives a string of text and returns the string with 
    the cost formats within it substituted by the token #COST
    """
    tokenized_text = re.sub('(\d+|\d+.\d+)(| )','##',text)
        
    return tokenized_text

def total_word_freq(text_list):
    """
    Function that receives a list of strings and returns the frequency of each word
    in the set of all strings.
    """
    words_in_df = ' '.join(text_list).split()
    # Count all words 
    freq = pd.Series(words_in_df).value_counts()
    return freq

# Fetch wordcount for each abstract
def word_count(df):
    word_count_ref  = df['reference'].apply(lambda x: len(str(x).split(" ")))
    word_count_tra  = df['translation'].apply(lambda x: len(str(x).split(" ")))
    df['word_count_ref'] = word_count_ref
    df['word_count_tra'] = word_count_tra

In [9]:
def number_token(df):

    def transform_number(text):
        """
        Function that receives a string of text and returns the string with 
        the cost formats within it substituted by the token #COST
        """
        tokenized_text = re.sub('(\d+|\d+.\d+)(| )','##',text)
            
        return tokenized_text

    df["reference"] = [transform_number(x) for x in df["reference"]]
    df["translation"] = [transform_number(x) for x in df["translation"]]

def tokenize(df):
    df['reference_token'] = [[x.split()] for x in df['reference']]
    df['translation_token'] = [x.split() for x in df['translation']]
    return df

In [10]:
language_list = load_dataset()

In [11]:
preprocess_config = {
        'lemmatize': False,
        'stemmer': False,
        'punctuation': True,
        'stop_words': False,
        'stop': stop_en
        # lowercase
        # remove punctuation
        }

In [12]:
def wer(translation, reference, print_matrix=False):
    N = len(translation)
    M = len(reference)
    L = np.zeros((N, M))
    for i in range(0, N):
        for j in range(0, M):
            if min(i, j) == 0:
                L[i, j] = max(i, j)
            else:
                deletion = L[i - 1, j] + 1
                insertion = L[i, j - 1] + 1
                sub = 1 if translation[i] != reference[j] else 0
                substitution = L[i - 1, j - 1] + sub
                L[i, j] = min(deletion, min(insertion, substitution))
                # print("{} - {}: del {} ins {} sub {} s {}".format(hyp[i], ref[j], deletion, insertion, substitution, sub))
    if print_matrix:
        print("WER matrix ({}x{}): ".format(N, M))
        print(L)
    return int(L[N - 1, M - 1])

In [14]:
def run_models(df, name):
    # get word count for each of reference and translation
    word_count(df)

    # apply baseline bleu model
    baseline_bleu(df)

    # apply sacre bleu
    sacre_bleu(df)

    # apply NIST model
    nist(df)

    # apply the rouge model
    rouge_1(df)

    # apply the bleu-rouge f1
    bleu_rouge(df)

    # apply meteor model
    meteor(df)

    # apply charF
    charf(df)

    # apply word embedding
    #run_word_embedding(df, name)

    return df

def evaluate_models(df):  # TODO for laser
    model_list = ['bleu', 'sacre_bleu', 'rouge', 'bleu_rouge', 'meteor', 'charf']
    correl_df = pd.DataFrame()
    # set indices
    for model in model_list:
        reg = RegressionReport()
        correl_df[model] = reg.compute(df[model], df['z-score'])

    return correl_df

In [15]:
class RegressionReport:
    def __init__(self):
        super().__init__()
        self.metrics = [Pearson(), Kendall(), Spearman()]

    def compute(self, x: np.array, y: np.array) -> float:
        """Computes Kendall correlation.
        :param x: predicted scores.
        :param x: ground truth scores.
        :return: Kendall Tau correlation value.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return {metric.name: metric.compute(x, y) for metric in self.metrics}
        
class Kendall:
    def __init__(self):
        self.name = "kendall"

    def compute(self, x: np.array, y: np.array) -> float:
        """Computes Kendall correlation.
        :param x: predicted scores.
        :param x: ground truth scores.
        :return: Kendall Tau correlation value.
        """
        return torch.tensor(kendalltau(x, y)[0], dtype=torch.float32)


class Pearson:
    def __init__(self):
        self.name = "pearson"

    def compute(self, x: np.array, y: np.array) -> torch.Tensor:
        """Computes Pearson correlation.
        :param x: predicted scores.
        :param x: ground truth scores.
        :return: Pearson correlation value.
        """
        return torch.tensor(pearsonr(x, y)[0], dtype=torch.float32)


class Spearman:
    def __init__(self):
        self.name = "spearman"

    def compute(self, x: np.array, y: np.array) -> float:
        """Computes Spearman correlation.
        :param x: predicted scores.
        :param x: ground truth scores.
        Return:
            - Spearman correlation value.
        """
        return torch.tensor(spearmanr(x, y)[0], dtype=torch.float32)

In [None]:
final_df=[]
correlations = []
    
list_of_names = ['cs-en', 'de-en', 'ru-en', 'zh-en', 'en-fi', 'en-zh']

language_list_to_en = language_list[:-2]

for name, df in enumerate(language_list_to_en):

    df_size = df.shape[0]
    print("Cleaning " + list_of_names[name])

    updates = clean(df["reference"], lemmatize=preprocess_config['lemmatize'], stemmer=preprocess_config['stemmer'], stop_words=preprocess_config['stop_words'], stop=preprocess_config['stop'])
    update_df(df, updates, "reference")

    updates = clean(df["translation"], lemmatize=preprocess_config['lemmatize'], stemmer=preprocess_config['stemmer'], stop_words=preprocess_config['stop_words'], stop=preprocess_config['stop'])
    update_df(df, updates, "translation")

    df = remove_empty(df)
    print(df.shape[0]/df_size)

    number_token(df)
    df = tokenize(df)

    print("Running models for " + list_of_names[name])
    final_df.append(run_models(df, list_of_names[name]))
    
    model_list = ['bleu', 'sacre_bleu', 'rouge', 'bleu_rouge', 'meteor', 'charf']

    for i in model_list:
        print(kendalltau(de_en['z-score'], de_en[i]))
        print(pearsonr(de_en['z-score'], de_en[i]))


In [None]:
updates = clean(cs_en["reference"], lemmatize=preprocess_config['lemmatize'], stemmer=preprocess_config['stemmer'], stop_words=preprocess_config['stop_words'], stop=preprocess_config['stop'])
update_df(cs_en, updates, "reference")

updates = clean(cs_en["translation"], lemmatize=preprocess_config['lemmatize'], stemmer=preprocess_config['stemmer'], stop_words=preprocess_config['stop_words'], stop=preprocess_config['stop'])
update_df(cs_en, updates, "translation")

cs_en = remove_empty(cs_en)

number_token(cs_en)
cs_en = tokenize(cs_en)


In [None]:
correlations.append(evaluate_models(de_en))

In [None]:
de_en.isna().sum()

In [16]:
de_en

Unnamed: 0,reference,translation,z-score
0,Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024
1,He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.903800
2,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503
3,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572
4,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909
...,...,...,...
21699,"Lt. Cmdr. Patrick Evans, a press officer at th...","Lt. Cmdr. Patrick Evans, a Pentagon spokesman,...",1.246459
21700,"""To give an example: If I ask him something th...","""To give an example: If I ask him what happene...",0.792878
21701,One reason that not all neighbours view this a...,One reason for not all neighbours seeing this ...,0.597068
21702,Profit before interest and tax increased from ...,Profits before interest and taxes increased fr...,-0.305719


In [None]:
x_tr, y_tr = train['text'].values, train['label'].values

In [None]:
def run_NN(df):

    model = Sequential()

    # embedding layer
    model.add(Embedding(size_of_vocabulary, 300, input_length=100, trainable=True))

    # lstm layer
    model.add(LSTM(128, return_sequences=True, dropout=0.2))

    # Global Maxpooling
    model.add(GlobalMaxPooling1D())

    # Dense Layer
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Add loss function, metrics, optimizer
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

    # Adding callbacks
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)

    # Print summary of model
    #print(model.summary())

    history = model.fit(np.array(x_tr_seq), np.array(y_tr), batch_size=128, epochs=10,
                        validation_data=(np.array(x_val_seq), np.array(y_val)), verbose=1, callbacks=[es, mc])

    model = load_model('best_model.h5')

    # evaluation
    _, val_acc = model.evaluate(x_val_seq, y_val, batch_size=128)
    print(val_acc)

def learn_embedd(df):
    # Tokenize the sentences
    tokenizer = Tokenizer()

    # preparing vocabulary
    tokenizer.fit_on_texts(list(x_tr))

    # converting text into integer sequences
    x_tr_seq = tokenizer.texts_to_sequences(x_tr)
    x_val_seq = tokenizer.texts_to_sequences(x_val)

    # padding to prepare sequences of same length
    x_tr_seq = pad_sequences(x_tr_seq, maxlen=100)
    x_val_seq = pad_sequences(x_val_seq, maxlen=100)


def pretrain_embedding(df):
    # load the whole embedding into memory
    embeddings_index = dict()
    f = open('../input/glove6b/glove.6B.300d.txt')

    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    f.close()

    # create a weight matrix for words in training docs
    embedding_matrix = np.zeros((size_of_vocabulary, 300))

    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


## Bag-of-words

In [None]:
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

#all words were made lowercase by default and that the punctuation was ignored.

In [None]:
cv = CountVectorizer(max_df=0.9, binary=True)

In [None]:
X = cv.fit_transform(cs_en["reference"])

In [None]:
list(cv.vocabulary_.keys())[:10]

In [None]:
def get_top_n_grams(corpus, top_k, n):
    """
    Function that receives a list of documents (corpus) and extracts
        the top k most frequent n-grams for that corpus.
        
    :param corpus: list of texts
    :param top_k: int with the number of n-grams that we want to extract
    :param n: n gram type to be considered 
             (if n=1 extracts unigrams, if n=2 extracts bigrams, ...)
             
    :return: Returns a sorted dataframe in which the first column 
        contains the extracted ngrams and the second column contains
        the respective counts
    """
    vec = CountVectorizer(ngram_range=(n, n), max_features=2000).fit(corpus)
    
    bag_of_words = vec.transform(corpus)
    
    sum_words = bag_of_words.sum(axis=0) 
    
    words_freq = []
    for word, idx in vec.vocabulary_.items():
        words_freq.append((word, sum_words[0, idx]))
        
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    top_df = pd.DataFrame(words_freq[:top_k])
    top_df.columns = ["Ngram", "Freq"]
    return top_df

In [None]:
top_df = get_top_n_grams(cs_en["reference"], top_k=20, n=1)

In [None]:
top_df.head(10)

In [None]:
def plot_frequencies(top_df):
    """
    Function that receives a dataframe from the "get_top_n_grams" function
    and plots the frequencies in a bar plot.
    """
    x_labels = top_df["Ngram"][:30]
    y_pos = np.arange(len(x_labels))
    values = top_df["Freq"][:30]
    plt.bar(y_pos, values, align='center', alpha=0.5)
    plt.xticks(y_pos, x_labels)
    plt.ylabel('Frequencies')
    plt.title('Words')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
plot_frequencies(top_df)

In [None]:
tfidf_vectorizer = TfidfTransformer()
tfidf_vectorizer.fit(X)

In [None]:
# get feature names
feature_names = cv.get_feature_names()
 
# fetch document for which keywords needs to be extracted
doc = cs_en["reference"][53]
 
#generate tf-idf for the given document
tf_idf_vector = tfidf_vectorizer.transform(cv.transform([doc]))

In [None]:
tf_idf_vector.toarray()

In [None]:
def extract_feature_scores(feature_names, document_vector):
    """
    Function that creates a dictionary with the TF-IDF score for each feature.
    :param feature_names: list with all the feature words.
    :param document_vector: vector containing the extracted features for a specific document
    
    :return: returns a sorted dictionary "feature":"score".
    """
    feature2score = {}
    for i in range(len(feature_names)):
        feature2score[feature_names[i]] = document_vector[0][i]    
    return sorted(feature2score.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
extract_feature_scores(feature_names, tf_idf_vector.toarray())[:10]

## Models

In [None]:
# baseline bleu
from nltk.translate.bleu_score import sentence_bleu
#print('Cumulative 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
#print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
#print('Cumulative 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)))
from rouge import Rouge
from nltk.translate.nist_score import sentence_nist
from nltk.translate.chrf_score import sentence_chrf
from sklearn import preprocessing
# check pytorch version
import torch
print(torch.__version__)

In [None]:
# measures precision
def baseline_bleu(df):
    smoothie = SmoothingFunction().method1
    df['bleu'] = df.apply(lambda x: sentence_bleu(x['reference_token'], x['translation_token'], weights=(1,0,0,0), smoothing_function=smoothie), axis=1)
    return df 

def sacre_bleu(df):
    df['sacre_bleu'] = df.apply(lambda x: sacrebleu.sentence_bleu(x['reference_toke'], x['translation_token']).score, axis=1)
    df = df.dropna()
    return df 

def nist(df):
    # tokenization happens inside nist
    df['nist'] = df.apply(lambda x: sentence_nist(x['reference'], x['translation']),axis=1)
    return df 

# measures recall
def rouge_1(df):
    rouge = Rouge()
    df['rouge'] = df.apply(lambda x: rouge.get_scores(x['translation'], x['reference'], avg=True)['rouge-1']['f'],axis=1) 
    return df

def bleu_rouge(df):
    df['bleu_rouge'] = 2 * (df['bleu'] * df['rouge']) / (df['bleu'] + df['rouge'])
    df['bleu_rouge'] = df['bleu_rouge'].replace(np.nan, 0)
    return df

def meteor(df):
    df['meteor'] = df.apply(lambda x: meteor_score([x['reference']], x['translation']),axis=1)
    #If no words match during the method returns the score of 0
    return df

def charf(df):
    df['charf'] = df.apply(lambda x: sentence_chrf([x['reference']], x['translation']),axis=1)
    return df

import sacrebleu
def sacre_bleu(df):
    df['sacre_bleu'] = df.apply(lambda x: sacrebleu.corpus_bleu(x['reference'], x['translation']).score, axis=1)
    #x = df['sacre_bleu'].values.reshape(-1, 1) #returns a numpy array
    #min_max_scaler = preprocessing.MinMaxScaler()
    #x_scaled = min_max_scaler.fit_transform(x)
    #df['sacre_bleu'] = pd.DataFrame(x_scaled)
    return df 

def charf(df):
    df['charf'] = df.apply(lambda x: sentence_chrf([x['reference']], x['translation']),axis=1)
    return df

In [None]:
clean_cs = cs_en.copy()

In [None]:
rouge_1(clean_cs)

In [None]:
df['rouge'] = df.apply(lambda x: rouge.get_scores(x['translation'], x['reference'], avg=True)['rouge-1']['f'],axis=1) 

In [None]:
updates = clean(ru_en["reference"], lemmatize=False, stemmer=False, punctuation = True, stop_words=False, stop=stop_en)
update_df(ru_en, updates, "reference")

In [None]:
ru_en = ru_en.replace(r'^\s*$', np.NaN, regex=True)
ru_en = ru_en.dropna()

In [None]:
ru_en[ru_en['reference'] == '']

In [None]:
ru_en = ru_en.reset_index()

In [None]:
nist(ru_en)

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [None]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

In [None]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

In [62]:
ref_embedding = np.load('corpus/de-en/laser.reference_embeds.npy')
translation_embedding = np.load('corpus/de-en/laser.translation_embeds.npy')

In [None]:
ref_embedding.shape

In [None]:
from sklearn.metrics import roc_curve
from sklearn.base import TransformerMixin
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

def plot_roc(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    
def print_scores(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('F1 score: {:3f}'.format(f1_score(y_test, y_pred)))
    print('AUC score: {:3f}'.format(roc_auc_score(y_test, y_pred)))

lr = LogisticRegression()
print_scores(lr, X_train, y_train, X_test, y_test)
plot_roc(lr, X_test, y_test)

In [None]:
# Feed a word2vec with the ingredients
w2v = gensim.models.Word2Vec(list(data.ingredients), size=350, window=10, min_count=2, iter=20)

In [60]:
from sklearn.metrics.pairwise import euclidean_distances
def run_word_embedding(df, name):

    tokenized_corpus = []

    tokenized_corpus = [x for i, y in df['reference_token'].apply(list).iteritems() for x in y]
    tokenized_corpus = [word for sent in tokenized_corpus for word in sent]
    tokenized_corpus = [word for sent in df['translation_token'] for word in sent]
    #[tokenized_corpus.append(word) for doc in df['reference_token'] for word in doc]
    #[tokenized_corpus.append(word) for word in df['translation_token']]
    #vocabulary = {word for doc in tokenized_corpus for word in doc}

    word2idx = {w: idx for (idx, w) in enumerate(set(tokenized_corpus))}

    # load word embeddings 
    W1 = np.load('corpus/'+ str(name) +'/laser.reference_embeds.npy')
    W2 = np.load('corpus/'+ str(name) +'/laser.translation_embeds.npy')

    #training_pairs = build_word_embedding_training(tokenized_corpus, word2idx)

    #W1, W2, losses = Skip_Gram(training_pairs, word2idx, epochs=2)

    W = torch.from_numpy(W1) + torch.from_numpy(W2)
    W = (torch.t(W)/2).clone().detach()

    df['wordEmbDistance'] = get_word_embedding_distance(W, word2idx, df['reference_token'], df['translation_token'])
    print(df)
    return df

def get_word_embedding_distance(W, word2idx, reference, translation):
    distances = []
    for sent_idx in range(len(reference)):
        distances.append(apply_word_embedding_distance(W, word2idx, reference.iloc[sent_idx], translation.iloc[sent_idx]))

    return distances

def apply_word_embedding_distance(W, word2idx, sentence1, sentence2):
    distance = 0
    for word1 in sentence1[0]:
        for word2 in sentence2:
            distance += euclidean_distances([W[word2idx[word1]].numpy()], [W[word2idx[word2]].numpy()])
    return distance


In [58]:
tokenize(de_en)

Unnamed: 0,reference,translation,z-score,reference_token,translation_token
0,Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,"[[Her, timeless, pace, measures, them, when, t...","[Their, slow, speed, was, measured, by, resear..."
1,He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.903800,"[[He, said, the, areas, offer, quiet, meeting,...","[He, said, the, spaces, provided, calm, meetin..."
2,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,"[[For, businessmen, at, the, B, 27,, it's, onl...","[This, is, only, a, small, consolation, for, b..."
3,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,"[[This, ability, may, be, born, or, developed,...","[This, ability, may, be, innate,, or, may, dev..."
4,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,"[[Because, they, prefer, water, temperatures, ...","[They, generally, only, come, to, the, surface..."
...,...,...,...,...,...
21699,"Lt. Cmdr. Patrick Evans, a press officer at th...","Lt. Cmdr. Patrick Evans, a Pentagon spokesman,...",1.246459,"[[Lt., Cmdr., Patrick, Evans,, a, press, offic...","[Lt., Cmdr., Patrick, Evans,, a, Pentagon, spo..."
21700,"""To give an example: If I ask him something th...","""To give an example: If I ask him what happene...",0.792878,"[[""To, give, an, example:, If, I, ask, him, so...","[""To, give, an, example:, If, I, ask, him, wha..."
21701,One reason that not all neighbours view this a...,One reason for not all neighbours seeing this ...,0.597068,"[[One, reason, that, not, all, neighbours, vie...","[One, reason, for, not, all, neighbours, seein..."
21702,Profit before interest and tax increased from ...,Profits before interest and taxes increased fr...,-0.305719,"[[Profit, before, interest, and, tax, increase...","[Profits, before, interest, and, taxes, increa..."


In [107]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
def word_mover_distance(df):
    # remove stop words?
    model = api.load(translation_embedding)
    df['wmd'] = df.apply(lambda x: model.wmdistance([x['reference']], x['translation']), axis=1)
    return df

In [109]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')



In [110]:
sentences = de_en['translation_token'].tolist()

In [112]:
df['wmd'] = de_en.apply(lambda x: model.wmdistance([x['reference']], x['translation']), axis=1)

ModuleNotFoundError: No module named 'pyemd'

In [96]:
model = KeyedVectors.load_word2vec_format('corpus/de-en/laser.translation_embeds.npy')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 0: invalid start byte

In [98]:
model = Word2Vec(sentences, min_count=1,workers=3, window =3, sg = 1 )

In [84]:
model_2 = Word2Vec(min_count=1)
model_2.build_vocab(sentences)
total_examples = model_2.corpus_count
model = KeyedVectors.load_word2vec_format("glove.6B.300d.txt", binary=False)
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format("glove.6B.300d.txt", binary=False, lockf=1.0)
model_2.train(sentences, total_examples=total_examples, epochs=model_2.iter)

# fit a 2d PCA model to the vectors
X = model_2[model_1.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model_1.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.300d.txt'

In [71]:
word_mover_distance(de_en)

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
import torch.nn.functional as F
import tensorflow as tf


def similarity(embeddings_1, embeddings_2):
    embeddings_1 = tf.convert_to_tensor(embeddings_1, np.float32)
    embeddings_2 = tf.convert_to_tensor(embeddings_2.transpose(0,1), np.float32)
    return torch.matmul(
        embeddings_1, embeddings_2
    )

def difference(embeddings_1, embeddings_2):
    return embeddings_1 - embeddings_2



In [None]:
type(tf.convert_to_tensor(ref_embedding, np.float32))

In [None]:
type(tf.convert_to_tensor(translation_embedding.transpose(0,1), np.float32))

In [None]:
print(similarity(ref_embedding, translation_embedding))

In [None]:
ref_embedding - translation_embedding

## Evaluate

In [None]:
from matplotlib import pyplot
pyplot.scatter(clean_csen['z-score'], clean_csen['bleu'])
pyplot.show()

In [None]:
from scipy.stats import kendalltau
from scipy.stats import pearsonr
kendalltau(clean_enzh['z-score'], clean_enzh['bleu'])

In [None]:
pearsonr(clean_enzh['z-score'], clean_enzh['bleu'])

In [None]:
kendalltau(clean_csen['z-score'], clean_csen['meteor'])
pearsonr(clean_csen['z-score'], clean_csen['meteor'])

In [114]:
pip install bleurt

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement bleurt (from versions: none)
ERROR: No matching distribution found for bleurt
You should consider upgrading via the 'C:\Users\doris\anaconda3\python.exe -m pip install --upgrade pip' command.


In [115]:
from bleurt import score

ModuleNotFoundError: No module named 'bleurt'