# Imports

In [1]:
import numpy as np
import os
from collections import OrderedDict
import io
import pdb
import csls as csls
from data import Language, WordDictionary
from utils import to_numpy, read_from_yaml, setup_output_dir
import evaluate as evl
import json
import logging
logger = logging.getLogger(__name__)

# Load Config File

In [2]:
# Experiment settings: the YAML config to read and the GPU index that is
# forwarded to every Language object below.
config_file = "Configs/GeoMM/ru-en.yaml"
gpu = 0
config = read_from_yaml(config_file)
base_dir = config["base_dir"]
loglevel = "INFO"
# setup_output_dir returns the run's output directory together with the
# (possibly updated) config, which replaces the one read above.
output_dir, config = setup_output_dir(base_dir, config, loglevel)
src = config["src_lang"]
tgt = config["tgt_lang"]
BASE_DIR = config["base_data_dir"]
CROSSLINGUAL = os.path.join(BASE_DIR, "crosslingual", "dictionaries")

logger.info(f"Computing between language pairs {src} - {tgt}")

# Load every language listed in the config, keyed by name in config order.
languages = OrderedDict()
for lang in config["languages"]:
    # Pop from a shallow copy: popping from `lang` itself would strip "name"
    # and "filename" out of `config["languages"]`, leaving the config
    # incomplete for anything that reads or dumps it afterwards.
    lang_kwargs = dict(lang)
    name = lang_kwargs.pop("name")
    filename = lang_kwargs.pop("filename")
    # Remaining entries are passed through as keyword arguments to Language.
    lang_obj = Language(name, gpu, **lang_kwargs)
    lang_obj.load(filename, BASE_DIR)
    languages[name] = lang_obj

# Training dictionary for the language pair (file name ends in ".0-5000.txt").
train_file = os.path.join(CROSSLINGUAL, f"{src}-{tgt}.0-5000.txt")
training_mapping = WordDictionary(
    languages[src], languages[tgt], train_file)

2018-10-21 21:20:22,100: INFO: Computing between language pairs ru - en


# Training

In [3]:
import pytorch_backend as prob
import numpy as np
from pymanopt.solvers import SteepestDescent, ConjugateGradient
import torch
from torch.autograd import Variable
from torch import Tensor

In [4]:
# Deduplicate the source (column 0) and target (column 1) entries of the
# training dictionary. The inverse indices map every dictionary row back to
# its position in the corresponding unique array.
unique_src, src_indices = np.unique(training_mapping.word_map[:, 0], return_inverse=True)
unique_tgt, tgt_indices = np.unique(training_mapping.word_map[:, 1], return_inverse=True)
# A has shape (len(unique_src), len(unique_tgt)).
# A[i, j] is 1 if unique_src[i] and unique_tgt[j] are aligned, 0 otherwise.
A = np.zeros((unique_src.shape[0], unique_tgt.shape[0]))
# One vectorized assignment marks every (source, target) pair of the
# dictionary; this replaces a Python-level loop over the index pairs.
A[src_indices, tgt_indices] = 1
# Embeddings restricted to the unique source / target entries.
Xs = languages[src].get_embeddings(unique_src)
Xt = languages[tgt].get_embeddings(unique_tgt)
A = Variable(torch.FloatTensor(A))
# A non-negative `gpu` selects the CUDA device that A is moved to.
if gpu >= 0:
    A = A.cuda(gpu)

In [5]:
# Log every training hyper-parameter before the optimisation starts.
training_params = config["training_params"]
for param, value in training_params.items():
    logger.info(f"{param}\t{value}")

# Settings consumed by the manifold problem and the solver.
lbda = training_params["lambda"]
max_opt_time = training_params["max_opt_time"]
max_opt_iter = training_params["max_opt_iter"]

# The manifold learner supplies the manifold, the cost and the Euclidean
# gradient that together define the optimisation problem.
manifold_learner = prob.GeomManifold(Xs, Xt, A, lbda, Xs.size(1), device=gpu)
problem = prob.Problem(
    manifold=manifold_learner.manifold,
    cost=manifold_learner.cost,
    egrad=manifold_learner.egrad,
)

# Conjugate-gradient solver bounded by both wall-clock time and iterations.
solver = ConjugateGradient(maxtime=max_opt_time, maxiter=max_opt_iter)
theta = solver.solve(problem)
# The solution unpacks into the three learned matrices.
Us, B, Ut = theta

2018-10-21 21:20:26,848: INFO: lambda	1000
2018-10-21 21:20:26,849: INFO: max_opt_time	5000
2018-10-21 21:20:26,850: INFO: max_opt_iter	150
Optimizing...
 iter		   cost val	    grad. norm
    0	+5.7918737500000000e+05	9.38927023e+04
    1	+5.0748725000000000e+05	7.18895624e+04
    2	+4.3028178125000000e+05	7.46343520e+04
    3	+3.3312937500000000e+05	5.28672279e+04
    4	+2.8713353125000000e+05	3.73801743e+04
    5	+2.3570350000000000e+05	3.01963540e+04
    6	+1.7714829687500000e+05	2.47519831e+04
    7	+1.1170785156250000e+05	1.46377182e+04
    8	+7.4323398437500000e+04	1.44541237e+04
    9	+6.9524593750000000e+04	8.22884803e+03
   10	+6.4452257812500000e+04	6.88220923e+03
   11	+5.7427273437500000e+04	6.28350241e+03
   12	+4.7237550781250000e+04	6.83030507e+03
   13	+3.3331140625000000e+04	3.09314734e+03
   14	+3.0336578125000000e+04	2.67918319e+03
   15	+2.5941906250000000e+04	2.16176736e+03
   16	+2.0553996093750000e+04	1.53363905e+03
   17	+1.5481535156250000e+04	9.45222321e+02
  

# Save the matrices

In [6]:
# Persist the three learned matrices as .npy files in the run's output directory.
for npy_name, learned_matrix in (("Us.npy", Us), ("B.npy", B), ("Ut.npy", Ut)):
    np.save(os.path.join(output_dir, npy_name), arr=learned_matrix)

# Transform to different spaces

In [7]:
# Decompose B with an SVD and rebuild it from the square roots of its
# singular values: b_sqrt = u . diag(sqrt(s)) . vh
u, s, vh = np.linalg.svd(B, full_matrices=True)
sqrt_singular_values = np.diag(np.sqrt(s))
b_sqrt = np.dot(u, np.dot(sqrt_singular_values, vh))

# Full embedding tables of both languages as numpy arrays.
# NOTE(review): the second argument presumably tells to_numpy that the data
# lives on the GPU — confirm against utils.to_numpy.
on_gpu = gpu >= 0
src_embeddings = to_numpy(languages[src].embeddings, on_gpu)
tgt_embeddings = to_numpy(languages[tgt].embeddings, on_gpu)

# Map each language through its own matrix (Us / Ut) and then through b_sqrt.
src_projected = np.dot(src_embeddings, Us)
tgt_projected = np.dot(tgt_embeddings, Ut)
src_transform = np.dot(src_projected, b_sqrt)
tgt_transform = np.dot(tgt_projected, b_sqrt)

# NN Evaluation using CSLS

In [8]:
# Build the CSLS object from the transformed source and target embeddings;
# `gpu` is forwarded as the device to use.
# NOTE(review): what CSLS precomputes at construction time is not visible here — see csls.py.
csls_object = csls.CSLS(src_transform, tgt_transform, gpu_device=gpu)

In [9]:
# Evaluator for the source/target language pair.
# NOTE(review): data_dir is the literal "data" rather than the configured
# BASE_DIR used when loading the languages — confirm this is intentional.
evaluator = evl.Evaluator(languages[src], languages[tgt], data_dir="data")

In [10]:
# Run the supervised evaluation with the CSLS object and keep the resulting metrics.
# NOTE(review): the second argument is an empty dict — presumably optional
# evaluation settings; confirm against Evaluator.supervised.
metrics = evaluator.supervised(csls_object, {})

2018-10-21 21:20:54,602: INFO: Using Mode: csls
2018-10-21 21:20:54,777: INFO: Total: 1500, Precision@1: 67.93, @5: 81.60, @10: 84.60


In [11]:
# Write the evaluation metrics as JSON into the run's output directory.
metrics_file = os.path.join(output_dir, "metrics.json")
logger.info(f"Writing metrics to {metrics_file}")
# The context manager guarantees the file is flushed and closed even if
# serialisation fails; a bare open() passed to json.dump is never closed
# explicitly.
with open(metrics_file, "w") as metrics_fp:
    json.dump(metrics, metrics_fp)
logger.info("Done")

2018-10-21 21:20:54,918: INFO: Writing metrics to Experiments/GeoMM/run-5/metrics.json
2018-10-21 21:20:54,931: INFO: Done


# Done