# Imports

In [1]:
import numpy as np
import os
from collections import OrderedDict
import io
import pdb
import csls as csls
from data import Language, WordDictionary
from utils import to_numpy, read_from_yaml, setup_output_dir
import evaluate as evl
import logging
logger = logging.getLogger(__name__)

# Load Config File

In [2]:
config_file = "Configs/GeoMM/en-zh.yaml"
gpu = 0
config = read_from_yaml(config_file)
base_dir = config["base_dir"]
loglevel = "INFO"
output_dir, config = setup_output_dir(base_dir, config, loglevel)
src = config["src_lang"]
tgt = config["tgt_lang"]
BASE_DIR = config["base_data_dir"]
CROSSLINGUAL = os.path.join(BASE_DIR, "crosslingual", "dictionaries")

logger.info(f"Computing between language pairs {src} - {tgt}")

languages = OrderedDict()
for lang in config["languages"]:
    name = lang.pop("name")
    filename = lang.pop("filename")
    lang_obj = Language(name, gpu, **lang)
    lang_obj.load(filename, BASE_DIR)
    languages[name] = lang_obj

train_file = os.path.join(CROSSLINGUAL, f"{src}-{tgt}.0-5000.txt")
training_mapping = WordDictionary(
    languages[src], languages[tgt], train_file)

2018-10-21 21:10:52,690: INFO: Computing between language pairs en - zh


# Training

In [3]:
import pytorch_backend as prob
import numpy as np
from pymanopt.solvers import SteepestDescent, ConjugateGradient
import torch
from torch.autograd import Variable
from torch import Tensor

In [4]:
unique_src, src_indices = np.unique(training_mapping.word_map[:, 0], return_inverse=True)
unique_tgt, tgt_indices = np.unique(training_mapping.word_map[:, 1], return_inverse=True)
A = np.zeros((unique_src.shape[0], unique_tgt.shape[0]))
for six, tix in zip(src_indices, tgt_indices):
    A[six, tix] = 1
# A : number of unique src tgt pairs.
# A[i, j] is 1 unique_src[i] and unique_tgt[j] are aligned, 0 otherwise
Xs = languages[src].get_embeddings(unique_src)
Xt = languages[tgt].get_embeddings(unique_tgt)
A = Variable(torch.FloatTensor(A))
if gpu >= 0:
    A = A.cuda(gpu)

In [5]:
training_params = config["training_params"]
for param in training_params:
    value = training_params[param]
    logger.info(f"{param}\t{value}")
lbda = training_params["lambda"]
manifold_learner = prob.GeomManifold(Xs, Xt, A, lbda, Xs.size(1), device=gpu)
problem = prob.Problem(
    manifold=manifold_learner.manifold,
    cost=manifold_learner.cost,
    egrad=manifold_learner.egrad)
max_opt_time = training_params["max_opt_time"]
max_opt_iter = training_params["max_opt_iter"]
solver = ConjugateGradient(
    maxtime=max_opt_time, maxiter=max_opt_iter)
theta = solver.solve(problem)
Us, B, Ut = theta

2018-10-21 21:11:00,945: INFO: lambda	1000
2018-10-21 21:11:00,946: INFO: max_opt_time	5000
2018-10-21 21:11:00,947: INFO: max_opt_iter	150
Optimizing...
 iter		   cost val	    grad. norm
    0	+6.3568087500000000e+05	4.67997423e+05
    1	+6.0555400000000000e+05	3.30096297e+05
    2	+5.7116712500000000e+05	2.01295336e+05
    3	+5.4059693750000000e+05	2.34044983e+05
    4	+5.1778993750000000e+05	1.30518509e+05
    5	+4.9891018750000000e+05	1.01817164e+05
    6	+4.7530212500000000e+05	1.53765406e+05
    7	+4.4971787500000000e+05	8.48920195e+04
    8	+4.2900228125000000e+05	6.67367397e+04
    9	+4.0634075000000000e+05	1.10967817e+05
   10	+3.1201625000000000e+05	1.60371593e+05
   11	+2.8461546875000000e+05	5.37248358e+04
   12	+2.7371159375000000e+05	3.68920306e+04
   13	+2.5580809375000000e+05	4.11026909e+04
   14	+2.4773520312500000e+05	3.84804492e+04
   15	+2.3480801562500000e+05	3.02012639e+04
   16	+2.2165620312500000e+05	2.81287545e+04
   17	+1.8903446875000000e+05	3.26131730e+04
  

# Save the matrices

In [6]:
np.save(os.path.join(output_dir, "Us.npy"), arr=Us)
np.save(os.path.join(output_dir, "B.npy"), arr=B)
np.save(os.path.join(output_dir, "Ut.npy"), arr=Ut)

# Transform to different spaces

In [7]:
u,s,vh = np.linalg.svd(B, full_matrices=True)
b_sqrt = np.dot(u, np.dot(np.diag(np.sqrt(s)), vh))
src_embeddings = to_numpy(languages[src].embeddings, gpu >= 0)
tgt_embeddings = to_numpy(languages[tgt].embeddings, gpu >= 0)
src_transform = np.dot(np.dot(src_embeddings, Us), b_sqrt)
tgt_transform = np.dot(np.dot(tgt_embeddings, Ut), b_sqrt)

# NN Evaluation using CSLS

In [8]:
csls_object = csls.CSLS(src_transform, tgt_transform, gpu_device=gpu)

In [9]:
evaluator = evl.Evaluator(languages[src], languages[tgt], data_dir="data")

In [10]:
metrics = evaluator.supervised(csls_object, {})

2018-10-21 21:11:44,769: INFO: Using Mode: csls
2018-10-21 21:11:44,939: INFO: Total: 1500, Precision@1: 48.33, @5: 70.13, @10: 75.20


In [11]:
metrics_file = os.path.join(output_dir, "metrics.json")
logger.info(f"Writing metrics to {metrics_file}")
json.dump(metrics, open(metrics_file, "w"))
logger.info("Done")

2018-10-21 21:11:46,888: INFO: Writing metrics to Experiments/GeoMM/run-3/metrics.json


NameError: name 'json' is not defined