# Chapter 05
## Pretrained Embeddings


In this notebook we will show how to use pretrained embeddings to solve word analogies.

In [1]:

import torch
import torch.nn as nn
from tqdm import tqdm
from annoy import AnnoyIndex
import numpy as np


class PreTrainedEmbeddings(object):
    """Wrapper around pre-trained word vectors and nearest-neighbour queries.

    Attributes:
        word_to_index (dict): maps each word to its integer row index
        word_vectors (list of numpy arrays): vectors addressed by that index
        index_to_word (dict): inverse of ``word_to_index``
        index (AnnoyIndex): Euclidean Annoy index built over the vectors
    """
    def __init__(self, word_to_index, word_vectors):
        """
        Args:
            word_to_index (dict): mapping from word to integers
            word_vectors (list of numpy arrays)
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}

        # The dimensionality is taken from the first vector; it is printed
        # so the notebook shows which embedding size was loaded.
        embedding_size = len(word_vectors[0])
        print(embedding_size)
        self.index = AnnoyIndex(embedding_size, metric='euclidean')
        print("Building Index!")
        for i in self.word_to_index.values():
            self.index.add_item(i, self.word_vectors[i])
        # 50 is the number of trees handed to Annoy's build().
        self.index.build(50)
        print("Finished!")

    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """Instantiate from pre-trained vector file.

        Vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N

        The file is read as UTF-8, one line at a time. Lines with fewer
        than 10 space-separated fields are skipped (this drops a metadata
        header line, and also any vector with fewer than 9 components).
        Lines whose vector size differs from the first accepted vector are
        reported and skipped. When a word occurs more than once only its
        first vector is kept, so that the positions in ``word_vectors``
        always agree with the values in ``word_to_index``.

        Args:
            embedding_file (str): location of the file
        Returns:
            instance of PreTrainedEmbeddings
        Raises:
            ValueError: if a component cannot be parsed as a float; the
                offending fields and line number are printed first
        """
        word_to_index = {}
        word_vectors = []
        dim = None

        # Explicit encoding: vocabularies contain non-ASCII words, and the
        # platform default encoding is not UTF-8 everywhere.
        with open(embedding_file, encoding="utf-8") as fp:
            # Iterate over the handle instead of readlines() so the whole
            # file is never held in memory as a list of strings.
            for idx, raw_line in enumerate(fp):
                fields = raw_line.rstrip().split(" ")
                try:
                    if len(fields) < 10:
                        # First line with metadata; skip
                        continue
                    word = fields[0]

                    vec = np.array([float(x) for x in fields[1:]])

                    if dim is None:
                        dim = vec.shape[0]
                    elif vec.shape[0] != dim:
                        print("Dimensión desigual -- {} -- salteando".format(word))
                        continue

                    if word in word_to_index:
                        # A repeated word must not append another vector:
                        # that would shift the index of every later word.
                        continue

                    word_to_index[word] = len(word_vectors)
                    word_vectors.append(vec)
                except ValueError:
                    print(fields)
                    print(idx)
                    raise
        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        """
        Args:
            word (str)
        Returns
            an embedding (numpy.ndarray)
        Raises:
            KeyError: if the word is not in the vocabulary
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        """Given a vector, return its n nearest neighbors

        Args:
            vector (np.ndarray): should match the size of the vectors
                in the Annoy index
            n (int): the number of neighbors to return
        Returns:
            [str, str, ...]: words that are nearest to the given vector,
                in the order Annoy returns them (approximate neighbours,
                nearest first)
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        """Prints the solutions to analogies using word embeddings

        Analogies are word1 is to word2 as word3 is to __
        This method will print: word1 : word2 :: word3 : word4

        Up to four candidate neighbours are looked up; candidates equal to
        one of the three input words are dropped, so fewer than four lines
        may be printed.

        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        Raises:
            KeyError: if any of the three words is not in the vocabulary
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # now compute the fourth word's embedding!
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        existing_words = {word1, word2, word3}
        closest_words = [word for word in self.get_closest_to_vector(vec4, n=4)
                         if word not in existing_words]

        if not closest_words:
            print("Could not find nearest neighbors for the computed vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))

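Voy a usar los de fastText, que ya tengo bajados... son medio pedorros, igual :-D (Nota: la celda de abajo carga los vectores GloVe 6B de 100 dimensiones; la línea de fastText está comentada.)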

In [5]:
# Disabled alternative: load the wiki.es.vec vectors instead (presumably the
# Spanish fastText file — confirm before re-enabling).
#embeddings = PreTrainedEmbeddings.from_embeddings_file('data/WordVectors/wiki.es.vec')
# Parse the GloVe 6B 100d text file and build the Annoy index over it; the
# constructor prints the vector size and progress messages.
embeddings = PreTrainedEmbeddings.from_embeddings_file('data/WordVectors/glove/glove.6B.100d.txt')

100
Building Index!
Finished!


In [6]:
# Analogy query: man is to him as woman is to ___
embeddings.compute_and_print_analogy('man', 'him', 'woman')

man : him :: woman : herself
man : him :: woman : her


In [7]:
# Analogy query: fly is to plane as sail is to ___
embeddings.compute_and_print_analogy('fly', 'plane', 'sail')

fly : plane :: sail : ship
fly : plane :: sail : vessel


In [9]:
# Analogy query: cat is to kitten as dog is to ___
embeddings.compute_and_print_analogy('cat', 'kitten', 'dog')

cat : kitten :: dog : puppy
cat : kitten :: dog : toddler
cat : kitten :: dog : sleds


In [19]:
# Analogy query: usa is to basketball as argentina is to ___
embeddings.compute_and_print_analogy('usa', 'basketball', 'argentina')

usa : basketball :: argentina : soccer
usa : basketball :: argentina : football


In [20]:
# Analogy query: toe is to foot as finger is to ___
embeddings.compute_and_print_analogy('toe', 'foot', 'finger')

toe : foot :: finger : hand
toe : foot :: finger : attached
toe : foot :: finger : apart


In [21]:
# Analogy query: man is to king as woman is to ___
embeddings.compute_and_print_analogy('man', 'king', 'woman')

man : king :: woman : queen
man : king :: woman : monarch
man : king :: woman : throne


In [22]:
# Analogy query: man is to doctor as woman is to ___
# NOTE(review): the first result recorded below is "nurse", which suggests the
# vectors carry a gender stereotype; worth calling out in the surrounding prose.
embeddings.compute_and_print_analogy('man', 'doctor', 'woman')

man : doctor :: woman : nurse
man : doctor :: woman : physician


In [22]:
# Analogy query: alas is to pájaro as antenas is to ___
# NOTE(review): these are Spanish words, while the loading cell shown in this
# notebook reads the GloVe 6B file. The recorded output presumably came from a
# session with the commented-out wiki.es.vec vectors loaded — confirm, because
# get_embedding raises KeyError for any word missing from the vocabulary.
embeddings.compute_and_print_analogy('alas', 'pájaro', 'antenas')

alas : pájaro :: antenas : zancudos
alas : pájaro :: antenas : saltamontes
alas : pájaro :: antenas : mapache
alas : pájaro :: antenas : calambre
