# Word embeddings

In [1]:
import os
import pickle
import logging
from datetime import datetime
import pandas as pd
from gensim.models import Word2Vec, FastText
from gensim.models.word2vec import PathLineSentences

# The logging file handler opens its target with a plain open() and does not
# create missing parent directories, so make sure 'logs/' exists first.
os.makedirs('logs', exist_ok=True)
# filemode='w' truncates the log on every run, so the file only ever holds the
# timings of the most recent run.
logging.basicConfig(filename='logs/word-embeddings.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
# Only let gensim records at WARNING or above through; the root logger is at INFO.
logging.getLogger("gensim").setLevel(logging.WARNING)

# Root directory under which one sub-directory of embeddings per language is written.
OUT_DIR = 'representations'

In [2]:
class Trainer:
    """Train word-embedding models for one language and export them as CSV.

    For every architecture in ``MODELS`` and every vector size in ``DIMS`` a
    model is trained on the sentences found under ``wikipedia/<lg>/articles``
    and its vectors are written to ``<OUT_DIR>/<lg>/<name>-<dim>.csv``.

    All hyperparameters are class attributes so that they can be overridden
    in a subclass (or on the class) without touching the training loop.
    """

    # Architectures to train, keyed by the name used in file names and logs.
    MODELS = {'skipgram': Word2Vec,
              'fasttext': FastText}
    # Vector sizes; one model is trained per (architecture, size) pair.
    DIMS = [100, 300, 500]
    # Forwarded as ``window=`` to the model constructor.
    WINDOW = 5
    # Forwarded as ``sg=``; in gensim, 1 selects the skip-gram algorithm.
    SG = 1
    # Forwarded as ``hs=``; in gensim, 1 enables hierarchical softmax.
    HS = 1
    # Number of passes over the corpus made by ``train``.
    EPOCHS = 5
    # Forwarded as ``max_vocab_size=`` to the model constructor.
    MAX_VOCAB_SIZE = 50000

    def __init__(self, lg):
        """Locate the corpus for ``lg`` and create its output directory.

        Args:
            lg: Name of the sub-directory of ``wikipedia`` holding the corpus
                (the callers in this file pass language codes such as 'en').

        Raises:
            FileExistsError: If ``<OUT_DIR>/<lg>`` already exists. This is
                intentional: it prevents overwriting embeddings from an
                earlier run.
            OSError: If ``wikipedia/<lg>/metadata.pkl`` cannot be opened.
            KeyError: If the metadata has no ``'num_sents'`` entry.
        """
        self.lg = lg
        corpus_root = os.path.join('wikipedia', lg)
        self.sentences = PathLineSentences(os.path.join(corpus_root, 'articles'))
        # NOTE(review): unpickling executes arbitrary code from the file. This
        # is only acceptable if metadata.pkl is produced locally by a trusted
        # preprocessing step -- confirm, or switch the metadata to JSON.
        with open(os.path.join(corpus_root, 'metadata.pkl'), 'rb') as f:
            metadata = pickle.load(f)
        self.num_sents = metadata['num_sents']
        self.out_dir = os.path.join(OUT_DIR, lg)
        # Deliberately no exist_ok=True: fail loudly instead of re-writing data.
        os.makedirs(self.out_dir)

    def train(self):
        """Train, export and time one model per architecture and vector size.

        Each model is built from scratch, trained for ``EPOCHS`` epochs,
        written to disk via ``save`` and then a single INFO record with the
        elapsed wall-clock time is logged.
        """
        for name, model_cls in self.MODELS.items():
            for dim in self.DIMS:
                start = datetime.now()
                # NOTE(review): ``size=`` (here) and ``wv.vocab`` (in save) are
                # the gensim 3.x spellings; gensim 4 renamed them to
                # ``vector_size`` and ``wv.key_to_index`` -- confirm the
                # pinned gensim version before upgrading.
                model = model_cls(size=dim,
                                  sg=self.SG,
                                  window=self.WINDOW,
                                  hs=self.HS,
                                  max_vocab_size=self.MAX_VOCAB_SIZE)
                model.build_vocab(self.sentences)
                model.train(self.sentences,
                            total_examples=self.num_sents,
                            epochs=self.EPOCHS)
                self.save(model, name)
                # The reported duration covers vocabulary building, training
                # and the CSV export above.
                elapsed = datetime.now() - start
                msg = 'Training {} with {} dimensions on {} sentences for {} took {}'.format(name, dim, self.num_sents, self.lg, elapsed)
                logging.info(msg)

    def save(self, model, name):
        """Write the vectors of ``model`` to ``<out_dir>/<name>-<dim>.csv``.

        The CSV has one row per vocabulary word; the index column is named
        ``lx_obj`` and the remaining columns hold the vector components.

        Args:
            model: A trained model exposing ``wv.vocab`` and ``wv.get_vector``.
            name: Architecture label used as the file-name prefix.
        """
        # Iterating the vocab mapping yields its keys (the words) directly.
        embeddings = {word: model.wv.get_vector(word) for word in model.wv.vocab}
        df = pd.DataFrame.from_dict(embeddings, orient='index')
        df.index.name = 'lx_obj'
        # Derive the dimensionality from the data so the file name always
        # matches what was actually written.
        dim = len(df.columns)
        fname = os.path.join(self.out_dir, '{}-{}.csv'.format(name, dim))
        df.to_csv(fname)

In [3]:
# Train and export every architecture/size combination, one language at a
# time. Trainer() raises if a language's output directory already exists, so
# the loop stops at the first language that has been processed before.
for lg in ('en', 'fr', 'it', 'es'):
    trainer = Trainer(lg)
    trainer.train()