# Sentences

In [1]:
import os
import pickle
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from itertools import product
from datetime import datetime
import numpy as np
import pandas as pd
import spacy
from sklearn.decomposition import TruncatedSVD

# Append timestamped records (INFO and above) to the sentence-embedding run log.
logging.basicConfig(
    filename='logs/sentences.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)

In [2]:
class SentenceEmbedder:
    """
    Turn the sentences of one language into fixed-size vectors built from
    pre-trained word embeddings.

    Two families of representations are produced:

    * element-wise pooling of the word vectors (``sum``, ``mean``, ... --
      any reduction name accepted by ``combine_element_wise``), and
    * ``sif``: a frequency-weighted average of the word vectors followed by
      removal of the projection on the first principal component.

    Every result is joined with the prepared sentence table and written to
    ``representations/<lg>/<model>-<method>-<dim>.csv``.
    """

    # Smoothing constant ``a`` in the word weight ``a / (a + p(w))`` (see get_weights).
    ALPHA = 1e-4

    def __init__(self, lg):
        """
        Load the tokenizer and the per-language resources from disk.

        :param lg: language identifier; used as the spaCy model name and as
                   the sub-directory name for all input and output files
        """
        self.lg = lg
        # Only the tokenizer is used, so the other pipeline components are disabled.
        nlp = spacy.load(lg, disable=['tagger', 'parser', 'ner', 'textcat'])
        self.tokenizer = nlp.tokenizer
        self.not_in_wiki = self.read_pickled_metadata('wikipedia', 'unk-metadata.pkl', 'not-in-wiki')
        self.tensed_types = self.read_pickled_metadata('UD', 'metadata.pkl', 'tensed_types')
        # Tensed word types that are absent from the wiki data; unk_filter
        # refuses to silently replace these with 'UNK'.
        self.missing_tensed_words = self.tensed_types.intersection(self.not_in_wiki)
        self.prepared = self.read_prepared()
        self.sentences = self.prepared['text']

    def read_pickled_metadata(self, dirname, fname, key):
        """
        Return one entry of a pickled mapping stored at ``<dirname>/<lg>/<fname>``.

        :param dirname: top-level directory of the resource
        :param fname: name of the pickle file inside the language directory
        :param key: key to look up in the unpickled object
        :return: the value stored under ``key``
        """
        path = os.path.join(dirname, self.lg, fname)
        with open(path, 'rb') as handle:
            obj = pickle.load(handle)
        return obj[key]

    def read_prepared(self):
        """
        Read ``UD/<lg>/prepared.csv``.

        :return: the prepared table as a DataFrame
        """
        return pd.read_csv(os.path.join('UD', self.lg, 'prepared.csv'))

    def read_embedding(self, model, dim):
        """
        Read the word embeddings stored in ``embeddings/<lg>/<model>-<dim>.csv``.

        :param model: name of the embedding model
        :param dim: dimensionality of the embedding
        :return: DataFrame indexed by ``lx_obj`` with one row per word
        """
        fname = os.path.join('embeddings', self.lg, '{}-{}.csv'.format(model, dim))
        df = pd.read_csv(fname, index_col=0)
        # Drops rows that somehow are embeddings for NaN (the index has to be
        # a regular column for dropna to see it).
        return df.reset_index().dropna().set_index('lx_obj')

    def combine_sentence_embeddings_and_prepared(self, embeddings):
        """
        Left-join the sentence embeddings onto the prepared table by row index.

        :param embeddings: DataFrame whose i-th row is the vector of the i-th sentence
        :return: the prepared table with the embedding columns appended
        """
        return pd.merge(self.prepared, embeddings, left_index=True, right_index=True, how='left')

    def save(self, df, model, dim):
        """
        Write ``df`` (without its index) to ``representations/<lg>/<model>-<dim>.csv``.

        :param df: table to store
        :param model: model label used in the file name
        :param dim: dimensionality used in the file name
        """
        out_dir = os.path.join('representations', self.lg)
        # Create the output directory on first use instead of failing in to_csv.
        os.makedirs(out_dir, exist_ok=True)
        fname = os.path.join(out_dir, '{}-{}.csv'.format(model, dim))
        df.to_csv(fname, index=False)

    def unk_filter(self, sentence, embeddings):
        """
        Tokenize ``sentence`` and replace every token without an embedding by ``'UNK'``.

        :param sentence: raw sentence text
        :param embeddings: word embeddings indexed by word
        :return: list of tokens, all of which are either in ``embeddings.index`` or ``'UNK'``
        :raises AssertionError: if a token without an embedding is one of the
                                missing tensed words
        """
        vocabulary = embeddings.index
        new_tokens = []
        for tok in self.tokenizer(sentence):
            token = tok.text
            if token in vocabulary:
                new_tokens.append(token)
                continue
            # Raised explicitly (rather than via ``assert``) so that the check
            # is not stripped when Python runs with -O.
            if token in self.missing_tensed_words:
                raise AssertionError("You're missing an important tensed word here.")
            new_tokens.append('UNK')
        return new_tokens

    def combine_element_wise(self, model, dim, f):
        """
        Pool the word vectors of each sentence with the reduction ``f`` and save the result.

        :param model: name of the embedding model
        :param dim: dimensionality of the embedding
        :param f: name of a DataFrame reduction method (e.g. ``'sum'``, ``'mean'``)
        """
        embeddings = self.read_embedding(model, dim)
        sentence_embeddings = []
        for s in self.sentences:
            new_tokens = self.unk_filter(s, embeddings)
            # Column-wise reduction over the rows (words) of the sentence.
            e = getattr(embeddings.loc[new_tokens], f)()
            # Column labels come from the CSV header as strings; store them as ints.
            e.index = e.index.astype(int)
            sentence_embeddings.append(e)
        sentence_embeddings = pd.DataFrame(sentence_embeddings)
        result = self.combine_sentence_embeddings_and_prepared(sentence_embeddings)
        model = '{}-{}'.format(model, f)
        self.save(result, model, dim)

    def read_wiki_metadata(self, embeddings):
        """
        Read the frequency data stored in ``wikipedia/<lg>/metadata.pkl``.

        The frequency of ``'UNK'`` is set to the total frequency of all words
        that have no embedding.

        :param embeddings: word embeddings indexed by word
        :return: tuple ``(freq_dist, num_tokens)``
        """
        fname = os.path.join('wikipedia', self.lg, 'metadata.pkl')
        with open(fname, 'rb') as handle:
            obj = pickle.load(handle)
        wiki_freq_dist = obj['freq_dist']
        vocabulary = embeddings.index
        wiki_freq_dist['UNK'] = sum(
            freq for word, freq in wiki_freq_dist.items() if word not in vocabulary
        )
        num_tokens = obj['num_tokens']
        return wiki_freq_dist, num_tokens

    def get_weights(self, model, dim, embeddings=None):
        """
        Compute the weight ``ALPHA / (ALPHA + p(w))`` of every embedded word,
        where ``p(w)`` is the word's frequency divided by ``num_tokens``.

        :param model: name of the embedding model
        :param dim: dimensionality of the embedding
        :param embeddings: already loaded embeddings for ``model``/``dim``;
                           read from disk when omitted
        :return: Series of weights indexed like the embeddings
        """
        if embeddings is None:
            embeddings = self.read_embedding(model, dim)
        wiki_freq_dist, num_tokens = self.read_wiki_metadata(embeddings)
        counts = pd.Series([wiki_freq_dist[w] for w in embeddings.index], index=embeddings.index)
        probs = counts / num_tokens
        return self.ALPHA / (self.ALPHA + probs)

    def get_weighted_average(self, embeddings, model, dim):
        """
        Compute the weighted average of the word vectors of every sentence.

        :param embeddings: word embeddings for ``model``/``dim``, indexed by word
        :param model: name of the embedding model
        :param dim: dimensionality of the embedding
        :return: array whose i-th row is the weighted average for the i-th sentence
        """
        # Reuse the embeddings that are already in memory instead of reading
        # the same CSV file a second time.
        weights = self.get_weights(model, dim, embeddings=embeddings)
        result = np.zeros((len(self.sentences), dim))
        for i, s in enumerate(self.sentences):
            new_tokens = self.unk_filter(s, embeddings)
            result[i, :] = np.average(
                embeddings.loc[new_tokens].values,
                weights=weights.loc[new_tokens].values,
                axis=0,
            )
        return result

    def sif(self, model, dim):
        """
        Build the ``sif`` sentence representations (weighted average with the
        first principal component removed) and save them.

        :param model: name of the embedding model
        :param dim: dimensionality of the embedding
        """
        word_embeddings = self.read_embedding(model, dim)
        sentence_embeddings = self.get_weighted_average(word_embeddings, model, dim)
        sentence_embeddings = self.remove_pc(sentence_embeddings, 1)
        sentence_embeddings = pd.DataFrame(sentence_embeddings)
        result = self.combine_sentence_embeddings_and_prepared(sentence_embeddings)
        model = '{}-sif'.format(model)
        self.save(result, model, dim)

    def compute_pc(self, X, npc):
        """
        Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
        :param X: X[i,:] is a data point
        :param npc: number of principal components to remove
        :return: component_[i,:] is the i-th pc
        """
        svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
        svd.fit(X)
        return svd.components_

    def remove_pc(self, X, npc):
        """
        Remove the projection on the principal components
        :param X: X[i,:] is a data point
        :param npc: number of principal components to remove
        :return: XX[i, :] is the data point after removing its projection
        """
        pc = self.compute_pc(X, npc)
        return self._remove_projection(X, pc)

    @staticmethod
    def _remove_projection(X, pc):
        """
        Subtract from every row of X its projection on the rows of pc.
        :param X: X[i,:] is a data point
        :param pc: pc[j,:] is the j-th component
        :return: array of the same shape as X
        """
        # (n, dim) . (dim, npc) . (npc, dim) -> (n, dim). Unlike the
        # element-wise product ``X.dot(pc.T) * pc`` this is valid for any
        # number of components, not just one.
        return X - X.dot(pc.transpose()).dot(pc)

    def create_sentence_embedding(self, model, dim, method):
        """
        Create and save the sentence representations for one configuration.

        :param model: name of the embedding model
        :param dim: dimensionality of the embedding
        :param method: ``'sif'`` or one of the element-wise reductions listed
                       in the module-level ``FS``
        """
        if method == 'sif':
            self.sif(model, dim)
        elif method in FS:
            self.combine_element_wise(model, dim, method)
        else:
            # Nothing is created for an unknown method; make that visible in
            # the log instead of returning silently.
            logging.warning('Unknown sentence embedding method %r; nothing was created.', method)

In [2]:
# Languages to process; each one is passed to SentenceEmbedder by main().
LGS = [
    'en',
    'fr',
]
# Embedding dimensionalities: 20, 40, ..., 300.
DIMS = np.arange(20, 320, 20)
# Element-wise pooling methods.
FS = [
    'sum',
    'mean',
    'max',
    'min',
]
# Every sentence-embedding method: the pooling methods plus 'sif'.
METHODS = [*FS, 'sif']

In [7]:
def main(lgs, dims, methods):
    """
    Create the skipgram sentence representations for every combination of
    language, dimensionality and method, logging how long each one took.

    :param lgs: languages to process
    :param dims: embedding dimensionalities to process
    :param methods: sentence-embedding methods to apply
    """
    model = 'skipgram'
    template = 'Creating sentence representations for {} {} {} {} took {}'
    for lg in lgs:
        # One embedder per language; it is reused for all dims and methods.
        embedder = SentenceEmbedder(lg)
        for d in dims:
            for method in methods:
                started_at = datetime.now()
                embedder.create_sentence_embedding(model, d, method)
                elapsed = datetime.now() - started_at
                logging.info(template.format(lg, model, d, method, elapsed))

In [8]:
# Run the full grid: every language in LGS, every dimensionality in DIMS, every method in METHODS.
main(LGS, DIMS, METHODS)