# Sentences

In [9]:
import os
import pickle
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from itertools import product
from datetime import datetime
import numpy as np
import pandas as pd
import spacy
from sklearn.decomposition import TruncatedSVD

# Make sure the log directory exists: logging.basicConfig opens the file
# immediately and fails with FileNotFoundError if 'logs/' is missing.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/sentences.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Language codes to process.
LGS = ['en']
# Embedding dimensionalities: 100, 120, ..., 500 (the stop value 520 is exclusive).
DIMS = np.arange(100, 520, 20)
# Word-embedding model names.
MODELS = ['skipgram']
# Names of the DataFrame reductions used to pool word vectors into a sentence vector.
FS = ['sum', 'mean', 'max', 'min']
# Smoothing constant `alpha` used in the word weight alpha / (alpha + p(w)).
ALPHA = 1e-4

In [10]:
def compute_pc(X, npc=1):
    """
    Fit a truncated SVD on X and return its leading components.

    The data is deliberately NOT centred before the decomposition.

    :param X: X[i,:] is a data point
    :param npc: number of components to compute
    :return: array whose i-th row is the i-th component
    """
    # random_state is fixed so repeated runs give the same components.
    return TruncatedSVD(n_components=npc, n_iter=7, random_state=0).fit(X).components_

def remove_pc(X, npc=1):
    """
    Remove the projection of every data point on the leading principal components.

    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc == 1:
        # (n, 1) * (1, d) broadcasts to the outer product, i.e. the projection
        # of each row on the single component.
        return X - X.dot(pc.transpose()) * pc
    # With several components the element-wise product no longer broadcasts
    # ((n, npc) * (npc, d)); the projection is the matrix product instead.
    return X - X.dot(pc.transpose()).dot(pc)

class SentenceEmbedder:
    """
    Build sentence representations for one language from pre-trained word embeddings.

    All inputs and outputs are files addressed relative to the current working
    directory: 'wikipedia/<lg>/', 'UD/<lg>/' and 'embeddings/<lg>/' are read,
    'representations/<lg>/' is written.
    """

    def __init__(self, lg):
        """
        Load the tokenizer and the per-language metadata.

        :param lg: language code; used both as the spaCy model name and as the
                   sub-directory name of every data file
        """
        self.lg = lg
        # Only the tokenizer is needed, so the heavier pipeline components are disabled.
        nlp = spacy.load(lg, disable=['tagger', 'parser', 'ner', 'textcat'])
        self.tokenizer = nlp.tokenizer
        self.not_in_wiki = self.read_not_in_wiki()
        self.tensed_types = self.read_tensed_types()
        # Words absent from Wikipedia that are not tensed: safe to map to 'UNK'.
        self.replaceable_words = self.not_in_wiki.difference(self.tensed_types)
        # Tensed words absent from Wikipedia: these must never be silently replaced.
        self.missing_tensed_words = self.tensed_types.intersection(self.not_in_wiki)
        self.prepared = self.read_prepared()
        self.sentences = self.prepared['text']

    @staticmethod
    def _load_pickle(fname):
        """Unpickle and return the object stored in `fname`."""
        # NOTE(security): pickle can execute arbitrary code while loading; these
        # files are assumed to be produced locally by earlier pipeline steps.
        with open(fname, 'rb') as f:
            return pickle.load(f)

    def _read_embeddings(self, model, dim):
        """Read 'embeddings/<lg>/<model>-<dim>.csv' with its first column as the index."""
        fname = os.path.join('embeddings', self.lg, '{}-{}.csv'.format(model, dim))
        return pd.read_csv(fname, index_col=0)

    def _tokenize(self, sentence):
        """Return the token texts of `sentence` as a list of strings."""
        return [t.text for t in self.tokenizer(sentence)]

    def _merge_and_save(self, vectors, fname):
        """Left-join `vectors` onto `self.prepared` by row index, write the CSV, return the join."""
        result = pd.merge(self.prepared, vectors, left_index=True, right_index=True, how='left')
        out_dir = os.path.join('representations', self.lg)
        # to_csv does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)
        result.to_csv(os.path.join(out_dir, fname), index=False)
        return result

    def read_not_in_wiki(self):
        """Return the 'not-in-wiki' entry of 'wikipedia/<lg>/unk-metadata.pkl'."""
        fname = os.path.join('wikipedia', self.lg, 'unk-metadata.pkl')
        return self._load_pickle(fname)['not-in-wiki']

    def read_tensed_types(self):
        """Return the 'tensed_types' entry of 'UD/<lg>/metadata.pkl'."""
        fname = os.path.join('UD', self.lg, 'metadata.pkl')
        return self._load_pickle(fname)['tensed_types']

    def read_prepared(self):
        """Return 'UD/<lg>/prepared.csv' as a DataFrame."""
        fname = os.path.join('UD', self.lg, 'prepared.csv')
        return pd.read_csv(fname)

    def combine(self, model, dim, f, sentences):
        """
        Pool the word vectors of every sentence with the reduction `f` and save the result.

        :param model: embedding model name (part of the input and output file names)
        :param dim: embedding dimensionality (part of the input and output file names)
        :param f: name of a DataFrame reduction method, e.g. 'sum' or 'mean'
        :param sentences: iterable of sentence strings
        :return: `self.prepared` left-joined with one row of pooled values per sentence;
                 also written to 'representations/<lg>/<model>-<f>-<dim>.csv'
        :raises AssertionError: if a sentence contains a tensed word that is not in Wikipedia
        """
        learnt_embeddings = self._read_embeddings(model, dim)
        rows = []
        for s in sentences:
            # Unimportant words that didn't get embeddings are replaced with the
            # UNK token (the alternative would be to drop them).
            new_tokens = []
            for token in self._tokenize(s):
                if token in self.replaceable_words:
                    new_tokens.append('UNK')
                    continue
                if token in self.missing_tensed_words:
                    # Explicit raise instead of `assert` so the check is not
                    # stripped when Python runs with -O.
                    raise AssertionError("You're missing an important tensed word here.")
                new_tokens.append(token)
            e = getattr(learnt_embeddings.loc[new_tokens], f)()
            # Column labels come back from the CSV as strings; store them as ints.
            e.index = e.index.astype(int)
            rows.append(e)
        fname = '{}-{}-{}.csv'.format(model, f, dim)
        return self._merge_and_save(pd.DataFrame(rows), fname)

    def get_weighted_average(self, model, dim, alpha):
        """
        Compute a frequency-weighted average of word vectors for every sentence.

        Each word w is weighted by alpha / (alpha + p(w)), where p(w) is its count
        in 'wikipedia/<lg>/metadata.pkl' divided by the total token count. Tokens
        without an embedding are mapped to 'UNK'.

        The original implementation was memory inefficient so only this function
        is replaced; the rest follows the reference code.

        :param model: embedding model name
        :param dim: embedding dimensionality
        :param alpha: smoothing constant of the weighting scheme
        :return: array of shape (number of sentences, dim)
        """
        embeddings = self._read_embeddings(model, dim)
        vocab = embeddings.index
        obj = self._load_pickle(os.path.join('wikipedia', self.lg, 'metadata.pkl'))
        wiki_freq_dist = obj['freq_dist']
        # 'UNK' stands in for every word without an embedding, so it gets their total count.
        wiki_freq_dist['UNK'] = sum(freq for word, freq in wiki_freq_dist.items() if word not in vocab)
        num_tokens = obj['num_tokens']
        rel_freq = np.array([wiki_freq_dist[word] for word in vocab]) / num_tokens
        weights = pd.Series(alpha / (alpha + rel_freq), index=vocab)
        result = np.zeros((len(self.sentences), dim))
        for i, s in enumerate(self.sentences):
            new_tokens = [token if token in vocab else 'UNK' for token in self._tokenize(s)]
            result[i, :] = np.average(embeddings.loc[new_tokens].values,
                                      weights=weights.loc[new_tokens].values, axis=0)
        return result

    def t2b(self, model, dim, alpha):
        """
        Weighted-average sentence vectors with the first principal component removed.

        NOTE(review): 't2b' presumably stands for the "tough-to-beat" baseline -- confirm.

        :param model: embedding model name
        :param dim: embedding dimensionality
        :param alpha: smoothing constant passed to `get_weighted_average`
        :return: `self.prepared` left-joined with the sentence vectors; also written to
                 'representations/<lg>/<model>-t2b-<alpha>-<dim>.csv'
        """
        emb = self.get_weighted_average(model, dim, alpha)
        result = pd.DataFrame(remove_pc(emb, 1))
        fname = '{}-{}-{}-{}.csv'.format(model, 't2b', alpha, dim)
        return self._merge_and_save(result, fname)

In [11]:
def main(lgs):
    """
    Create and save every sentence representation for each language in `lgs`.

    For every dimensionality in DIMS this writes one pooled representation per
    reduction in FS plus one 't2b' representation, and logs how long each took.

    :param lgs: iterable of language codes
    """
    # Bound before the loops: it is also needed for the t2b step, which runs
    # even when FS is empty.
    model = 'skipgram'
    msg_template = 'Creating sentence representations for {} {} {} {} took {}'
    for lg in lgs:
        s = SentenceEmbedder(lg)
        for d in DIMS:
            for f in FS:
                start = datetime.now()
                s.combine(model, d, f, s.sentences)
                end = datetime.now()
                logging.info(msg_template.format(lg, model, d, f, end - start))
            start = datetime.now()
            s.t2b(model, d, ALPHA)
            end = datetime.now()
            logging.info(msg_template.format(lg, model, d, 't2b', end - start))

In [11]:
# Run the pipeline for the configured languages (LGS is ['en']).
main(LGS)