# Sentences

In [1]:
import os
import pickle
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime
import pandas as pd
import spacy
from tqdm import tqdm

# Send INFO-level messages to logs/sentences.log; filemode='w' truncates the
# log left by any previous run.
# logging.basicConfig opens the file straight away, so make sure the target
# directory exists first instead of failing with FileNotFoundError.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/sentences.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

In [2]:
class SentenceEmbedder:
    """Turn sentences into fixed-size vectors by pooling word embeddings.

    For a given language directory name ``lg`` the constructor loads:

    * a spaCy tokenizer,
    * the set of words stored under ``'not-in-wiki'`` in
      ``wikipedia/<lg>/unk-metadata.pkl``,
    * the set of words stored under ``'tensed_types'`` in
      ``UD/<lg>/metadata.pkl``,
    * the table ``UD/<lg>/prepared.csv`` (its ``'text'`` column holds the
      sentences).

    Attributes:
        lg: The language / model name given to the constructor.
        tokenizer: The tokenizer of the loaded spaCy pipeline.
        not_in_wiki: Words listed under ``'not-in-wiki'``.
        tensed_types: Words listed under ``'tensed_types'``.
        replaceable_words: ``not_in_wiki`` minus ``tensed_types``; these are
            rewritten to the ``'UNK'`` token in :meth:`combine`.
        missing_tensed_words: Words that are in both ``tensed_types`` and
            ``not_in_wiki``.
        prepared: DataFrame read from ``prepared.csv``.
        sentences: The ``'text'`` column of ``prepared``.
    """

    def __init__(self, lg):
        """Load the tokenizer and all per-language resources for ``lg``.

        Args:
            lg: Name handed to ``spacy.load`` and also used as the
                sub-directory name under ``wikipedia/``, ``UD/``,
                ``embeddings/`` and ``representations/``.
        """
        self.lg = lg
        # Only the tokenizer is used below, so the other pipeline components
        # are disabled at load time.
        nlp = spacy.load(lg, disable=['tagger', 'parser', 'ner', 'textcat'])
        self.tokenizer = nlp.tokenizer
        self.not_in_wiki = self.read_not_in_wiki()
        self.tensed_types = self.read_tensed_types()
        self.replaceable_words = self.not_in_wiki.difference(self.tensed_types)
        self.missing_tensed_words = self.tensed_types.intersection(self.not_in_wiki)
        self.prepared = self.read_prepared()
        self.sentences = self.prepared['text']

    def read_not_in_wiki(self):
        """Return the ``'not-in-wiki'`` entry of ``wikipedia/<lg>/unk-metadata.pkl``.

        NOTE: ``pickle.load`` can execute arbitrary code; only point this at
        files produced by a trusted pipeline.
        """
        fname = os.path.join('wikipedia', self.lg, 'unk-metadata.pkl')
        with open(fname, 'rb') as f:
            obj = pickle.load(f)
        return obj['not-in-wiki']

    def read_tensed_types(self):
        """Return the ``'tensed_types'`` entry of ``UD/<lg>/metadata.pkl``.

        NOTE: ``pickle.load`` can execute arbitrary code; only point this at
        files produced by a trusted pipeline.
        """
        fname = os.path.join('UD', self.lg, 'metadata.pkl')
        with open(fname, 'rb') as f:
            obj = pickle.load(f)
        return obj['tensed_types']

    def read_prepared(self):
        """Read ``UD/<lg>/prepared.csv`` into a DataFrame."""
        fname = os.path.join('UD', self.lg, 'prepared.csv')
        return pd.read_csv(fname)

    def combine(self, model, dim, f, sentences):
        """Pool word embeddings into one vector per sentence and save them.

        Embeddings are read from ``embeddings/<lg>/<model>-<dim>.csv`` (first
        column is the word index). Each sentence is tokenized, words in
        ``self.replaceable_words`` are replaced by ``'UNK'``, and the
        embedding rows of the resulting tokens are reduced column-wise with
        the DataFrame method named ``f``. The pooled vectors are joined
        row-by-row (by position/index) onto ``self.prepared`` and written to
        ``representations/<lg>/<model>-<f>-<dim>.csv``.

        Tokens that have no row in the embedding table (for example tensed
        words that never received an embedding) are left out of the pooling
        instead of raising ``KeyError``; affected sentences are expected to be
        filtered out later, at classification time.

        Args:
            model: Embedding model name used in the file names
                (e.g. ``'skipgram'``).
            dim: Embedding dimensionality used in the file names.
            f: Name of a column-wise DataFrame reduction, e.g. ``'sum'`` or
                ``'mean'``.
            sentences: Iterable of sentence strings, in the same order as the
                rows of ``self.prepared``.

        Returns:
            ``self.prepared`` left-joined with one column per embedding
            dimension (integer column labels).
        """
        embedding_fname = os.path.join('embeddings', self.lg, '{}-{}.csv'.format(model, dim))
        learnt_embeddings = pd.read_csv(embedding_fname, index_col=0)
        vocabulary = learnt_embeddings.index
        result = []
        for s in sentences:
            # Unimportant words that didn't get embeddings are replaced with
            # the UNK token (the alternative would be to drop them).
            new_tokens = [
                'UNK' if t.text in self.replaceable_words else t.text
                for t in self.tokenizer(s)
            ]
            # Important (tensed) words may still be missing from the
            # embeddings. `.loc` with a list containing unknown labels raises
            # KeyError on current pandas, so only look up tokens that exist.
            known_tokens = [token for token in new_tokens if token in vocabulary]
            e = getattr(learnt_embeddings.loc[known_tokens], f)()
            # Column labels come back from the CSV header as strings.
            e.index = e.index.astype(int)
            result.append(e)
        result = pd.DataFrame(result)
        result = pd.merge(self.prepared, result, left_index=True, right_index=True, how='left')
        fname = '{}-{}-{}.csv'.format(model, f, dim)
        out_dir = os.path.join('representations', self.lg)
        # Don't lose the computed representations to a missing directory.
        os.makedirs(out_dir, exist_ok=True)
        fname = os.path.join(out_dir, fname)
        result.to_csv(fname, index=False)
        return result

    def t2b(self, model, dim, sentences):
        """Placeholder, not implemented yet; does nothing and returns None."""
        pass

In [8]:
def main(lgs):
    """Build and save sentence representations for every language in `lgs`.

    For each language a SentenceEmbedder is created once, then `combine` is
    run for every (dimension, model, pooling function) combination, and the
    time each run took is written to the log.
    """
    dims = (50, 100, 300, 500, 700)
    models = ('skipgram', 'fasttext')
    poolings = ('sum', 'mean')
    # Same nesting order as three explicit loops: dimension, then model, then pooling.
    settings = [(d, m, p) for d in dims for m in models for p in poolings]
    for lg in lgs:
        embedder = SentenceEmbedder(lg)
        for dim, model, pooling in settings:
            started = datetime.now()
            embedder.combine(model, dim, pooling, embedder.sentences)
            elapsed = datetime.now() - started
            logging.info(
                'Creating sentence representations for {} {} {} {} took {}'.format(
                    lg, model, dim, pooling, elapsed
                )
            )

In [9]:
# Run the whole pipeline for the four configured language codes.
main(lgs=['en', 'fr', 'it', 'es'])

KeyboardInterrupt: 