# Building Ioya2Vec

In [1]:
import ktrain
import numpy as np
from gensim.models import Word2Vec
from gensim import utils
import pandas as pd
import pickle

## Compute custom Word2Vec embeddings

In [2]:
# Notebook-wide configuration.
NUM_WORDS = 33_000   # vocabulary cap: passed as max_features and used to enumerate the preprocessor vocabulary
MAXLEN = 5_000       # passed as maxlen to the ktrain preprocessor
DIMS = 300           # dimensionality of the Word2Vec vectors
NGRAM_RANGE = 1      # not referenced by the cells visible in this notebook section

### Functions to do the heavy lifting

In [3]:
def preprocess_corpus(corpus_path, text_column='contents', max_features=33000, maxlen=5000, class_names=[0,1]):
    """
    Load a corpus of texts from a csv file and run it through ktrain's
    StandardTextPreprocessor in preparation for training gensim's
    Word2Vec model.

    Parameters
    ----------
    corpus_path : path of the csv file, read with pandas.read_csv.
    text_column : name of the csv column that holds the document text.
    max_features : forwarded to the ktrain preprocessor as max_features.
    maxlen : forwarded to the ktrain preprocessor as maxlen.
    class_names : forwarded to the ktrain preprocessor; the labels used
        here are placeholders, so the value only has to satisfy ktrain.

    Returns
    -------
    (preproc, corpus) : the fitted preprocessor and a list with one list
        of string tokens per document, obtained by mapping each processed
        document back to text with preproc.undo and splitting on single
        spaces.
    """
    # Give the preprocessor its own copy of class_names. The preprocessor is
    # returned to the caller, so passing the default list itself would make
    # every call share (and potentially alter) the same list object.
    preproc = ktrain.text.preprocessor.StandardTextPreprocessor(max_features=max_features,
                                                                maxlen=maxlen,
                                                                class_names=list(class_names))
    texts = pd.read_csv(corpus_path)[text_column]
    # preprocess_train is called with labels alongside the texts; Word2Vec
    # training is unsupervised, so all-zero placeholders are supplied.
    dummy_labels = np.zeros(len(texts))
    X, _ = preproc.preprocess_train(texts, dummy_labels)
    corpus = [preproc.undo(doc).split(' ') for doc in X]
    return preproc, corpus


def build_wv_model(corpus, dims, save_path=None, workers=8):
    """
    Train a Word2Vec model on the corpus returned by preprocess_corpus and
    package the result as a plain dictionary.

    Parameters
    ----------
    corpus : iterable of token lists (one list of strings per document).
    dims : dimensionality of the word vectors.
    save_path : if not None, the dictionary is pickled to this path.
    workers : number of worker threads handed to Word2Vec.

    Returns
    -------
    dict with keys
        'embeddings'    : array with one row per vocabulary word,
        'text_to_token' : word -> row index,
        'token_to_text' : row index -> word,
        'vocab'         : words in row order,
        'vocab_len'     : number of words.
    """
    # `size` and `index2word` are the gensim 3.x names; gensim 4 renamed them
    # to `vector_size` and `index_to_key`.
    # Forward the caller's `workers` instead of a hard-coded 8.
    WV = Word2Vec(sentences=corpus, size=dims, workers=workers)
    vocab = WV.wv.index2word
    vocab_len = len(vocab)
    embeddings = np.array([WV.wv.get_vector(word) for word in vocab])
    text_to_token = {word: i for i, word in enumerate(vocab)}
    token_to_text = dict(enumerate(vocab))
    model = {'embeddings': embeddings,
             'text_to_token': text_to_token,
             'token_to_text': token_to_text,
             'vocab': vocab,
             'vocab_len': vocab_len}
    if save_path is not None:
        with open(save_path, 'wb') as f:
            pickle.dump(model, f)
    return model


def embed_word(word, wv_model, dims):
    """
    Return the embedding of `word` from a model dictionary built by
    build_wv_model, or a zero vector of length `dims` when the word is
    not in the model.

    The lookup goes through the 'text_to_token' dictionary rather than
    scanning the 'vocab' list, so each call is a hash lookup instead of a
    linear search.
    """
    token = wv_model['text_to_token'].get(word)
    # Compare against None explicitly: token 0 is a valid row index.
    if token is None:
        return np.zeros(dims)
    return wv_model['embeddings'][token]
    

def build_embeddings(preproc, wv_model, dims, num_words=None):
    """
    Build an embedding matrix for the preprocessor's vocabulary.

    Parameters
    ----------
    preproc : preprocessor returned by preprocess_corpus; only its
        undo method is used here.
    wv_model : model dictionary returned by build_wv_model.
    dims : length of each embedding vector (used for the zero vector of
        words missing from wv_model).
    num_words : how many token ids (0 .. num_words-1) to map back to words.
        Defaults to the notebook-level NUM_WORDS, which was the only
        behaviour before this parameter existed; pass the max_features
        value the preprocessor was built with if it differs.

    Returns
    -------
    2-D array with one row per word returned by preproc.undo, in that
    order.

    NOTE(review): rows follow the order of the words undo returns, which is
    assumed to match token-id order -- confirm how undo treats ids that have
    no word (e.g. a padding id) before indexing this matrix by token id.
    """
    if num_words is None:
        num_words = NUM_WORDS
    preproc_vocab = preproc.undo(range(num_words)).split(' ')
    rows = [embed_word(word, wv_model, dims) for word in preproc_vocab]
    return np.stack(rows)

In [None]:
%%time
# Tokenize the csv corpus with ktrain; keeps the fitted preprocessor and the
# per-document token lists that the Word2Vec training cell consumes.
preproc, corpus = preprocess_corpus('../data/ICAAD_FIJI.csv', max_features=NUM_WORDS, maxlen=MAXLEN)

language: en
Word Counts: 132011
Nrows: 13384
13384 train sequences
train sequence lengths:
	mean : 2218
	95percentile : 5989
	99percentile : 11259
x_train shape: (13384,5000)
y_train shape: (13384, 1)
Is Multi-Label? False


In [5]:
%%time
# Train the DIMS-dimensional Word2Vec model on the tokenized corpus and pickle
# the resulting model dictionary to save_path.
wv_model = build_wv_model(corpus, DIMS, save_path='./Ioya2Vec/Ioya2Vec.pickle')

CPU times: user 8min 6s, sys: 2.21 s, total: 8min 8s
Wall time: 1min 27s
