In [116]:
import os

import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from gensim.utils import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from tqdm import tqdm

from constants import *
# Register tqdm with pandas so Series/DataFrame objects gain `progress_apply`,
# which the tokenization cell further down relies on.
tqdm.pandas()

In [87]:
# Choose the corpus for this notebook and load its frame and seed words.
# NOTE(review): `set` shadows the Python builtin of the same name; the name is
# kept because other cells may read this global.
set = NYT
df_path = get_data_path(set)
df = get_data(set)
seeds = get_data(set, type='seedwords')

# Collapse the per-key seed word lists into a single flat list.
seeds_flat = []
for seed in seeds.values():
    seeds_flat += seed

df.head()

Unnamed: 0,sentence,label,tfidf-auto
0,"nasa, in preparation for a spacewalk on saturd...",science,science
1,if professional pride and strong defiance can ...,sports,sports
2,"admittedly, the language is reconstructed and ...",arts,arts
3,"palermo, sicily — roberta vinci beat top-seede...",sports,sports
4,the argentine soccer club san lorenzo complete...,sports,sports


In [88]:
def f1_scores(df,y_pred, y_true='label'):
    """Score one prediction column of `df` against a ground-truth column.

    Args:
        df: frame holding both the prediction and the ground-truth columns.
        y_pred: name of the column with predicted labels.
        y_true: name of the column with true labels (defaults to 'label').

    Returns:
        A `(macro_f1, micro_f1)` tuple.
    """
    truth = df[y_true]
    predicted = df[y_pred]
    # Same two calls as before, generated in (macro, micro) order.
    return tuple(
        f1_score(truth, predicted, average=mode)
        for mode in ('macro', 'micro')
    )
f1_scores(df,'tfidf-auto')

(0.607252385118526, 0.8480090222954801)

In [None]:
def get_embeddings(set, gran='coarse', tokenizer=tokenize):
    """Return Word2Vec word vectors for a dataset, training them only when no cache exists.

    The vectors are cached on disk under MODELS_PATH as `<set>-<gran>.txt`.
    The granularity is part of the file name so that vectors trained on one
    granularity are never returned for a request for another.

    Args:
        set: dataset identifier (a string; it is used in the cache file name
            and passed to `get_data`).
        gran: granularity forwarded to `get_data(set, granularity=gran)`.
        tokenizer: callable mapping a sentence string to an iterable of tokens.

    Returns:
        The `KeyedVectors` loaded from the cache file. A freshly trained model
        is also written to disk and re-loaded, so both paths return the same
        kind of object.
    """
    save_path = os.path.join(MODELS_PATH, f"{set}-{gran}.txt")
    if not os.path.isfile(save_path):  # no saved embeddings, calculate new.
        # Make sure the target directory exists before training, so the
        # (potentially slow) training run is not lost to a failed save.
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        df = get_data(set, granularity=gran)
        # Materialize each token stream as a tuple: Word2Vec iterates the
        # corpus more than once, which a one-shot generator cannot support.
        sents = df['sentence'].apply(lambda text: tuple(tokenizer(text)))
        w2v = Word2Vec(sents)
        w2v.wv.save_word2vec_format(save_path)
    return KeyedVectors.load_word2vec_format(save_path)



In [117]:
class TQDMCallback(CallbackAny2Vec):
    """Word2Vec training callback that shows per-epoch progress in a tqdm bar."""

    def __init__(self, epochs):
        """Create the progress bar.

        Args:
            epochs: total number of epochs the bar should expect.
        """
        bar_options = dict(total=epochs, desc="Epochs", position=0)
        self.progress_bar = tqdm(**bar_options)

    def on_epoch_end(self, model):
        """Advance the bar by one step when an epoch finishes."""
        self.progress_bar.update(1)

    def on_train_end(self, model):
        """Close the bar once training is over."""
        self.progress_bar.close()

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

In [118]:
# NOTE(review): `tokenize` appears to be lazy (the first pass finishes almost
# instantly in the recorded output), so the real tokenization cost is paid in
# the tuple conversion below.
print("Tokenizing dataset.")
sents = df['sentence'].progress_apply(tokenize)
print("Converting tokenized data to tuples.")
sents = sents.progress_apply(tuple)
print("Generating Word2Vec embeddings.")
# Word2Vec is called without an explicit epoch count, so the bar total must
# match its default — presumably 5; confirm against the installed gensim.
N_EPOCHS = 5
call = TQDMCallback(N_EPOCHS)
w2v = Word2Vec(sents, callbacks=[call]) # should take ~50 sec or less

Tokenizing dataset.


100%|██████████| 11527/11527 [00:00<00:00, 722362.80it/s]

Converting tokenized data to tuples.



100%|██████████| 11527/11527 [00:08<00:00, 1395.37it/s]


Generating Word2Vec embeddings.


Epochs:   0%|          | 0/5 [00:45<?, ?it/s]
Epochs:  20%|██        | 1/5 [00:06<00:27,  6.82s/it]

KeyboardInterrupt: 

In [107]:
# Write the trained word vectors to disk in word2vec format.
# NOTE(review): the path is hard-coded here, while get_embeddings builds its
# cache path from MODELS_PATH — confirm both refer to the same directory.
w2v.wv.save_word2vec_format('models/nyt-coarse.txt')

In [108]:
# Load the saved vectors back as a standalone KeyedVectors object.
wv = KeyedVectors.load_word2vec_format("models/nyt-coarse.txt")