# Word2Vec before Text Clustering - Incomplete

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import random
from multiprocessing import cpu_count

import gensim
import numpy as np
import pandas as pd
import smart_open
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline

In [3]:
%aimport src.clean.clean_data
from src.clean.clean_data import TextCleaner

In [4]:
# !cat -n src/clean/clean_data.py | sed -n -e 18,90p

## About

This is a walkthrough of using `Word2Vec`, partially following the Gensim documentation's example of how to use `Word2Vec` on its own (without clustering). See the **Links** section for a link to that example.

## User Inputs

In [5]:
# Train/test corpus files are read from the test-data directory inside the
# installed gensim package
test_data_dir = os.path.join(gensim.__path__[0], "test", "test_data")
lee_train_file, lee_test_file = (
    os.path.join(test_data_dir, corpus_file)
    for corpus_file in ("lee_background.cor", "lee.cor")
)

# How many of the most similar training docs to show for one inference doc
num_most_similar_docs = 5

In [6]:
# Start from NLTK's English stop words
all_stop_words = set(stopwords.words("english"))
manual_stop_words = [
    # HTML/URL fragments and other non-content tokens
    "http",
    "href",
    "jpg",
    "imgur",
    "com",
    "img",
    "alt",
    "li",
    "ul",
    "ol",
    "src",
    "em",
    "en",
    "rel",
    "nofollow",
    "blockquote",
    "www",
    "png",
    "aedt",
]

# Manually add to stop words (set.update avoids an explicit loop and does not
# leave a stray loop variable behind in the notebook namespace)
all_stop_words.update(manual_stop_words)

In [7]:
def get_tokens_with_gensim_tags(all_tokens, tokens_only=False):
    """Yield each document's tokens, optionally wrapped as a tagged document.

    Parameters
    ----------
    all_tokens : iterable
        One entry (the tokens of a single document) per document.
    tokens_only : bool, default=False
        If True, every entry is yielded unchanged. If False, every entry is
        wrapped in a ``gensim.models.doc2vec.TaggedDocument`` whose tag list
        holds the document's position in ``all_tokens``.

    Yields
    ------
    object
        The raw tokens, or a ``TaggedDocument`` built from them.
    """
    if tokens_only:
        # Inference-style usage: hand the tokens back untouched
        yield from all_tokens
        return

    # Training-style usage: tag every document with its position
    for doc_position, doc_tokens in enumerate(all_tokens):
        yield gensim.models.doc2vec.TaggedDocument(doc_tokens, [doc_position])

In [11]:
def get_all_doc_tokens_word_vectors(model, doc, dim=300):
    """Collect the word vectors of all in-vocabulary tokens of one document.

    Parameters
    ----------
    model : object
        Fitted embedding model exposing its keyed vectors as ``model.wv``
        (``model.wv.key_to_index`` for membership, ``model.wv[token]`` for
        the vector).
    doc : iterable of str
        Tokens of a single document.
    dim : int, default=300
        Length of the zero vector returned when no token of ``doc`` is in the
        vocabulary.

    Returns
    -------
    list
        One vector per in-vocabulary token, in document order (duplicates
        kept). If no token is in the vocabulary, a single-element list holding
        ``np.zeros(dim)``.
    """
    keyed_vectors = model.wv
    # key_to_index is a dict, so membership is a hash lookup; testing against
    # the index_to_key list would scan the whole vocabulary for every token
    vocabulary = keyed_vectors.key_to_index
    wvcs = [keyed_vectors[token] for token in doc if token in vocabulary]
    if not wvcs:
        # Fall back to a zero vector so downstream averaging still works
        wvcs = [np.zeros(dim)]
    return wvcs

In [25]:
def get_avg_doc_word(corpus, model, data, dim=300, concat_with_data=False):
    """Average the word vectors of every document in ``corpus``.

    Each document is handed to ``get_all_doc_tokens_word_vectors`` in a
    joblib worker (one job per CPU, ``multiprocessing`` backend) and the
    vectors returned for it are averaged element-wise.

    Parameters
    ----------
    corpus : iterable
        Documents, each one an iterable of tokens.
    model : object
        Embedding model forwarded to ``get_all_doc_tokens_word_vectors``.
    data : pandas object
        Supplies the index of the result (``data.index``) and the expected
        number of rows (``data.shape[0]``); also the left-hand side of the
        concatenation when ``concat_with_data`` is True.
    dim : int, default=300
        Expected length of each averaged document vector.
    concat_with_data : bool, default=False
        If True, return ``data`` with the document vectors appended as
        columns; otherwise return only the document vectors.

    Returns
    -------
    pandas.DataFrame
        One row per document, indexed like ``data``.

    Raises
    ------
    AssertionError
        If the averaged vectors do not have shape ``(data.shape[0], dim)``.
    """
    n_workers = cpu_count()
    run_parallel = Parallel(n_jobs=n_workers, backend="multiprocessing")
    per_doc_vectors = run_parallel(
        delayed(get_all_doc_tokens_word_vectors)(model, doc_tokens, dim)
        for doc_tokens in corpus
    )

    # Collapse each document's list of word vectors into a single mean vector
    doc_means = np.array(
        [np.mean(doc_vectors, axis=0) for doc_vectors in per_doc_vectors]
    )
    df_doc_means = pd.DataFrame(doc_means, index=data.index)
    assert doc_means.shape == (data.shape[0], dim)

    if not concat_with_data:
        return df_doc_means
    return pd.concat([data, df_doc_means], axis=1)

In [22]:
class WordEmbeddingsVectorizer(BaseEstimator, TransformerMixin):
    """Vectorize tokenized documents by averaging their Word2Vec embeddings.

    ``fit`` trains a ``gensim.models.Word2Vec`` model on the tokenized
    documents; ``transform`` maps every document to the mean of the vectors of
    its in-vocabulary tokens (delegating to ``get_avg_doc_word``).

    Parameters
    ----------
    data : object
        Stored on the instance as ``data``. It is not read by ``fit`` or
        ``transform``; the parameter is kept so existing callers keep working.
    dim : int, default=300
        Passed to Word2Vec as ``vector_size`` and used as the expected length
        of every document vector.
    concat_with_data : bool, default=False
        If True, ``transform`` returns its input concatenated column-wise with
        the document vectors; otherwise only the document vectors.

    Attributes
    ----------
    model : gensim.models.Word2Vec
        The trained model; set by ``fit``.
    """

    def __init__(
        self,
        data,
        dim=300,
        concat_with_data=False,
    ):
        self.dim = dim
        self.data = data
        self.concat_with_data = concat_with_data

    def fit(self, X, y=None):
        """Train a Word2Vec model on the tokenized documents in ``X``.

        Parameters
        ----------
        X : object with ``tolist()``
            Tokenized documents; ``X.tolist()`` is passed to Word2Vec as
            ``sentences``. Presumably a pandas Series of token lists.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Passing ``sentences`` to the constructor already builds the
        # vocabulary and trains the model, so a second
        # ``build_vocab(..., update=True)`` pass over the same corpus would
        # add no new words and is not needed.
        self.model = gensim.models.Word2Vec(
            sentences=X.tolist(), vector_size=self.dim, workers=1
        )
        return self

    def transform(self, X):
        """Return the averaged word vectors of every document in ``X``.

        Parameters
        ----------
        X : object with ``tolist()``, ``index`` and ``shape``
            Tokenized documents; also supplies the index of the result.

        Returns
        -------
        pandas.DataFrame
            One row per document; prefixed with the columns of ``X`` when
            ``concat_with_data`` is True.
        """
        return get_avg_doc_word(
            X.tolist(), self.model, X, self.dim, self.concat_with_data
        )

## Get Data

In [8]:
%%time
with smart_open.open(lee_train_file, encoding="iso-8859-1") as f:
    df_train = pd.DataFrame([line for i, line in enumerate(f)], columns=["text"])
with smart_open.open(lee_test_file, encoding="iso-8859-1") as f:
    df_test = pd.DataFrame([line for i, line in enumerate(f)], columns=["text"])

CPU times: user 1.22 ms, sys: 1.74 ms, total: 2.96 ms
Wall time: 2.39 ms


## Data Cleaning

Define the text cleaning pipeline

In [None]:
# Single-step pipeline wrapping the project's TextCleaner
# (imported from src/clean/clean_data.py).
# NOTE(review): the positional arguments are opaque from here — consider
# passing them by keyword once the parameter names are confirmed.
pipe_clean = Pipeline(
    steps=[
        ("clean", TextCleaner("text", False, True, [], 2, 15)),
    ]
)

Clean the training data

In [14]:
# Fit the cleaning pipeline on the training frame and apply it in one step;
# the bare last expression displays the cleaned result
train_clean = pipe_clean.fit_transform(df_train)
train_clean

0      [hundreds, of, people, have, been, forced, to,...
1      [indian, security, forces, have, shot, dead, e...
2      [the, national, road, toll, for, the, christma...
3      [argentina, political, and, economic, crisis, ...
4      [six, midwives, have, been, suspended, at, wol...
                             ...                        
295    [team, of, australian, and, israeli, scientist...
296    [today, is, world, aids, day, and, the, latest...
297    [the, federal, national, party, has, rejected,...
298    [university, of, canberra, academic, proposal,...
299    [australia, will, take, on, france, in, the, d...
Name: text, Length: 300, dtype: object

Clean the testing data

In [27]:
# Apply the cleaning pipeline that was already fitted on the training data;
# the test set must only be transformed, never used for fitting
test_clean = pipe_clean.transform(df_test)
test_clean.head()

0    [the, national, executive, of, the, strife, to...
1    [cash, strapped, financial, services, group, a...
2    [the, united, states, government, has, said, i...
3    [radical, armed, islamist, group, with, ties, ...
4    [washington, has, sharply, rebuked, russia, ov...
Name: text, dtype: object

## Feature Engineering

Define the text vectorization pipeline

In [23]:
# Single-step pipeline: average 300-dimensional Word2Vec embeddings per document
pipe = Pipeline(
    steps=[
        ("vec", WordEmbeddingsVectorizer(data=df_train, dim=300)),
    ]
)

Vectorize the cleaned training data

In [26]:
%%time
# Fit the vectorization pipeline on the cleaned training tokens and transform
# them in the same call; the bare last expression displays the result
train_wordvectors = pipe.fit_transform(train_clean)
train_wordvectors

CPU times: user 802 ms, sys: 130 ms, total: 932 ms
Wall time: 1.16 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.015379,0.132064,0.132653,0.249821,0.050705,-0.227408,0.094230,0.518622,0.140756,-0.073674,...,0.055642,0.410941,0.214852,-0.015370,0.435837,0.381723,0.027690,-0.195002,0.390147,0.001138
1,0.013827,0.120411,0.120946,0.226582,0.046301,-0.207441,0.085038,0.471790,0.128211,-0.067391,...,0.050456,0.373905,0.195512,-0.014384,0.395954,0.347230,0.025046,-0.177048,0.353886,0.000755
2,0.014650,0.123485,0.125180,0.235291,0.047884,-0.212599,0.088169,0.486529,0.131155,-0.068046,...,0.052187,0.385863,0.201232,-0.014362,0.408062,0.357890,0.026131,-0.183148,0.366278,0.001784
3,0.014755,0.126605,0.125715,0.236070,0.048722,-0.215667,0.088734,0.491012,0.133693,-0.070326,...,0.052581,0.389804,0.203781,-0.014380,0.412494,0.361495,0.026240,-0.185311,0.368758,0.000643
4,0.015124,0.132882,0.132400,0.247938,0.050812,-0.227006,0.093881,0.517039,0.140871,-0.074334,...,0.055102,0.409226,0.214100,-0.015087,0.433899,0.380943,0.027579,-0.194678,0.387741,0.000959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.015124,0.133033,0.131424,0.246837,0.050581,-0.226320,0.093243,0.515600,0.140196,-0.074045,...,0.055544,0.408626,0.213879,-0.015003,0.432522,0.379323,0.027245,-0.194437,0.386387,0.000758
296,0.014759,0.125671,0.126451,0.237189,0.047705,-0.215873,0.089730,0.493708,0.133116,-0.070558,...,0.052804,0.390924,0.205025,-0.014976,0.414203,0.362946,0.026128,-0.184863,0.370421,0.000487
297,0.016204,0.137192,0.136397,0.255157,0.052206,-0.233958,0.096799,0.532737,0.144682,-0.076510,...,0.057431,0.421997,0.220883,-0.015740,0.446918,0.391884,0.028479,-0.200539,0.399464,0.000705
298,0.015782,0.130751,0.131227,0.245762,0.049983,-0.224287,0.093026,0.511580,0.138441,-0.073207,...,0.054973,0.405275,0.212553,-0.015480,0.429807,0.376785,0.027188,-0.192400,0.384460,0.000670


Vectorize the cleaned testing data

In [28]:
%%time
test_wordvectors = pipe.fit_transform(test_clean)
test_wordvectors.head()

CPU times: user 94.5 ms, sys: 61.5 ms, total: 156 ms
Wall time: 165 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.002767,0.005383,-0.001078,0.002227,-1.2e-05,-0.010178,0.00798,0.018911,-0.001309,-0.004781,...,0.005782,0.014351,0.004792,0.003489,0.012449,0.012113,0.001945,-0.004405,0.008162,-0.008821
1,-0.003212,0.005712,-0.001433,0.001637,-0.000171,-0.010843,0.007856,0.01848,-0.000665,-0.00506,...,0.005866,0.013696,0.004158,0.003206,0.011131,0.011945,0.001668,-0.004929,0.007838,-0.008224
2,-0.002954,0.005289,-0.001137,0.001585,-2.1e-05,-0.009995,0.008089,0.017765,-0.000918,-0.004422,...,0.005356,0.013301,0.00443,0.002472,0.011226,0.011633,0.001887,-0.004263,0.007824,-0.008277
3,-0.00248,0.005009,-0.001367,0.002093,0.000117,-0.010719,0.00756,0.018184,-0.001482,-0.004605,...,0.005128,0.013681,0.004955,0.003049,0.012078,0.012277,0.001907,-0.003974,0.008276,-0.009079
4,-0.003054,0.00482,-0.000597,0.001728,-0.00049,-0.011082,0.008338,0.019956,-0.000976,-0.004777,...,0.005363,0.014651,0.005819,0.002919,0.012554,0.012859,0.00183,-0.003707,0.008887,-0.008866


## Links

1. [Gensim Docs Word2Vec tutorial](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#word2vec-demo)
2. [Analytics Vidhya - use KMeans after Word2Vec](https://medium.com/analytics-vidhya/topic-modelling-using-word-embeddings-and-latent-dirichlet-allocation-3494778307bc)
3. [My Space News project Topic Modeling notebook](https://github.com/edesz/miscellaneous/blob/master/links/nlp-topic-modeling/8_gensim_coherence_nlp_trials_v2.ipynb)
   - old version with Word2Vec from SpaCy