In [13]:
import textacy.preprocessing as p
import numpy as np
from tqdm import tqdm_notebook
import nltk
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import sent_tokenize
import spacy
import pandas as pd

I've added some stuff to the clean_sentence function; since I'll be scraping e-mails---a type of text prone to having typos, phone numbers, and other such miscellanea---I'll need to normalize phone numbers, e-mail addresses, and the like.

Pay attention those of you wanting to analyze tweets! Textacy also has functions for the automatic handling of things like emojis and hashtags.

In [7]:

def clean_sentence(sent):
    clean = p.normalize.normalize_whitespace(sent.lower())
    clean = p.remove.remove_punctuation(clean)
    clean = p.replace.replace_emails(clean)
    clean = p.replace.replace_phone_numbers(clean)
    clean = p.replace.replace_urls(clean)
    clean = p.replace.replace_numbers(clean)
    return(clean)

Let's look at our dataset, the 20 newsgroups dataset. Per the SKLearn team, the 20 newsgroups dataset comprises around 18,000 newsgroups posts on 20 topics split in two subsets, one for training and one for testing. 

The dataset is for text-classification purposes, the goal being to predict the topic given text alone. I'm using it here because it's document separated, meaning I can meaningfully train word vectors for models like tf-idf.

In [8]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers'))

dataset.data[0]

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [9]:
X = []

for mail in tqdm_notebook(dataset.data):
    cleaned = []
    sents = sent_tokenize(mail)
    for sent in sents:
        cleaned.append(clean_sentence(sent))
    X.append(". ".join([sent for sent in cleaned]))
    

HBox(children=(IntProgress(value=0, max=11314), HTML(value='')))




Let's now train our word/document embeddings using our traditional models 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [11]:
count = CountVectorizer()
count_embeddings = count.fit_transform(X).T
tfidf = TfidfVectorizer()
tfidf_embeddings = tfidf.fit_transform(X).T
svd = TruncatedSVD(n_components=300)
lsa_embeddings = svd.fit_transform(tfidf_embeddings)

In [12]:
print(count_embeddings.shape)
print(tfidf_embeddings.shape)
print(lsa_embeddings.shape)
print(svd.explained_variance_ratio_.sum())
print(lsa_embeddings)

(103447, 11314)
(103447, 11314)
(103447, 300)
0.3008538127181256
[[ 1.12526110e-05  4.67549721e-05  6.23684888e-06 ... -8.68203222e-06
   1.40763417e-06  2.43330462e-06]
 [ 1.12526110e-05  4.67549721e-05  6.23684885e-06 ... -8.68203223e-06
   1.40763416e-06  2.43330459e-06]
 [ 1.12526110e-05  4.67549721e-05  6.23684884e-06 ... -8.68203223e-06
   1.40763417e-06  2.43330462e-06]
 ...
 [ 1.77157514e-03 -6.21367483e-04 -2.41682244e-03 ...  3.78105472e-04
  -2.37568416e-03  6.77284095e-05]
 [ 1.13762380e-03 -3.07518129e-04 -1.78660085e-04 ... -3.34097545e-03
   1.38152804e-03  1.34102966e-03]
 [ 1.77157514e-03 -6.21367483e-04 -2.41682244e-03 ...  3.78105472e-04
  -2.37568416e-03  6.77284095e-05]]


Great. Note how we transposed the tfidf and count vectors. That was so we index directly into words rather than documents.

Our word vectors live in 11,314D space, the number of documents there are. Also note that though we reduce dimension to 300 (roughly equal 1/40th the original dimension) we account for 30% of the information.

Now, I'm going to collect our individual documents into one list of sentences, getting them ready for embedding by our more modern methods.

In [360]:
X_combined = []
for doc in X:
    new_sents = sent_tokenize(doc)
    X_combined += new_sents

In [361]:
print(len(X))
print(len(X_combined))

11314
165563


In [362]:
with open("./corpus.txt", "w") as fout:
    for sent in X_combined:
        fout.write(sent + "\n")

To get our GloVe embeddings, we're going to use a python wrapper around the original C code. It's very easy to use; feel free to reuse this code (changing the corpus, of course) if you are going to collect GloVe embeddings.

Also, this is going to take a couple of minutes, so let's use this opportunity to hear your elevator pitches for final projects.

In [363]:
import glove_pywrapper

CORPUS = "./corpus.txt"
glove = glove_pywrapper.GloveWrapper(CORPUS, "X_combined", vector_size=300, window_size=10)
#prepare vocabulary count
glove.vocab_count()
#prepare co-occurrence matrix
glove.cooccur()
#reshuffle
glove.shuffle()
#glove train
glove.glove()

vocab count
build/vocab_count -max-vocab 100000-min-count 5 -versbose 2 < ./corpus.txt > /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_vocab.txt
coocurr matrix
build/cooccur -memory 4.0 -vocab-file /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_vocab.txt -versbose 2 -window-size 10 -symmetric 1 -overflow-file tempoverflow < ./corpus.txt > /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_cooccurrence.bin
coocurr matrix shuffle
build/shuffle  -memory 4.0 -versbose 2 < /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_cooccurrence.bin > /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_cooccurrence.shuf.bin
train glove
build/glove  -save-file /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_vectors -threads 8 -input-file /home/dan/Geometric_Models/자료/demo/November/train_dir/glove/X_combined_cooccurrence.shuf.bin -x-max 10 -iter 25 -vector-size 300 -binary 

Now we're going to use a very cool package called gensim which has many uses, not least among them a very good implementation of word2vec. Here, we're going to use it to convert our GloVe embeddings output (which is a raw .txt file) to something manageable in Python. Again, this code should prove helpful if you're going to be using GloVe for your HW/project(s).

In [364]:
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = "./train_dir/glove/X_combined_vectors.txt"
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

glove = KeyedVectors.load_word2vec_format(tmp_file)

In [365]:
glove["the"]

array([ 0.949128,  0.200447, -0.212956,  0.86906 ,  0.437179, -0.16734 ,
        0.035951, -0.038623, -0.125448,  0.588126, -0.589043,  0.524886,
       -1.039198,  0.088763,  0.039482,  0.230936, -0.551745, -0.146105,
       -0.364771,  0.04054 , -0.361246, -0.178435,  0.600075,  0.257776,
        0.17523 ,  0.514284, -0.387354,  0.472283, -0.303998,  0.31414 ,
        0.328614, -1.277768,  0.032194, -0.40022 ,  0.242385, -0.341265,
        0.182074, -0.375374, -1.162917,  0.763035, -0.191276,  0.943603,
        0.571229, -0.301747, -1.158828,  0.532953, -0.378093,  0.57381 ,
       -0.427321, -0.295825,  0.111481,  0.37496 ,  0.139503,  1.021863,
        0.458283,  0.253207, -0.454079, -0.136772, -0.455405, -0.0052  ,
        0.085712, -0.02069 ,  0.426193, -0.66345 ,  0.220014, -0.75807 ,
       -0.189003, -0.378212,  0.455509, -0.027778, -0.503643,  0.819416,
        0.812403,  0.466262, -0.717948, -0.166966,  0.216494, -0.144243,
        0.696485, -0.489462, -0.273684,  0.320622, 

Now we'll use the same package to train our W2V embeddings. The input to a w2v model is a list of sentences, where each sentence is a list of word tokens. Thus, it's a list of lists.

In [14]:
from gensim.models import Word2Vec

In [367]:
w2v = Word2Vec([sent.split() for sent in X_combined], size=300, window=5, min_count=1, workers=8)

In [368]:
w2v.train([sent.split() for sent in X_combined], total_examples=model.corpus_count, epochs=15)

(37769903, 49734030)

In [369]:
print(len(w2v.wv["the"]))
print(len(glove["the"]))
print(len(count_embeddings[count.vocabulary_["the"]].toarray()[0]))
print(len(tfidf_embeddings[tfidf.vocabulary_["the"]].toarray()[0]))
print(len(lsa_embeddings[tfidf.vocabulary_["the"]]))

300
300
11314
11314
300


Now, we have roughly 100,000 word vocabulary, far too much if we want to do visualizations, or clustering, or anything exploratory.

As such, we're going to discard the least common 99k or so words. Otherwise said, we'll keep around just the 1,000 or so most common words. 

In [370]:
from collections import Counter

In [371]:
c = Counter()

In [372]:
for mail in X:
    c.update(word for word in mail.split())

Note what I'm doing here. In addition to keeping only the most common words, I'm also taking their part-of-speech (POS). The POSs here serve to partition our word embeddings, a partition which we can then try to recreate by clustering.

This is an example of the kind of linguistic test I mentioned in the syllabus.

In [373]:
most_common = dict([(nltk.pos_tag([w], tagset="universal")[0]) for w, i in c.most_common(1000)])
NUM_TAGS = len(np.unique(list(most_common.values())))

In [374]:
most_common

{'.': '.',
 'the': 'DET',
 '_NUMBER_': 'NOUN',
 'to': 'PRT',
 'ax>': 'NOUN',
 'of': 'ADP',
 'a': 'DET',
 'and': 'CONJ',
 'i': 'NOUN',
 'in': 'ADP',
 'is': 'VERB',
 'that': 'ADP',
 'it': 'PRON',
 '>': 'NOUN',
 'for': 'ADP',
 'you': 'PRON',
 's': 'NOUN',
 'this': 'DET',
 'on': 'ADP',
 'be': 'VERB',
 'are': 'VERB',
 'have': 'VERB',
 'not': 'ADV',
 'with': 'ADP',
 't': 'NOUN',
 'as': 'ADP',
 'or': 'CONJ',
 'but': 'CONJ',
 'if': 'ADP',
 'was': 'VERB',
 'they': 'PRON',
 'can': 'VERB',
 'from': 'ADP',
 'by': 'ADP',
 'at': 'ADP',
 'an': 'DET',
 'm': 'NOUN',
 'there': 'ADV',
 'what': 'PRON',
 'will': 'VERB',
 'all': 'DET',
 'one': 'NUM',
 'would': 'VERB',
 'my': 'PRON',
 'we': 'PRON',
 'he': 'PRON',
 'do': 'VERB',
 'writes': 'NOUN',
 'about': 'ADP',
 'so': 'ADV',
 'has': 'VERB',
 'x': 'NOUN',
 'your': 'PRON',
 'article': 'NOUN',
 'no': 'DET',
 'edu': 'NOUN',
 '|>': 'NOUN',
 'any': 'DET',
 '`': '.',
 'some': 'DET',
 'me': 'PRON',
 'who': 'PRON',
 'which': 'DET',
 'out': 'ADP',
 'people': 'NOUN',

In [375]:
mc_w2v = pd.DataFrame([(word, most_common[word], w2v.wv[word]) for word in most_common if word in count.vocabulary_], 
                      columns=["Word", "POS", "Vector"])
mc_glove = pd.DataFrame([(word, most_common[word], glove[word]) for word in most_common if word in count.vocabulary_], 
                        columns=["Word", "POS", "Vector"])
mc_counts = pd.DataFrame([(word, most_common[word], count_embeddings[count.vocabulary_[word]].toarray()[0]) 
                          for word in most_common if word in count.vocabulary_], columns=["Word", "POS", "Vector"])
mc_tfidf = pd.DataFrame([(word, most_common[word], tfidf_embeddings[count.vocabulary_[word]].toarray()[0]) 
                          for word in most_common if word in count.vocabulary_], columns=["Word", "POS", "Vector"])
mc_lsa = pd.DataFrame([(word, most_common[word], lsa_embeddings[count.vocabulary_[word]]) 
                          for word in most_common if word in count.vocabulary_], columns=["Word", "POS", "Vector"])

Let's look at an example of our new dataframe.

In [376]:
mc_w2v

Unnamed: 0,Word,POS,Vector
0,the,DET,"[-1.388606, 0.06583077, -0.13645032, -1.863539..."
1,to,PRT,"[-0.6417752, 1.9310826, 0.69087327, -1.1071494..."
2,of,ADP,"[0.42809805, -0.055565987, 0.10761638, 0.07476..."
3,and,CONJ,"[-1.2817402, 1.5832126, 0.4680638, -0.03984642..."
4,in,ADP,"[-1.2154648, 1.3424835, 0.38822988, -1.0937023..."
5,is,VERB,"[-0.660043, 0.89603996, 0.04622833, -0.2136076..."
6,that,ADP,"[1.9276302, 2.8953228, 0.47525463, 0.42097518,..."
7,it,PRON,"[-0.5596454, 0.16515075, 1.5768685, -1.1366521..."
8,for,ADP,"[0.2474737, 1.208332, 0.09670697, -0.6879694, ..."
9,you,PRON,"[0.2568737, 2.237701, -1.646235, -0.6529777, 1..."


And if we want to reclaim just the word-embedding matrices...

In [377]:
np.stack(mc_tfidf.Vector.values, axis=0).shape

(931, 11314)

Let's test our clusterings...

In [325]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score as ari

In [378]:
AC_w2v = AgglomerativeClustering(n_clusters=NUM_TAGS)
AC_w2v.fit(np.stack(mc_w2v.Vector.values, axis=0))
ari(AC_w2v.labels_, mc_w2v.POS)

0.0063597023157344265

In [379]:
AC_glove = AgglomerativeClustering(n_clusters=NUM_TAGS)
AC_glove.fit(np.stack(mc_glove.Vector.values, axis=0))
ari(AC_glove.labels_, mc_glove.POS)

0.028219603537472993

In [380]:
AC_tfidf = AgglomerativeClustering(n_clusters=NUM_TAGS)
AC_tfidf.fit(np.stack(mc_tfidf.Vector.values, axis=0))
ari(AC_tfidf.labels_, mc_tfidf.POS)

0.17998482982370506

In [381]:
AC_counts = AgglomerativeClustering(n_clusters=NUM_TAGS)
AC_counts.fit(np.stack(mc_counts.Vector.values, axis=0))
ari(AC_counts.labels_, mc_counts.POS)

0.2032500495348084

In [382]:
AC_lsa = AgglomerativeClustering(n_clusters=NUM_TAGS)
AC_lsa.fit(np.stack(mc_lsa.Vector.values, axis=0))
ari(AC_lsa.labels_, mc_lsa.POS)

0.1902577385422497

We can examine clusters with code like the following:

In [340]:
lsa_answers = [item for item in zip(mc_lsa["Word"], mc_lsa["POS"], AC_lsa.labels_)]

In [386]:
[item for item in lsa_answers if item[2] == 5]

[('for', 'ADP', 5),
 ('this', 'DET', 5),
 ('on', 'ADP', 5),
 ('be', 'VERB', 5),
 ('are', 'VERB', 5),
 ('have', 'VERB', 5),
 ('not', 'ADV', 5),
 ('with', 'ADP', 5),
 ('as', 'ADP', 5),
 ('or', 'CONJ', 5),
 ('if', 'ADP', 5),
 ('they', 'PRON', 5),
 ('can', 'VERB', 5),
 ('there', 'ADV', 5),
 ('what', 'PRON', 5),
 ('will', 'VERB', 5),
 ('would', 'VERB', 5),
 ('my', 'PRON', 5),
 ('do', 'VERB', 5),
 ('your', 'PRON', 5),
 ('any', 'DET', 5)]

NOW! Let me ask you, why is it that our global vectors did so well here? How can they (seemingly) capture more syntactic information than something like w2v, which uses only local information? Scroll down for (my) answer.





(It has to do with frequencies. It will more often than not nail things like prepositions and any function word, since their counts will be very high, and thus their geometry different. These words will be further out on the fringe of our embeddings. The question remains, will they perform as well on the intrinsic evaluation tests we discussed before? That's for you to find out...)

Now we can visualize our embeddings

In [387]:
import plotly.figure_factory as ff

In [388]:
fig = ff.create_dendrogram(np.stack(mc_glove.Vector.values, axis=0), labels=mc_glove["Word"].values)
fig.update_layout(width=1200, height=1200)
fig.show()

We can try another clustering algorithm and see if that changes our results

In [389]:
from sklearn.cluster import KMeans
KM_w2v = KMeans(n_clusters=NUM_TAGS)
KM_w2v.fit(np.stack(mc_w2v.Vector.values, axis=0))
ari(KM_w2v.labels_, mc_w2v.POS)

0.04554279490607367

In [390]:
KM_glove = KMeans(n_clusters=NUM_TAGS)
KM_glove.fit(np.stack(mc_glove.Vector.values, axis=0))
ari(KM_glove.labels_, mc_glove.POS)

0.02616784276543452

In [391]:
KM_counts = KMeans(n_clusters=NUM_TAGS)
KM_counts.fit(np.stack(mc_counts.Vector.values, axis=0))
ari(KM_counts.labels_, mc_counts.POS)

0.19024590215540035

In [392]:
KM_tfidf = KMeans(n_clusters=NUM_TAGS)
KM_tfidf.fit(np.stack(mc_tfidf.Vector.values, axis=0))
ari(KM_tfidf.labels_, mc_tfidf.POS)

0.13287540707315038

In [393]:
KM_lsa = KMeans(n_clusters=NUM_TAGS)
KM_lsa.fit(np.stack(mc_lsa.Vector.values, axis=0))
ari(KM_lsa.labels_, mc_lsa.POS)

0.19768047073243047

We can now perform DR and view just the embeddings with no clustering...

In [394]:
from umap import UMAP
umap = UMAP(n_components=2)

In [395]:
umap_w2v = umap.fit_transform(np.stack(mc_w2v.Vector.values, axis=0))
umap_glove = umap.fit_transform(np.stack(mc_glove.Vector.values, axis=0))
umap_counts = umap.fit_transform(np.stack(mc_counts.Vector.values, axis=0))
umap_tfidf = umap.fit_transform(np.stack(mc_tfidf.Vector.values, axis=0))
umap_lsa = umap.fit_transform(np.stack(mc_lsa.Vector.values, axis=0))

In [396]:
import plotly.express as px

def view_2D(points):
    temp = pd.DataFrame(points, columns=["x", "y"])
    temp["Word"] = mc_w2v.Word.values
    temp["POS"] = mc_w2v.POS.values
    fig = px.scatter(temp, x="x", y="y", text="Word", color="POS")
    fig.update_traces(textposition="top center")
    fig.show()

In [397]:
view_2D(umap_lsa)

Above, we ran our tests for POS clustering on w2v with window=5, but we've discussed before that such paradigmatic information is better captured by small window sizes. Let's therefore try w2v with window-size 1.

In [398]:
small_window = Word2Vec([sent.split() for sent in X_combined], size=300, window=1, min_count=1, workers=8)
small_window.train([sent.split() for sent in X_combined], total_examples=model.corpus_count, epochs=15)
mc_small_window = pd.DataFrame([(word, most_common[word], small_window.wv[word]) for word in most_common if word in count.vocabulary_], 
                      columns=["Word", "POS", "Vector"])

AC_small_window = AgglomerativeClustering(n_clusters=NUM_TAGS)
AC_small_window.fit(np.stack(mc_small_window.Vector.values, axis=0))
print(ari(AC_small_window.labels_, mc_small_window.POS))

KM_small_window = KMeans(n_clusters=NUM_TAGS)
KM_small_window.fit(np.stack(mc_small_window.Vector.values, axis=0))
print(ari(KM_small_window.labels_, mc_small_window.POS))

0.09865210839216504
0.08797440937132575
