GH-61: added visualization for word-level and character-level embeddings

t-SNE visualization makes sense at the word level. For bidirectional character embeddings it does not make much sense.
flairNLP · Oct 4, 2018 · 3654ab7 · 3654ab7
1 parent 4a389ac
commit 3654ab7
Show file tree

Hide file tree

Showing 5 changed files with 223 additions and 38 deletions.
diff --git a/.gitignore b/.gitignore
@@ -104,3 +104,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# data directory
+resources/data
diff --git a/flair/visual/__init__.py b/flair/visual/__init__.py
@@ -1 +1 @@
-from .tsne import tSNE
+from .manifold import tSNE, uMap, show, prepare_word_embeddings, prepare_char_embeddings, word_contexts, char_contexts
diff --git a/flair/visual/manifold.py b/flair/visual/manifold.py
@@ -0,0 +1,133 @@
+from sklearn.manifold import TSNE
+from umap import UMAP
+import tqdm
+import numpy
+import blessings
+
+
+t = blessings.Terminal()
+
+
+def prepare_word_embeddings(embeddings, sentences):
+    X = []
+
+    print('computing embeddings')
+    for sentence in tqdm.tqdm(sentences):
+        embeddings.embed(sentence)
+
+        for i, token in enumerate(sentence):
+            X.append(token.embedding.detach().numpy()[None, :])
+
+    X = numpy.concatenate(X, 0)
+
+    return X
+
+
+def word_contexts(sentences):
+    contexts = []
+
+    for sentence in sentences:
+
+        strs = [x.text for x in sentence.tokens]
+
+        for i, token in enumerate(strs):
+            prop = '<b><font color="red"> {token} </font></b>'.format(
+                token=token)
+
+            prop = ' '.join(strs[max(i - 4, 0):i]) + prop
+            prop = prop + ' '.join(strs[i + 1:min(len(strs), i + 5)])
+
+            contexts.append('<p>' + prop + '</p>')
+
+    return contexts
+
+
+def prepare_char_embeddings(embeddings, sentences):
+
+    X = []
+
+    print('computing embeddings')
+    for sentence in tqdm.tqdm(sentences):
+
+        sentence = ' '.join([x.text for x in sentence])
+
+        hidden = embeddings.lm.get_representation(sentence)
+        X.append(hidden.squeeze().detach().numpy())
+
+    X = numpy.concatenate(X, 0)
+
+    return X
+
+
+def char_contexts(sentences):
+
+    contexts = []
+
+    for sentence in sentences:
+        sentence = ' '.join([token.text for token in sentence])
+
+        for i, char in enumerate(sentence):
+
+            context = '<span style="background-color: yellow"><b>{}</b></span>'.format(char)
+            context = ''.join(sentence[max(i - 30, 0):i]) + context
+            context = context + ''.join(sentence[i + 1:min(len(sentence), i + 30)])
+
+            contexts.append(context)
+
+    return contexts
+
+
+class _Transform:
+    def __init__(self):
+        pass
+
+    def fit(self, X):
+        return self.transform.fit_transform(X)
+
+
+class tSNE(_Transform):
+    def __init__(self):
+
+        super().__init__()
+
+        self.transform = \
+            TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
+
+
+class uMap(_Transform):
+    def __init__(self):
+
+        super().__init__()
+
+        self.transform = UMAP(
+            n_neighbors = 5,
+            min_dist = 0.3,
+            metric = 'correlation',
+        )
+
+
+def show(X, contexts):
+    import matplotlib.pyplot
+    import mpld3
+
+    fig, ax = matplotlib.pyplot.subplots()
+
+    ax.grid(True, alpha=0.3)
+
+    points = ax.plot(X[:, 0], X[:, 1], 'o', color='b',
+                     mec='k', ms=5, mew=1, alpha=.6)
+
+    ax.set_xlabel('x')
+    ax.set_ylabel('y')
+    ax.set_title('Hover mouse to reveal context', size=20)
+
+    tooltip = mpld3.plugins.PointHTMLTooltip(
+        points[0],
+        contexts,
+        voffset=10,
+        hoffset=10
+    )
+
+    mpld3.plugins.connect(fig, tooltip)
+
+    mpld3.show()
diff --git a/flair/visual/tsne.py b/flair/visual/tsne.py
diff --git a/tests/test_visual.py b/tests/test_visual.py
@@ -1,12 +1,12 @@
-from flair.visual import tSNE
+from flair.visual import *
 from flair.data import Sentence
 from flair.embeddings import CharLMEmbeddings, StackedEmbeddings
 import unittest
 import numpy
 
 
-class TesttSNE(unittest.TestCase):
-    def test(self):
+class Test(unittest.TestCase):
+    def test_prepare(self):
         with open('resources/data/snippet.txt') as f:
             sentences = [x for x in f.read().split('\n') if x]
 
@@ -19,11 +19,91 @@ def test(self):
             [charlm_embedding_backward, charlm_embedding_forward]
         )
 
-        trans_ = tSNE(embeddings)
+        X = prepare_word_embeddings(embeddings, sentences)
+        contexts = word_contexts(sentences)
+
+        numpy.save('resources/data/embeddings', X)
+
+        with open('resources/data/contexts.txt', 'w') as f:
+            f.write('\n'.join(contexts))
+
+    def test_tSNE(self):
+
+        X = numpy.load('resources/data/embeddings.npy')
+        trans_ = tSNE()
+        reduced = trans_.fit(X)
+
+        numpy.save('resources/data/tsne', reduced)
+
+    def test__prepare_char(self):
+
+        with open('resources/data/snippet.txt') as f:
+            sentences = [x for x in f.read().split('\n') if x]
+
+        sentences = [Sentence(x) for x in sentences[:100]]
+
+        embeddings = CharLMEmbeddings('news-forward')
+
+        X_forward = prepare_char_embeddings(embeddings, sentences)
+
+        embeddings = CharLMEmbeddings('news-backward')
+
+        X_backward = prepare_char_embeddings(embeddings, sentences)
+
+        X = numpy.concatenate([X_forward, X_backward], axis=1)
+
+        numpy.save('resources/data/char_embeddings', X)
+
+    def test_tSNE_char(self):
+
+        X = numpy.load('resources/data/char_embeddings.npy')
+        trans_ = tSNE()
+        reduced = trans_.fit(X)
+
+        numpy.save('resources/data/char_tsne', reduced)
+
+    def test_char_contexts(self):
+
+        with open('resources/data/snippet.txt') as f:
+            sentences = [x for x in f.read().split('\n') if x]
+
+        sentences = [Sentence(x) for x in sentences[:100]]
+
+        contexts = char_contexts(sentences)
+
+        with open('resources/data/char_contexts.txt', 'w') as f:
+            f.write('\n'.join(contexts))
+
+
+class TestuMap(unittest.TestCase):
+    def test(self):
+
+        X = numpy.load('resources/data/embeddings.npy')
+
+        reduced = uMap().fit(X)
+
+        numpy.save('resources/data/umap', reduced)
+
+
+class Test_show(unittest.TestCase):
+    def test_word(self):
+
+        reduced = numpy.load('resources/data/umap.npy')
+
+        with open('resources/data/contexts.txt') as f:
+            contexts = f.read().split('\n')
+
+        show(reduced, contexts)
+
+    def test_char(self):
+
+        reduced = numpy.load('resources/data/char_tsne.npy')
+
+        with open('resources/data/char_contexts.txt') as f:
+            contexts = f.read().split('\n')
 
-        embeddings = trans_.fit(sentences)
+        show(reduced, contexts)
 
-        numpy.save(embeddings, 'resources/data/embeddings.npy')
 
 
 if __name__ == '__main__':