GH-61: added visualization and word/char visual
tSNE makes sense on the word level; for bidirectional character embeddings it doesn't make much sense.
Duncan Blythe authored and tabergma committed Oct 4, 2018
1 parent 4a389ac commit 3654ab7
Showing 5 changed files with 223 additions and 38 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -104,3 +104,6 @@ venv.bak/

# mypy
.mypy_cache/

# data directory
resources/data
2 changes: 1 addition & 1 deletion flair/visual/__init__.py
@@ -1 +1 @@
-from .tsne import tSNE
from .manifold import tSNE, uMap, show, prepare_word_embeddings, prepare_char_embeddings, word_contexts, char_contexts
133 changes: 133 additions & 0 deletions flair/visual/manifold.py
@@ -0,0 +1,133 @@
from sklearn.manifold import TSNE
from umap import UMAP
import tqdm
import numpy
import blessings


t = blessings.Terminal()


def prepare_word_embeddings(embeddings, sentences):
    """Embed each sentence and stack the per-token vectors into one numpy array."""
    X = []

    print('computing embeddings')
    for sentence in tqdm.tqdm(sentences):
        embeddings.embed(sentence)

        for i, token in enumerate(sentence):
            X.append(token.embedding.detach().numpy()[None, :])

    X = numpy.concatenate(X, 0)

    return X


def word_contexts(sentences):
    """Build an HTML snippet per token, highlighting it within a few words of context."""
    contexts = []

    for sentence in sentences:

        strs = [x.text for x in sentence.tokens]

        for i, token in enumerate(strs):
            prop = '<b><font color="red"> {token} </font></b>'.format(
                token=token)

            prop = ' '.join(strs[max(i - 4, 0):i]) + prop
            prop = prop + ' '.join(strs[i + 1:min(len(strs), i + 5)])

            contexts.append('<p>' + prop + '</p>')

    return contexts


def prepare_char_embeddings(embeddings, sentences):
    """Run the character language model over each sentence and stack its hidden states."""

    X = []

    print('computing embeddings')
    for sentence in tqdm.tqdm(sentences):

        sentence = ' '.join([x.text for x in sentence])

        hidden = embeddings.lm.get_representation(sentence)
        X.append(hidden.squeeze().detach().numpy())

    X = numpy.concatenate(X, 0)

    return X


def char_contexts(sentences):
    """Build an HTML snippet per character, highlighting it with up to 30 characters of context on either side."""

    contexts = []

    for sentence in sentences:
        sentence = ' '.join([token.text for token in sentence])

        for i, char in enumerate(sentence):

            context = '<span style="background-color: yellow"><b>{}</b></span>'.format(char)
            context = ''.join(sentence[max(i - 30, 0):i]) + context
            context = context + ''.join(sentence[i + 1:min(len(sentence), i + 30)])

            contexts.append(context)

    return contexts


class _Transform:
    """Base class: subclasses set self.transform to an object exposing fit_transform."""

    def __init__(self):
        pass

    def fit(self, X):
        return self.transform.fit_transform(X)


class tSNE(_Transform):
    def __init__(self):

        super().__init__()

        self.transform = \
            TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)


class uMap(_Transform):
    def __init__(self):

        super().__init__()

        self.transform = UMAP(
            n_neighbors=5,
            min_dist=0.3,
            metric='correlation',
        )


def show(X, contexts):
    """Scatter-plot the 2D points and attach the HTML contexts as mpld3 hover tooltips."""
    import matplotlib.pyplot
    import mpld3

    fig, ax = matplotlib.pyplot.subplots()

    ax.grid(True, alpha=0.3)

    points = ax.plot(X[:, 0], X[:, 1], 'o', color='b',
                     mec='k', ms=5, mew=1, alpha=.6)

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Hover mouse to reveal context', size=20)

    tooltip = mpld3.plugins.PointHTMLTooltip(
        points[0],
        contexts,
        voffset=10,
        hoffset=10
    )

    mpld3.plugins.connect(fig, tooltip)

    mpld3.show()
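
Putting the new module together, a minimal word-level usage sketch (an illustration only, assuming the flair API at this commit plus the snippet file and CharLM model names used in the tests below):

from flair.data import Sentence
from flair.embeddings import CharLMEmbeddings, StackedEmbeddings
from flair.visual import prepare_word_embeddings, word_contexts, tSNE, show

# One sentence per non-empty line of the snippet file (path as in the tests).
with open('resources/data/snippet.txt') as f:
    sentences = [Sentence(line) for line in f.read().split('\n') if line]

# Word-level vectors from stacked forward/backward character-LM embeddings.
embeddings = StackedEmbeddings([
    CharLMEmbeddings('news-backward'),
    CharLMEmbeddings('news-forward'),
])

X = prepare_word_embeddings(embeddings, sentences)  # one row per token
contexts = word_contexts(sentences)                 # one HTML snippet per token

reduced = tSNE().fit(X)   # project to two dimensions
show(reduced, contexts)   # interactive mpld3 scatter with hover contexts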
31 changes: 0 additions & 31 deletions flair/visual/tsne.py

This file was deleted.

92 changes: 86 additions & 6 deletions tests/test_visual.py
@@ -1,12 +1,12 @@
-from flair.visual import tSNE
from flair.visual import *
from flair.data import Sentence
from flair.embeddings import CharLMEmbeddings, StackedEmbeddings
import unittest
import numpy


-class TesttSNE(unittest.TestCase):
-    def test(self):
class Test(unittest.TestCase):
    def test_prepare(self):
        with open('resources/data/snippet.txt') as f:
            sentences = [x for x in f.read().split('\n') if x]

@@ -19,11 +19,91 @@ def test(self):
            [charlm_embedding_backward, charlm_embedding_forward]
        )

-        trans_ = tSNE(embeddings)
        X = prepare_word_embeddings(embeddings, sentences)
        contexts = word_contexts(sentences)

        numpy.save('resources/data/embeddings', X)

        with open('resources/data/contexts.txt', 'w') as f:
            f.write('\n'.join(contexts))

    def test_tSNE(self):

        X = numpy.load('resources/data/embeddings.npy')
        trans_ = tSNE()
        reduced = trans_.fit(X)

        numpy.save('resources/data/tsne', reduced)

    def test__prepare_char(self):

        with open('resources/data/snippet.txt') as f:
            sentences = [x for x in f.read().split('\n') if x]

        sentences = [Sentence(x) for x in sentences[:100]]

        embeddings = CharLMEmbeddings('news-forward')

        X_forward = prepare_char_embeddings(embeddings, sentences)

        embeddings = CharLMEmbeddings('news-backward')

        X_backward = prepare_char_embeddings(embeddings, sentences)

        X = numpy.concatenate([X_forward, X_backward], axis=1)

        numpy.save('resources/data/char_embeddings', X)

    def test_tSNE_char(self):

        X = numpy.load('resources/data/char_embeddings.npy')
        trans_ = tSNE()
        reduced = trans_.fit(X)

        numpy.save('resources/data/char_tsne', reduced)

    def test_char_contexts(self):

        with open('resources/data/snippet.txt') as f:
            sentences = [x for x in f.read().split('\n') if x]

        sentences = [Sentence(x) for x in sentences[:100]]

        contexts = char_contexts(sentences)

        with open('resources/data/char_contexts.txt', 'w') as f:
            f.write('\n'.join(contexts))


class TestuMap(unittest.TestCase):
    def test(self):

        X = numpy.load('resources/data/embeddings.npy')

        reduced = uMap().fit(X)

        numpy.save('resources/data/umap', reduced)


class Test_show(unittest.TestCase):
    def test_word(self):

        reduced = numpy.load('resources/data/umap.npy')

        with open('resources/data/contexts.txt') as f:
            contexts = f.read().split('\n')

        show(reduced, contexts)

    def test_char(self):

        reduced = numpy.load('resources/data/char_tsne.npy')

        with open('resources/data/char_contexts.txt') as f:
            contexts = f.read().split('\n')

-        embeddings = trans_.fit(sentences)
        show(reduced, contexts)

-        numpy.save(embeddings, 'resources/data/embeddings.npy')


if __name__ == '__main__':
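
The character-level counterpart, consolidated from the test methods above into one sketch (same assumptions; per the commit message, t-SNE on these bidirectional character embeddings is of limited value):

import numpy
from flair.data import Sentence
from flair.embeddings import CharLMEmbeddings
from flair.visual import prepare_char_embeddings, char_contexts, tSNE, show

with open('resources/data/snippet.txt') as f:
    sentences = [Sentence(line) for line in f.read().split('\n') if line]

# One hidden-state matrix per direction, concatenated along the feature axis.
X_forward = prepare_char_embeddings(CharLMEmbeddings('news-forward'), sentences)
X_backward = prepare_char_embeddings(CharLMEmbeddings('news-backward'), sentences)
X = numpy.concatenate([X_forward, X_backward], axis=1)

contexts = char_contexts(sentences)  # one HTML snippet per character

show(tSNE().fit(X), contexts)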
