In [4]:
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
import csv

import fastText
import spacy 
import tensorflow as tf
import tensorflow_hub as hub

# Prepare Data

In [None]:
# TODO: find a proper dataset and load it here.
# The cells below expect `doc_text`: an iterable of sentence strings.

# Word Embedding

## GloVe

In [None]:
# Read the pre-trained GloVe vectors (one "word v1 v2 ..." entry per line).
# The file is read completely and closed immediately: the handle is not
# leaked, and `glove_model` becomes a list of lines that can be iterated any
# number of times (a bare file handle is exhausted after a single pass).
# This costs memory proportional to the size of the file.
with open("embeddings/glove.6B.100d.txt", encoding="utf-8") as glove_file:
    glove_model = glove_file.readlines()

In [None]:
# Vocabulary of the dataset: every distinct space-separated token found in
# any (whitespace-stripped) sentence of `doc_text`.
corpus = {
    token
    for sentence in doc_text
    for token in sentence.strip().split(" ")
}

In [None]:
# Build a {word: vector} lookup restricted to the words present in `corpus`.
word_embedding = {}
embedding_dim = None  # number of vector components, taken from the parsed lines

# When `glove_model` is a file handle, rewind it so that re-running this cell
# parses the file again instead of iterating an already exhausted handle.
if hasattr(glove_model, "seek"):
    glove_model.seek(0)

for line in glove_model:
    fields = line.strip().split(" ")
    word = fields[0].lower()
    values = fields[1:]
    embedding_dim = len(values)

    # Only keep vectors for words that actually occur in the dataset.
    if word in corpus:
        word_embedding[word] = np.array([float(num) for num in values])

if embedding_dim is None:
    # Nothing was read, so the size of the fallback vector is unknown.
    raise ValueError("glove_model yielded no lines; cannot size UNKNOWN_TOKEN")

# Shared vector for out-of-vocabulary words. A fixed seed (the same value the
# KMeans cells pass as random_state) keeps it, and therefore the clustering,
# reproducible across kernel restarts.
unknown_rng = np.random.RandomState(2018)
word_embedding["UNKNOWN_TOKEN"] = unknown_rng.uniform(-0.25, 0.25, embedding_dim)


In [None]:
# Sentence embedding = mean of the GloVe vectors of the sentence's tokens.
glove_embedding = []   # one averaged vector per sentence of `doc_text`
unknow_tokens = []     # every token that had to fall back to UNKNOWN_TOKEN

# NOTE(review): the keys of `word_embedding` are lower-cased when they are
# loaded, but tokens are looked up here exactly as they appear in the text,
# so capitalised tokens fall through to UNKNOWN_TOKEN -- confirm this is
# intended.
for sent in doc_text:
    words = sent.strip().split(" ")
    vectors = []
    for word in words:
        if word in word_embedding:
            vectors.append(word_embedding[word])
        else:
            vectors.append(word_embedding["UNKNOWN_TOKEN"])
            unknow_tokens.append(word)
    # `vectors` stacks to a 2-D array (one row per token); averaging over
    # axis 0 collapses the tokens into a single sentence vector.  The previous
    # axis=2 does not exist on a 2-D array and raised an AxisError.
    vectors = np.array(vectors)
    glove_embedding.append(np.mean(vectors, axis=0))

## SpaCy Word2Vec

In [None]:
# Load the spaCy pipeline named 'en_core_web_lg'; the cell below reads the
# `.vector` attribute of each processed sentence as its embedding.
nlp = spacy.load('en_core_web_lg')

In [None]:
# One vector per sentence: run each sentence through the spaCy pipeline and
# keep the resulting document's `.vector`.
spacy_embedding = [nlp(sentence).vector for sentence in doc_text]

## FastText

In [None]:
# Load the pre-trained fastText binary model from disk.
# NOTE(review): this path lives under 'embedding/' while the GloVe file is
# read from 'embeddings/' -- confirm which directory name is the intended one.
ft_model = fastText.load_model('embedding/crawl-300d-2M-subword.bin')

In [None]:
# One fastText sentence vector per entry of `doc_text`, in the same order.
ft_embedding = list(map(ft_model.get_sentence_vector, doc_text))

## Universal Sentence Encoder

In [None]:
# TF-Hub handle of the Universal Sentence Encoder module (version 2).
use_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
# Instantiate the hub module; it is applied to `doc_text` in the next cell.
use_model = hub.Module(use_url)

In [None]:
# Only let TensorFlow log messages of level ERROR through.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    # Initialise variables and lookup tables before evaluating the module.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    session.run(init_ops)
    # Apply the Universal Sentence Encoder to all sentences and evaluate it.
    sentence_vectors = use_model(doc_text)
    use_embedding = session.run(sentence_vectors)

## ELMo

In [None]:
# TF-Hub handle of the ELMo module (version 2).
elmo_url = "https://tfhub.dev/google/elmo/2"
# Instantiate the hub module that the ELMo embedding is computed with.
elmo_model = hub.Module(elmo_url)

In [None]:
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    # Initialise variables and lookup tables before evaluating the module.
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    # Evaluate the ELMo module.  This cell previously called `use_model`, so
    # `elmo_embedding` silently held Universal Sentence Encoder vectors.
    # The "default" output is requested explicitly so that each sentence
    # yields one fixed-size vector suitable for clustering.
    elmo_embedding = session.run(
        elmo_model(doc_text, signature="default", as_dict=True)["default"]
    )

# Clustering

In [None]:
# Choose which sentence embedding the clustering and visualisation cells
# below operate on: keep exactly one assignment active and leave the
# alternatives commented out.
embedding = glove_embedding
# embedding = spacy_embedding
# embedding = ft_embedding
# embedding = use_embedding
# embedding = elmo_embedding

## Elbow Method

In [None]:
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import numpy as np

# Largest number of clusters tried by the elbow search.
maximum = 16

# Candidate cluster counts: 2 .. maximum (inclusive).
K = range(2, maximum + 1)

fig, ax = plt.subplots()

# Fit one K-means model per candidate k and record its inertia.
distortions = []
for k in K:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=2018).fit(embedding)
    distortions.append(kmeans.inertia_)

# Plot the elbow
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

# K-means

In [None]:
# Number of clusters for the final K-means fit in the next cell.
# NOTE(review): presumably read off the elbow plot above -- confirm.
k = 7

In [None]:
# Fit K-means with the chosen k, then assign every sentence embedding to
# its nearest cluster centre.
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=2018)
kmeans.fit(embedding)
clusters = kmeans.predict(embedding)

## Data Visualization

In [None]:
# Project the embeddings to two dimensions with t-SNE for plotting.
tsne = TSNE(n_components=2, random_state=512)
arr = np.array(embedding)
reduced = tsne.fit_transform(arr)
# Transposed copy: t[0] holds the first coordinate of every sample and
# t[1] the second one.
t = np.array(reduced).T

In [None]:
from matplotlib.lines import Line2D

# One marker / colour pair per cluster id.  Both lists have 16 entries, so
# cluster ids 0..15 can be drawn.
markers = ["x", "v", "o", "s", "*", ">", "<", "P",
           '1', '2', '3', '4', 'h', "d", "|", "+"]
colors = ['darkorange', 'steelblue', 'limegreen',  'salmon', 'y',  'violet', 'c', 'tomato',
          'rosybrown', 'brown', 'darkmagenta', 'pink', 'gold', "orange", "skyblue", "seagreen"]

fig, ax = plt.subplots(figsize=(20, 10))

cluster_ids = np.asarray(clusters)

# Draw each cluster with a single scatter call (instead of one call, and one
# artist, per point) and build the matching legend entry.  Cluster ids are
# visited in sorted order so the legend order is stable.
types = []
for c in sorted(set(cluster_ids.tolist())):
    members = cluster_ids == c
    ax.scatter(t[0][members], t[1][members], color=colors[c], marker=markers[c])
    # linestyle='None': the legend shows only the marker, like the plot.
    types.append(Line2D([], [], color=colors[c], marker=markers[c],
                        linestyle='None', label=c))

ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
ax.set_title('t-SNE projection of the embeddings, coloured by K-means cluster')
ax.legend(handles=types, loc='upper left')
plt.show()