# https://rebeccabilbro.github.io/words-in-space/

In [None]:
import os

from sklearn.datasets.base import Bunch
from yellowbrick.download import download_all

## Directory, under the current working directory, that holds the data sets
FIXTURES = os.path.join(os.getcwd(), "data")

## Registry mapping each data set name to its directory on disk
datasets = dict(hobbies=os.path.join(FIXTURES, "hobbies"))


def load_data(name, download=True):
    """
    Loads and wrangles the passed in text corpus by name.

    The corpus directory is looked up in ``datasets``. Every sub-directory of
    that directory is treated as a category, and every regular file inside a
    category is read as one document.

    Parameters
    ----------
    name : str
        Key of the corpus in the ``datasets`` registry.
    download : bool, default True
        If the corpus directory does not exist, call ``download_all()`` when
        true; otherwise raise.

    Returns
    -------
    Bunch
        With four fields: ``categories`` (category directory names),
        ``files`` (path of each document), ``data`` (text of each document)
        and ``target`` (category name of each document). ``files``, ``data``
        and ``target`` are parallel lists.

    Raises
    ------
    KeyError
        If ``name`` is not registered in ``datasets``.
    ValueError
        If the corpus directory is missing and ``download`` is false.
    """

    # Get the path from the datasets
    path = datasets[name]

    # Check if the data exists, otherwise download or raise
    if not os.path.exists(path):
        if not download:
            raise ValueError((
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            ).format(name))
        download_all()

    # Read the directories in the directory as the categories.
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # resulting corpus reproducible across runs and machines.
    categories = sorted(
        cat for cat in os.listdir(path)
        if os.path.isdir(os.path.join(path, cat))
    )

    files  = [] # holds the path of each document
    data   = [] # holds the text read from the file
    target = [] # holds the string of the category

    # Load the data from the files in the corpus
    for cat in categories:
        cat_dir = os.path.join(path, cat)

        # A distinct loop variable keeps the ``name`` parameter intact.
        for fname in sorted(os.listdir(cat_dir)):
            fpath = os.path.join(cat_dir, fname)

            # Only regular files are documents; a nested directory would
            # otherwise make open() fail.
            if not os.path.isfile(fpath):
                continue

            # Decode explicitly so the result does not depend on the locale.
            with open(fpath, 'r', encoding='utf-8') as f:
                text = f.read()

            # Append only after a successful read so the three lists stay
            # aligned.
            files.append(fpath)
            data.append(text)
            target.append(cat)

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )

# Load the "hobbies" corpus; load_data's default download=True fetches the
# data first when its directory is not on disk yet.
corpus = load_data('hobbies')


In [None]:
# Preview every field of the corpus: print its name and size, then its first
# `nb` entries and, when the field is longer than that, its last `nb` entries.
nb = 5
for k in corpus.keys():
    values = corpus[k]
    size = len(values)
    print(k, size)

    # Head of the field
    for i, e in enumerate(values):
        if i < nb:
            print('\t+ ' + str((i, e)))

    if size > nb:
        print('\t+ ', '...')
        # Start the tail after the head so that entries are not printed twice
        # when the field holds fewer than 2 * nb items.
        tail_start = max(nb, size - nb)
        for i, e in enumerate(values):
            if i >= tail_start:
                print('\t+ ' + str((i, e)))
    print()

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (default settings) on the raw document texts and
# keep the transformed documents; the labels are the per-document category
# strings collected by load_data.
vectorizer = TfidfVectorizer()
docs       = vectorizer.fit_transform(corpus.data)
labels     = corpus.target

In [None]:
from yellowbrick.text import TSNEVisualizer
# Baseline projection of the TF-IDF documents, labelled by category.
tsne = TSNEVisualizer() # no metric passed: presumably falls back to t-SNE's default (euclidean in scikit-learn) - TODO confirm
tsne.fit(docs, labels)
tsne.poof()

In [None]:
# Same projection of the TF-IDF documents, with the metric explicitly set to
# "euclidean".
tsne = TSNEVisualizer(metric="euclidean")
tsne.fit(docs, labels)
tsne.poof()


In [None]:
# Same projection of the TF-IDF documents, using the "cityblock" metric.
tsne = TSNEVisualizer(metric="cityblock")
tsne.fit(docs, labels)
tsne.poof()


In [None]:
# Same projection of the TF-IDF documents, using the "minkowski" metric.
# NOTE(review): no order `p` is given; if the underlying default is p=2 this
# is equivalent to the euclidean run - confirm.
tsne = TSNEVisualizer(metric="minkowski")
tsne.fit(docs, labels)
tsne.poof()


In [None]:
# Same projection of the TF-IDF documents, using the "cosine" metric.
tsne = TSNEVisualizer(metric="cosine")
tsne.fit(docs, labels)
tsne.poof()


In [None]:
# Same projection of the TF-IDF documents, using the "jaccard" metric.
# NOTE(review): jaccard is normally defined on boolean vectors, while the
# features here come from real-valued TF-IDF weights - confirm that this
# metric is meaningful for this input.
tsne = TSNEVisualizer(metric="jaccard")
tsne.fit(docs, labels)
tsne.poof()
