Text mining example based on https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors

## Prerequisites

Install nltk and gensim (word2vec implementation) into the `python2` conda environment of the running `jupyter-notebook` container:

    sudo docker exec -it $(sudo docker inspect -f {{.Id}} jupyter-notebook) conda install -yq -n python2 nltk gensim

In [None]:
import re                          # Regular Expressions
import pandas as pd                # DataFrames & Manipulation
import nltk.data                   # tpunkt tokenizer
from nltk.corpus import stopwords  # Import the stop word list
from bs4 import BeautifulSoup      # HTML processing
import gensim                      # word2vec impl

In [None]:
# Ensure the NLTK "punkt" sentence tokenizer models are loaded;
# the return value of nltk.download() is printed as a status indicator.
print nltk.download("punkt")

In [None]:
# Location of the gzip-compressed recipe dump (pandas infers the compression
# from the file extension by default).
train_input = "../data/recipes.csv.gz"

# Tab-separated input with explicitly supplied column names.
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
train = pd.read_csv(train_input, delimiter="\t", quoting=3, names = ("recipe_id", "instructions"))

In [None]:
# Preview the first rows of the loaded data frame.
train.head()

In [None]:
# Everything that is not a Latin letter counts as a separator. Besides ASCII
# a-z/A-Z the accented letters of the Latin-1 Supplement and the Latin
# Extended-A/B blocks are kept, so German umlauts and sharp s survive.
# U+00D7 and U+00F7 (multiplication and division sign) are deliberately
# excluded from the ranges because they are not letters.
_NON_LETTERS = re.compile(u"[^a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u024f]")


def clean_text( input, remove_stopwords = False, language = "german" ):
    """
    Clean a document.

    Strips HTML markup, replaces every character that is not a Latin letter
    (accented letters such as German umlauts are kept) by a blank,
    lower-cases the text and splits it on whitespace.

    input            -- document text, may contain HTML markup
    remove_stopwords -- if True, drop the words found in the NLTK stop word
                        list for `language`
    language         -- name of the NLTK stop word list to use; defaults to
                        "german" because the recipes are split with the
                        German punkt tokenizer

    Returns the list of remaining lower-case words.
    """

    # remove any HTML markup with BeautifulSoup
    plain_text = " ".join(BeautifulSoup(input, 'html.parser').strings)

    # retain only letters (including accented ones, see _NON_LETTERS)
    only_letters = _NON_LETTERS.sub(" ", plain_text)

    # get lower case words
    words = only_letters.lower().split()

    # a set makes the per-word membership test cheap
    if remove_stopwords:
        stops = set(stopwords.words(language))
        words = [w for w in words if w not in stops]

    return words

In [None]:
# Split sentences
# Load the pre-trained German punkt sentence tokenizer from the NLTK data.

tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

# Try it out on a single recipe.
# NOTE(review): the text is passed here without the .decode('utf-8') that is
# applied before tokenizing in the main processing loop - confirm this is
# intended for the demo.
tokenizer.tokenize(train['instructions'][2])

In [None]:
def clean_sentences( input, remove_stopwords = False ):
    """
    Tokenize a document into sentences and return one cleaned word list per
    sentence.

    The module-level `tokenizer` performs the sentence split and `clean_text`
    the per-sentence clean-up; empty sentence strings are skipped.
    """

    return [
        clean_text(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(input)
        if len(raw_sentence) > 0
    ]

In [None]:
sentences = []

num_reviews = train['instructions'].size

for i in xrange ( 0, num_reviews ):
    
    if (i+1) % 1000 == 0:
        print "Processing recipe %d of %d recipies." % ( i+1, num_reviews )
    sentences += clean_sentences(train['instructions'][i].decode('utf-8'))

In [None]:
# Show the first three cleaned sentences, one per line, words joined by commas.
print "Example Sentences:\n", "\n".join([",".join(sentence) for sentence in sentences[0:3]])

In [None]:
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# make the model much more memory-efficient.
model.init_sims(replace=True)

In [None]:
# Persist the trained model; the file name spells out 100 features,
# 20 min words and 10 context.
model_name = "recipes_100features_20minwords_10context"
model.save(model_name)

In [None]:
# Which of the given words fits least with the others?
# Queried through the word vectors in model.wv; the model-level method is deprecated.
model.wv.doesnt_match("milch brot eier".split())

In [None]:
# Words most similar to "braten".
# Queried through the word vectors in model.wv; the model-level method is deprecated.
model.wv.most_similar("braten")

In [None]:
# Words most similar to "pasta".
# Queried through the word vectors in model.wv; the model-level method is deprecated.
model.wv.most_similar("pasta")

In [None]:
# Words most similar to "brownies".
# Queried through the word vectors in model.wv; the model-level method is deprecated.
model.wv.most_similar("brownies")

In [None]:
# Words most similar to "frucht".
# Queried through the word vectors in model.wv; the model-level method is deprecated.
model.wv.most_similar("frucht")

In [None]:
# Report how many distinct words the trained model's vocabulary holds.
print "%d words in vocab." % len(model.wv.vocab)
#model.sort_vocab()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# get vector weights: one row per vocabulary word, looked up through the
# word vectors in model.wv (indexing the model object itself is deprecated)
X = model.wv[model.wv.vocab]

In [None]:
# List every vocabulary word together with the `count` attribute of its
# vocabulary entry (presumably the corpus frequency - confirm against the
# gensim documentation).
[(word, obj.count) for (word, obj) in model.wv.vocab.items()]

In [None]:
# Inspect the raw vocabulary mapping (word -> vocabulary entry object).
model.wv.vocab

In [None]:
# Project the word vectors down to 2 dimensions with t-SNE; random_state is
# pinned to a fixed seed for reproducibility.
tsne = TSNE(n_components=2, random_state=0)
# Print floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)
X_tsne = tsne.fit_transform(X)

In [None]:
# PCA projection of the word vectors; PCA() is called without n_components,
# so the number of components is left at the library default.
X_pca = PCA().fit_transform(X)

In [None]:
# Scatter plot of the two t-SNE dimensions.
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])

In [None]:
# Scatter plot of the first two PCA components.
plt.scatter(X_pca[:, 0], X_pca[:, 1])