In [1]:
import sys
print(sys.version)

2.7.12 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:42:40) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
spark

<pyspark.sql.session.SparkSession at 0x7f39542bf950>

## Download and Subset Data

We will first subset down the dataset of Amazon Book reviews located at [this link](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz).  This dataset contains 8,898,041 book reviews.

This dataset includes reviews (ratings, text, helpfulness votes), product metadata (descriptions, category information, price, brand, and image features), and links (also viewed/also bought graphs).  For more information please refer to [this page](http://jmcauley.ucsd.edu/data/amazon/).

The data located at `s3n://spark-talk/reviews_Books_subset5.json` contains a 5% subset of the full dataset

In [3]:
url = "s3n://spark-talk/reviews_Books_subset5.json"

review_subset = spark.read.json(url)

In [4]:
count = review_subset.count()
print("reviews_Books_subset5.json contains {} elements".format(count))

reviews_Books_subset5.json contains 445266 elements


In [5]:
print("First 10 rows of review_subset DataFrame...")
review_subset.show(10, truncate=True)

First 10 rows of review_subset DataFrame...
+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    5.0|As you read, Gibr...|
|    5.0|_The Prophet_ is ...|
|    5.0|The Prophet is ab...|
|    5.0|Reading a classic...|
|    3.0|Maybe I just wasn...|
|    5.0|Gibran gets right...|
|    4.0|This book was the...|
|    5.0|One of the classi...|
|    4.0|I have no memory ...|
|    4.0|At first, I was g...|
+-------+--------------------+
only showing top 10 rows



In [6]:
import pyspark as ps    # for the pyspark suite
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
import string
import unicodedata
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser

from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

import numpy as np
import sys


def extract_bow_from_raw_text(text_as_string):
    """Extracts bag-of-words from a raw text string.

    Parameters
    ----------
    text (str): a text document given as a string

    Returns
    -------
    list : the list of the tokens extracted and filtered from the text
    """
    if (text_as_string == None):
        return []

    if (len(text_as_string) < 1):
        return []

    import nltk
    if '/home/hadoop/nltk_data' not in nltk.data.path:
        nltk.data.path.append('/home/hadoop/nltk_data')

    nfkd_form = unicodedata.normalize('NFKD', unicode(text_as_string))
    text_input = nfkd_form.encode('ASCII', 'ignore')

    sent_tokens = sent_tokenize(text_input)

    tokens = map(word_tokenize, sent_tokens)

    sent_tags = map(pos_tag, tokens)

    grammar = r"""
        SENT: {<(J|N).*>}                # chunk sequences of proper nouns
    """

    cp = RegexpParser(grammar)
    ret_tokens = list()
    stemmer_snowball = SnowballStemmer('english')

    for sent in sent_tags:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'SENT':
                t_tokenlist = [tpos[0].lower() for tpos in subtree.leaves()]
                t_tokens_stemsnowball = map(stemmer_snowball.stem, t_tokenlist)
                #t_token = "-".join(t_tokens_stemsnowball)
                #ret_tokens.append(t_token)
                ret_tokens.extend(t_tokens_stemsnowball)
            #if subtree.label() == 'V2V': print(subtree)
    #tokens_lower = [map(string.lower, sent) for sent in tokens]

    stop_words = {'book', 'author', 'read', "'", 'character', ''}.union(ENGLISH_STOP_WORDS)

    tokens = [token for token in ret_tokens if token not in stop_words]

    return(tokens)


def indexing_pipeline(input_df, **kwargs):
    """ Runs a full text indexing pipeline on a collection of texts contained
    in a DataFrame.

    Parameters
    ----------
    input_df (DataFrame): a DataFrame that contains a field called 'text'

    Returns
    -------
    df : the same DataFrame with a column called 'features' for each document
    wordlist : the list of words in the vocabulary with their corresponding IDF
    """
    inputCol_ = kwargs.get("inputCol", "text")
    vocabSize_ = kwargs.get("vocabSize", 5000)
    minDF_ = kwargs.get("minDF", 2.0)

    tokenizer_udf = udf(extract_bow_from_raw_text, ArrayType(StringType()))
    df_tokens = input_df.withColumn("bow", tokenizer_udf(col(inputCol_)))

    cv = CountVectorizer(inputCol="bow", outputCol="vector_tf", vocabSize=vocabSize_, minDF=minDF_)
    cv_model = cv.fit(df_tokens)
    df_features_tf = cv_model.transform(df_tokens)

    idf = IDF(inputCol="vector_tf", outputCol="features")
    idfModel = idf.fit(df_features_tf)
    df_features = idfModel.transform(df_features_tf)

    return(df_features, cv_model.vocabulary)

In [7]:
review_df, vocab = indexing_pipeline(review_subset, inputCol='reviewText')

# Persist this DataFrame to keep it in memory
review_df.persist()

# print the top 5 elements of the DataFrame and schema to the log
print(review_df.take(5))
review_df.printSchema()

print("Example of first 50 words in our Vocab:")
print(vocab[:50])

[Row(overall=5.0, reviewText=u"As you read, Gibran's poetry brings spiritual and visual beauty to life within you. Gibran is justly famous for rich metaphors that brilliantly highlight the pursuit of Truth and Goodness amidst all the darkness and light of human nature.", bow=[u'gibran', u'poetri', u'spiritu', u'visual', u'beauti', u'life', u'gibran', u'famous', u'rich', u'metaphor', u'pursuit', u'truth', u'good', u'dark', u'light', u'human', u'natur'], vector_tf=SparseVector(5000, {3: 1.0, 5: 1.0, 60: 1.0, 90: 1.0, 137: 1.0, 178: 1.0, 179: 1.0, 184: 1.0, 342: 1.0, 442: 1.0, 628: 1.0, 1350: 1.0, 1481: 1.0, 1806: 1.0, 2008: 1.0}), features=SparseVector(5000, {3: 1.3775, 5: 1.8214, 60: 3.0803, 90: 3.1305, 137: 3.518, 178: 3.5413, 179: 3.6397, 184: 3.6669, 342: 4.0506, 442: 4.5143, 628: 4.5919, 1350: 5.6132, 1481: 5.5098, 1806: 5.7744, 2008: 5.8573})), Row(overall=5.0, reviewText=u'_The Prophet_ is a short read (my copy checks in at just under 100 pages), but its berevity belies both the p

## Train LDA Model

Now that we have a DataFrame with column `features` containing a vector object representing the [Tf-Idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) values for our words, we can apply the [Latent Dirichlet allocation algorithm contained in the `ml` package](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.LDA).

For the sake of this demonstration we will be specifying 5 clusters.

In [8]:
from pyspark.ml.clustering import LDA

lda = LDA(k=5, maxIter=10, seed=42, featuresCol='features')
model = lda.fit(review_df)

In [9]:
import pandas as pd

model_description = model.describeTopics(20).toPandas()
vocab = np.array(vocab)

In [11]:
for idx, row in model_description.iterrows():
    desc = "Top Words Associated with Topic {0}:\n{1}\n" \
                .format(row['topic'], vocab[row['termIndices']])
    print(desc)

Top Words Associated with Topic 0:
[u'war' u'histori' u'god' u'american' u'peopl' u'christian' u'mani'
 u'world' u'bibl' u'time' u'polit' u'life' u'year' u'stori' u'work'
 u'novel' u'reader' u'new' u'state' u'histor']

Top Words Associated with Topic 1:
[u'stori' u'love' u'life' u'charact' u'time' u'famili' u'thing' u'way'
 u'friend' u'peopl' u'heart' u'relationship' u'seri' u'romanc' u'year'
 u'emot' u'littl' u'world' u'mani' u'good']

Top Words Associated with Topic 2:
[u'quot' u'stori' u'charact' u'novel' u'seri' u'vampir' u'life' u'man'
 u'time' u'world' u'mysteri' u'human' u'way' u'new' u'good' u'reader'
 u'great' u'famili' u'peopl' u'year']

Top Words Associated with Topic 3:
[u'seri' u'recip' u'stori' u'great' u'good' u'charact' u'time' u'lot'
 u'littl' u'way' u'page' u'thing' u'plot' u'new' u'fun' u'novel' u'food'
 u'easi' u'mani' u'love']

Top Words Associated with Topic 4:
[u'charact' u'stori' u'good' u'time' u'thing' u'novel' u'way' u'great'
 u'mani' u'chapter' u'reader' u's