In [1]:
import pyspark as ps    # import the spark suite
import warnings         # display warning if spark context already exists
import os
import pandas as pd
import numpy as np

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType
# from pyspark.sql.functions import mean as sql_mean

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import NGram

# not sure why Spark example uses HashingTF for TF-IDF 
# - scikit learn seems to recomend against this

# from pyspark.ml.feature import HashingTF 

In [2]:
try:
    sc = ps.SparkContext('local[4]') # create spark context to work locally on all available cpus
    print('created SparkContext')
except ValueError:
    warnings.warn('SparkContext already exists')    # issue a warning if context already exists

spark = ps.SQLContext(sc)
print('created SQLContext')


# Only Run This Once - It gets angry the second time!
data_file = 'data/data.json'
raw_df = spark.read.json(data_file)

raw_df.printSchema()
print "row count: ", raw_df.count()
raw_df.show(3)



created SQLContext
root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

row count:  9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



In [3]:
# create copy of raw_df incase I mess things up :P
df = raw_df

## Process the Excerpts and Create New Columns:


Add word count, sentence count, avg word len, avg sent len,  . . .

In [4]:
# define functions to apply to a row

def char_count(text):
    return len(text)

def word_count(text):
    return len(text.split())

def avg_word_length(text):
    return sum([len(t) for t in text.split()]) / float(len(text.split()))

def sentence_count(text):
    return len(text.split('.'))

# ? Could use sentence count col and word count col to do thie w/o udf
def sentence_length(text):
    return sum([len(t.split()) for t in text.split('.')]) / float(len(text.split('.')))

# ? Create count of paragraphs per excerpts?
def paragraph_count(text):
    pass

# create User Defined Functions from above
charcount_udf = udf(lambda x : char_count(x))
wordcount_udf = udf(lambda x: word_count(x))
avgwordlen_udf = udf(lambda x: avg_word_length(x))
sentencecount_udf = udf(lambda x: sentence_count(x))
sentencelength_udf = udf(lambda x: sentence_length(x))

# add columns to datafram

df = df.withColumn("character_count", charcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("word_count", wordcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("avg_wordlen", avgwordlen_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_count", sentencecount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_length", sentencelength_udf(df.excerpt).cast(FloatType()))

In [29]:
df.createOrReplaceTempView("data")
spark.sql('''
    SELECT author, excerpt, character_count, word_count, sent_count, sent_length
    FROM data
    ''').show(5)

+--------------+--------------------+---------------+----------+----------+-----------+
|        author|             excerpt|character_count|word_count|sent_count|sent_length|
+--------------+--------------------+---------------+----------+----------+-----------+
|CharlesDickens|A CHRISTMAS CAROL...|         1156.0|     214.0|      13.0|   16.76923|
|CharlesDickens|Mind! I don't mea...|         1504.0|     268.0|      14.0|  19.142857|
|CharlesDickens|Scrooge never pai...|         1438.0|     250.0|      16.0|     15.625|
|CharlesDickens|Nobody ever stopp...|         1643.0|     303.0|       9.0|  33.666668|
|CharlesDickens|The door of Scroo...|         1141.0|     211.0|      10.0|       21.1|
+--------------+--------------------+---------------+----------+----------+-----------+
only showing top 5 rows



### Spark's Examples:

In [None]:

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for feature in result.select("result").take(3):
    print(feature)

In [None]:
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame([
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"])
], ["label", "words"])
ngram = NGram(inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
    print(ngrams_label)


### Applying NLP Functions to the Excerpts:

### Will need to Remove Punctuation!!! (apparently spark tokenize doesn't do this for us)

In [6]:
# Tokenize (turn excerpt into list of words)

tokenizer = Tokenizer(inputCol="excerpt", outputCol="tokenized")
df = tokenizer.transform(df)


In [7]:
# CountVectorize excerpts in df (turn wordlist into a vector of word counts)

cv = CountVectorizer(inputCol="tokenized", outputCol="count_vectorized")
model = cv.fit(df)
df = model.transform(df)


In [8]:
# Calculate Inverse Document Frequency

idf = IDF(inputCol="count_vectorized", outputCol="tfidf")
idfModel = idf.fit(df)
df = idfModel.transform(df)


In [21]:
df.createOrReplaceTempView("data")
spark.sql('''
    SELECT author, tokenized, count_vectorized
    FROM data
    ''').show(6)

+--------------+--------------------+--------------------+
|        author|           tokenized|    count_vectorized|
+--------------+--------------------+--------------------+
|CharlesDickens|[a, christmas, ca...|(120815,[0,1,2,3,...|
|CharlesDickens|[mind!, i, don't,...|(120815,[0,1,2,3,...|
|CharlesDickens|[scrooge, never, ...|(120815,[0,1,2,3,...|
|CharlesDickens|[nobody, ever, st...|(120815,[0,1,2,3,...|
|CharlesDickens|[the, door, of, s...|(120815,[0,1,2,3,...|
|CharlesDickens|["i, do,", said, ...|(120815,[0,1,2,3,...|
+--------------+--------------------+--------------------+
only showing top 6 rows



In [26]:
# Punctuation is still included after Spark's Tokenize should remove

for i in df.head(6)[5].tokenized[:20]:
    print i

"i
do,"
said
scrooge.
"merry
christmas!
what
right
have
you
to
be
merry?
what
reason
have
you
to
be
merry?
