In [1]:
import pyspark as ps    # import the spark suite
import warnings         # display warning if spark context already exists
import os

import string

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import NGram

In [2]:
data_file = 'data/data.json'
raw_df = spark.read.json(data_file)

raw_df.printSchema()
print "row count: ", raw_df.count()
raw_df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

row count:  9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



In [4]:
# create copy of raw_df incase I mess things up :P
df = raw_df

## Process the Excerpts and Create New Columns:


Add word count, sentence count, avg word len, avg sent len,  . . .

In [5]:
# define functions to apply to a row

def char_count(text):
    return len(text)

def word_count(text):
    return len(text.split())

def avg_word_length(text):
    return sum([len(t) for t in text.split()]) / float(len(text.split()))

def sentence_count(text):
    return len(text.split('.'))

# ? Could use sentence count col and word count col to do thie w/o udf
def sentence_length(text):
    return sum([len(t.split()) for t in text.split('.')]) / float(len(text.split('.')))

# ? Create count of paragraphs per excerpt?
def paragraph_count(text):
    pass

# create User Defined Functions from above
charcount_udf = udf(lambda x : char_count(x))
wordcount_udf = udf(lambda x: word_count(x))
avgwordlen_udf = udf(lambda x: avg_word_length(x))
sentencecount_udf = udf(lambda x: sentence_count(x))
sentencelength_udf = udf(lambda x: sentence_length(x))

# add columns to datafram

df = df.withColumn("char_count", charcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("word_count", wordcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("avg_wordlen", avgwordlen_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_count", sentencecount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_length", sentencelength_udf(df.excerpt).cast(FloatType()))

In [6]:
df.createOrReplaceTempView("data")
spark.sql('''
    SELECT author, excerpt, char_count, word_count, sent_count, sent_length
    FROM data''').show(5)

+--------------+--------------------+----------+----------+----------+-----------+
|        author|             excerpt|char_count|word_count|sent_count|sent_length|
+--------------+--------------------+----------+----------+----------+-----------+
|CharlesDickens|A CHRISTMAS CAROL...|    1156.0|     214.0|      13.0|   16.76923|
|CharlesDickens|Mind! I don't mea...|    1504.0|     268.0|      14.0|  19.142857|
|CharlesDickens|Scrooge never pai...|    1438.0|     250.0|      16.0|     15.625|
|CharlesDickens|Nobody ever stopp...|    1643.0|     303.0|       9.0|  33.666668|
|CharlesDickens|The door of Scroo...|    1141.0|     211.0|      10.0|       21.1|
+--------------+--------------------+----------+----------+----------+-----------+
only showing top 5 rows



### For experimentation purposes - here is how to get one excerpt from the spark dataframe

```python
df.createOrReplaceTempView("data")
temp = spark.sql('''SELECT excerpt FROM data LIMIT 5''')
sample_text = str(temp.take(3)[1].excerpt)
temp.show()
sample_text
```

### Spark's Examples:

```python
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for feature in result.select("result").take(3):
    print(feature)
    
```

```python
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame([
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"])
], ["label", "words"])
ngram = NGram(inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
    print(ngrams_label)
```

## Applying Sparks MLlib NLP Functions to the Excerpts:

### Remove Punctuation from excerpts

In [7]:
# CONSIDER: reworking with REGEX
# CONSIDER: Remove Punctuation with stopwords (Tip From Sally)

def remove_punctuation(text):
    # return text.translate(None, punctuation)
    return "".join(c for c in text if c not in set(string.punctuation))

removepunctuation_udf = udf(lambda x : remove_punctuation(x))

df = df.withColumn("words_only", removepunctuation_udf(df.excerpt).cast(StringType()))

## Tokenize the punctuationless excerpts

In [8]:

tokenizer = Tokenizer(inputCol="words_only", outputCol="tokenized")
df = tokenizer.transform(df)


### CountVectorize token lists in df (turn wordlist into a vector of word counts)

In [9]:

cv = CountVectorizer(inputCol="tokenized", outputCol="count_vectorized")
cvmodel = cv.fit(df)
df = cvmodel.transform(df)


### Calculate Term Frequency - Inverse Document Frequency

In [10]:

idf = IDF(inputCol="count_vectorized", outputCol="tfidf")
idfmodel = idf.fit(df)
df = idfmodel.transform(df)


### The FINAL DataFrame!

In [11]:
df.createOrReplaceTempView("data")
sqldf = spark.sql('''
    SELECT author, tokenized, count_vectorized, tfidf
    FROM data
    LIMIT 6
    ''')
sqldf.show()

+--------------+--------------------+--------------------+--------------------+
|        author|           tokenized|    count_vectorized|               tfidf|
+--------------+--------------------+--------------------+--------------------+
|CharlesDickens|[a, christmas, ca...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[mind, i, dont, m...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[scrooge, never, ...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[nobody, ever, st...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[the, door, of, s...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[i, do, said, scr...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
+--------------+--------------------+--------------------+--------------------+



## Save test

In [12]:
sqldf.write.save("save_test.parquet", format="parquet")

In [23]:
spark.sql("SELECT * FROM parquet.`save_test.parquet`").show()

+--------------+--------------------+--------------------+--------------------+
|        author|           tokenized|    count_vectorized|               tfidf|
+--------------+--------------------+--------------------+--------------------+
|CharlesDickens|[a, christmas, ca...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[mind, i, dont, m...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[scrooge, never, ...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[nobody, ever, st...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[the, door, of, s...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[i, do, said, scr...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
+--------------+--------------------+--------------------+--------------------+



In [22]:
sqldf.createOrReplaceTempView("sqldf")

In [19]:
# VERY SLOW
spark.sql("SELECT tokenized FROM sqldf LIMIT 2").show()

+--------------------+
|               tfidf|
+--------------------+
|(67321,[0,1,2,3,4...|
|(67321,[0,1,2,3,4...|
+--------------------+



In [20]:
# VERY FAST
spark.sql("SELECT tokenized FROM parquet.`save_test.parquet` LIMIT 2").show()

+--------------------+
|               tfidf|
+--------------------+
|(67321,[0,1,2,3,4...|
|(67321,[0,1,2,3,4...|
+--------------------+



In [25]:
table = "parquet.`save_test.parquet`"
spark.sql("SELECT * FROM {}"
          .format(table)).show()

+--------------+--------------------+--------------------+--------------------+
|        author|           tokenized|    count_vectorized|               tfidf|
+--------------+--------------------+--------------------+--------------------+
|CharlesDickens|[a, christmas, ca...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[mind, i, dont, m...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[scrooge, never, ...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[nobody, ever, st...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[the, door, of, s...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
|CharlesDickens|[i, do, said, scr...|(67321,[0,1,2,3,4...|(67321,[0,1,2,3,4...|
+--------------+--------------------+--------------------+--------------------+

