In [4]:
import re
import string

import pandas as pd

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF

from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import NGram

In [2]:
# when starting jupyter with the sparkjupyter script, pyspark is already imported

print "sql session setup by script:\t", spark
print "spark context setup by script:\t", sc
print "pyspark imported by script:\t", str(pyspark)[:56], "..."

# ps= pyspark

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-2-a205a89a3599>, line 3)

In [6]:
data_file = 'data/data.json'
raw_df = spark.read.json(data_file)

raw_df.printSchema()
print("row count: ", raw_df.count())
raw_df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

row count:  9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



In [7]:
# create copy of raw_df incase I mess things up :P
df = raw_df

## Process the Excerpts and Create New Columns:


Add word count, sentence count, avg word len, avg sent len,  . . .

In [6]:
# define functions to apply to a row

def char_count(text):
    return len(text)

def word_count(text):
    return len(text.split())

def avg_word_length(text):
    return sum([len(t) for t in text.split()]) / float(len(text.split()))

def sentence_count(text):
    return len(re.split('[?.!]', text))

# ? Could use sentence count col and word count col to do thie w/o udf
def sentence_length(text):
    sentences = re.split('[?.!]', text)
    return sum([len(t.split()) for t in sentences]) / float(len(sentences))

# ? Create count of paragraphs per excerpt?
def paragraph_count(text):
    pass

# create User Defined Functions from above
charcount_udf = udf(lambda x : char_count(x))
wordcount_udf = udf(lambda x: word_count(x))
avgwordlen_udf = udf(lambda x: avg_word_length(x))
sentencecount_udf = udf(lambda x: sentence_count(x))
sentencelength_udf = udf(lambda x: sentence_length(x))

# add columns to dataframe
df = df.withColumn("char_count", charcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("word_count", wordcount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("avg_wordlen", avgwordlen_udf(df.excerpt).cast(FloatType())) \
        .withColumn("sent_count", sentencecount_udf(df.excerpt).cast(FloatType())) \
        .withColumn("avg_sentlen", sentencelength_udf(df.excerpt).cast(FloatType())).persist()

In [None]:
#FIXME: we are not actually calculating the averages properly here 
#  - this is an average of averages!

df.createOrReplaceTempView("data")
spark.sql('''
            SELECT author
                    , ROUND(AVG(avg_wordlen),3) AS AvgWordLen
                    , ROUND(AVG(word_count),1) AS AvgWordsPerPara
                    , ROUND(AVG(sent_length),1) AS AvgWordsPerSent
            FROM data
            GROUP BY author
            ''').show()

### For experimentation purposes - here is how to get one excerpt from the spark dataframe

```python
df.createOrReplaceTempView("data")
temp = spark.sql('''SELECT excerpt FROM data LIMIT 5''')
sample_text = str(temp.take(3)[1].excerpt)
temp.show()
sample_text
```

## Spark's Examples:

### Tokenizer  <br> regexTokenizer
```python
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
```

### word2vec

```python
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for feature in result.select("result").take(3):
    print(feature)
    
```

### n-grams

```python
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame([
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"])
], ["label", "words"])
ngram = NGram(inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
    print(ngrams_label)
```

## Applying Sparks MLlib NLP Functions to the Excerpts:

### Remove Punctuation from excerpts

In [8]:
# CONSIDER: reworking with REGEX
# CONSIDER: Remove Punctuation with stopwords (Tip From Sally)

def remove_punctuation(text):
    # return text.translate(None, punctuation)

    # this avoids error in spark due to apparent 'translate' name collision
    # Would it be more efficient to use .stip() on each word after tokenize?
    return "".join(c for c in text if c not in set(string.punctuation))


removepunctuation_udf = udf(lambda x : remove_punctuation(x))

df = df.withColumn("words_only", removepunctuation_udf(df.excerpt).cast(StringType()))

### Tokenize the punctuationless excerpts

In [9]:

tokenizer = Tokenizer(inputCol="words_only", outputCol="tokenized")
df = tokenizer.transform(df).persist()


### regexTokenizer
#### May be worth playing with later - to split and remove punctuation in one step <br> and to handle cases like church-yard vs. instance--literally 

In [None]:
# df= df.drop('tokenized')

# re_Tokenizer = RegexTokenizer(inputCol="excerpt", outputCol="tokenized", pattern="\\w+")
# # re_tokenizer = RegexTokenizer(inputCol="excerpt", outputCol="tokenized")
# df = re_tokenizer.transform(df).persist()

In [10]:
df.createOrReplaceTempView("data")
temp = spark.sql('''
    SELECT author, excerpt, tokenized
    FROM data 
    LIMIT 10
    ''').show()

+--------------+--------------------+--------------------+
|        author|             excerpt|           tokenized|
+--------------+--------------------+--------------------+
|CharlesDickens|A CHRISTMAS CAROL...|[a, christmas, ca...|
|CharlesDickens|Mind! I don't mea...|[mind, i, dont, m...|
|CharlesDickens|Scrooge never pai...|[scrooge, never, ...|
|CharlesDickens|Nobody ever stopp...|[nobody, ever, st...|
|CharlesDickens|The door of Scroo...|[the, door, of, s...|
|CharlesDickens|"I do," said Scro...|[i, do, said, scr...|
|CharlesDickens|"Uncle!" pleaded ...|[uncle, pleaded, ...|
|CharlesDickens|The clerk in the ...|[the, clerk, in, ...|
|CharlesDickens|"I am sorry, with...|[i, am, sorry, wi...|
|CharlesDickens|"Mr. Marley has b...|[mr, marley, has,...|
+--------------+--------------------+--------------------+



### Remove stop words?
##### Stop words left in for the moment on the theory/suspicion that authors' use these terms may be relevant to identification.

In [None]:
# remover = StopWordsRemover(inputCol="tokenized", outputCol="words_nostops")
# df = remover.transform(df).persist()

### CountVectorize token lists in df (turn wordlist into a vector of word counts)

In [11]:

cv = CountVectorizer(inputCol="tokenized", outputCol="count_vectorized")
cvmodel = cv.fit(df)
df = cvmodel.transform(df).persist()


### Calculate Term Frequency - Inverse Document Frequency

In [12]:

idf = IDF(inputCol="count_vectorized", outputCol="tfidf")
idfmodel = idf.fit(df)
df = idfmodel.transform(df).persist()


In [14]:
df.take(10)[9][]

Row(author='CharlesDickens', excerpt='"Mr. Marley has been dead these seven years," Scrooge replied. "He died seven years ago, this very night." || "We have no doubt his liberality is well represented by his surviving partner," said the gentleman, presenting his credentials. || It certainly was; for they had been two kindred spirits. At the ominous word "liberality" Scrooge frowned, and shook his head, and handed the credentials back. || "At this festive season of the year, Mr. Scrooge," said the gentleman, taking up a pen, "it is more than usually desirable that we should make some slight provision for the poor and destitute, who suffer greatly at the present time. Many thousands are in want of common necessaries; hundreds of thousands are in want of common comforts, sir." || "Are there no prisons?" asked Scrooge. || "Plenty of prisons," said the gentleman, laying down the pen again. || "And the Union workhouses?" demanded Scrooge. "Are they still in operation?" || "They are. Still," 

## Word2Vec

In [11]:
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="tokenized", outputCol="w2v")
w2vmodel = word2Vec.fit(df)
df = w2vmodel.transform(df).persist()

## N-grams

In [16]:
ngram = NGram(n=2, inputCol="tokenized", outputCol="2_gram")
df = ngram.transform(df)

ngram = NGram(n=3, inputCol="tokenized", outputCol="3_gram")
df = ngram.transform(df)

ngram = NGram(n=4, inputCol="tokenized", outputCol="4_gram")
df = ngram.transform(df)

ngram = NGram(n=5, inputCol="tokenized", outputCol="5_gram")
df = ngram.transform(df)


In [17]:
df = df.persist()

### The FINAL DataFrame!

In [None]:
df.createOrReplaceTempView("data")
sqldf = spark.sql('''
    SELECT tokenized, count_vectorized, tfidf, w2v
    FROM data
    LIMIT 6
    ''')
sqldf.show()

In [18]:
df.createOrReplaceTempView("data")
sqldf = spark.sql('''
    SELECT tokenized, 2_gram, 5_gram
    FROM data
    LIMIT 6
    ''')
sqldf.show()

+--------------------+--------------------+--------------------+
|           tokenized|              2_gram|              5_gram|
+--------------------+--------------------+--------------------+
|[a, christmas, ca...|[a christmas, chr...|[a christmas caro...|
|[mind, i, dont, m...|[mind i, i dont, ...|[mind i dont mean...|
|[scrooge, never, ...|[scrooge never, n...|[scrooge never pa...|
|[nobody, ever, st...|[nobody ever, eve...|[nobody ever stop...|
|[the, door, of, s...|[the door, door o...|[the door of scro...|
|[i, do, said, scr...|[i do, do said, s...|[i do said scroog...|
+--------------------+--------------------+--------------------+



## Save test

In [None]:
sqldf.write.save("data/save_test.parquet", format="parquet")

In [None]:
spark.sql("SELECT * FROM parquet.`data/save_test.parquet`").show()

In [None]:
sqldf.createOrReplaceTempView("sqldf")

In [None]:
# not too slow anymore
spark.sql("SELECT tokenized FROM data LIMIT 10").show()

In [None]:
# VERY FAST (direct parquet access)
spark.sql("SELECT tokenized FROM parquet.`data/save_test.parquet`").show()

In [None]:
# Use formatting to set table name

table = "parquet.`data/save_test.parquet`"

spark.sql("SELECT * FROM {}"
          .format(table)).show()

## Query Data directly from Parquet - store data as pandas for plotting etc.

(MOVE to Explore and Visualize notebook)

In [None]:
pddf = spark.sql("SELECT * FROM {}"
          .format(table)).toPandas()

In [None]:
pddf.head().T

## Saving the Final DataFrame!

In [None]:
df.write.mode('overwrite').save("data/excerpt_df.parquet", format="parquet")

In [None]:
df.printSchema()