### Text Processing for Author Recognition using Spark

#### Import statements

In [1]:
import pyspark as ps    # import the spark suite
import warnings         # display warning if spark context already exists
import os

#### Initialize Spark Context

In [2]:
# Create the SparkContext used by every later cell.
# 'local[4]' runs Spark locally with 4 worker threads (use 'local[*]' for one per available core).
try:
    sc = ps.SparkContext('local[4]')
    print('created SparkContext')
except ValueError:
    # Raised when a SparkContext is already running in this process (e.g. the cell was re-run).
    warnings.warn('SparkContext already exists')
    # Bind the running context so `sc` is always defined for the cells below,
    # even if this kernel never assigned it before.
    sc = ps.SparkContext.getOrCreate()



### Read data.json into Spark SQL context

In [3]:
# Build the SQL entry point on top of the SparkContext `sc`;
# later cells use `spark` to read JSON and to run SQL queries.
spark = ps.SQLContext(sparkContext=sc)
print('created SQLContext')

created SQLContext


In [5]:
# Location of the JSON input file.
data_file = 'data/data.json'
# Read the file into a DataFrame through the SQL context's reader.
df = spark.read.json(path=data_file)

CONSIDER:

For fun, use an RDD and map/reduce to remove the double bars (`||`) I put into the excerpts!

In [6]:
# Quick look at the loaded data: schema, row count, and the first few rows.
# printSchema() writes the schema itself and returns None, so it is called
# directly; wrapping it in print would emit a stray "None" line.
df.printSchema()
# Function-call form of print works on both Python 2 and Python 3.
print(df.count())
# show(3) prints the first 3 rows as a text table.
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



### Here we import some SQL functions and give our dataframe a SQL table name

In [7]:
# Column-function helpers from the PySpark DataFrame API.
# NOTE(review): the queries in the cells shown here call length()/count() inside
# the SQL text passed to spark.sql(); these Python imports are not referenced in
# those cells — confirm whether later cells use them.
from pyspark.sql.functions import length
from pyspark.sql.functions import count

# Register the DataFrame under the name "excerpts" so that spark.sql() queries
# can refer to it in their FROM clause.
df.createOrReplaceTempView("excerpts")

In [8]:
# Count the rows of the "excerpts" view whose author is MarkTwain.
twain_count_query = "SELECT count(*) FROM excerpts WHERE author = 'MarkTwain'"
sqlDF = spark.sql(twain_count_query)
sqlDF.show()

+--------+
|count(1)|
+--------+
|    2284|
+--------+



In [11]:
# List each MarkTwain excerpt together with length(excerpt).
# The author value is written as a single-quoted SQL string literal, matching
# the count query; a double-quoted token is parsed as an identifier (column
# name) when ANSI double-quoted identifiers are enabled, which would break
# this query.
sqlDF = spark.sql("""SELECT excerpt, length(excerpt)
                     FROM excerpts
                     WHERE author = 'MarkTwain'""")
# Print only the first 10 rows.
sqlDF.show(10)

+--------------------+---------------+
|             excerpt|length(excerpt)|
+--------------------+---------------+
|A CONNECTICUT YAN...|           1275|
|“You know about t...|           1529|
|HOW SIR LAUNCELOT...|           3034|
|And then they all...|           1574|
|As I laid the boo...|           1211|
|Well, a man like ...|           1224|
|“Fair sir, will y...|           1499|
|“Bridgeport?” sai...|           1197|
| || THE TALE OF T...|           1999|
|As we approached ...|           2457|
+--------------------+---------------+
only showing top 10 rows

