### Text Processing for Author Recognition using Spark

#### Import statements

In [1]:
import pyspark as ps    # import the spark suite
import warnings         # display warning if spark context already exists
import os

#### Initialize Spark Context

In [2]:
# Create a local SparkContext, or fall back to the one that is already running.
try:
    # 'local[4]' runs Spark locally with 4 worker threads ('local[*]' would use every available core)
    sc = ps.SparkContext('local[4]')
    print('created SparkContext')
except ValueError:
    # A ValueError here normally means a context is already active (e.g. this cell was re-run).
    warnings.warn('SparkContext already exists')    # issue a warning if context already exists
    # Bind the active context so `sc` is defined even if it was created outside this cell.
    sc = ps.SparkContext.getOrCreate()



### Read data.json into Spark SQL context

In [3]:
# Wrap the SparkContext in a SQLContext; `spark` is what later cells use
# to read data and run SQL queries.
spark = ps.SQLContext(sparkContext=sc)
print('created SQLContext')

created SQLContext


In [5]:
# Location of the JSON dataset (relative path).
data_file = 'data/data.json'
# Load it into a DataFrame through the generic reader; no explicit schema is supplied.
df = spark.read.format('json').load(data_file)

**TODO (consider):**

For fun, use an RDD and map/reduce to remove the double bars (`||`) I put into the excerpts!

In [6]:
# printSchema() writes the schema to stdout itself and returns None, so call it
# directly; printing its return value only adds a stray "None" to the output.
df.printSchema()
# Total number of rows. A single-argument print(...) behaves the same on Python 2 and 3.
print(df.count())
# Preview the first three rows.
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



#### NOTE
* Column Objects (like df.excerpt) cannot be called
* Use df.select(col_name).show(num) to view the column

In [62]:
# print "df.excerpt type = \t\t", type(df.excerpt)
# print "df.select('excerpt') type = \t", type(df.select('excerpt'))
# df.select('excerpt').show(10)

### Here we import some SQL functions and give our DataFrame a SQL table name

In [78]:
from pyspark.sql.functions import length
from pyspark.sql.functions import count

In [None]:
# Register the DataFrame as a temporary view named "excerpts" so the
# spark.sql(...) queries below can refer to it by that name.
df.createOrReplaceTempView("excerpts")

In [58]:
# Count the rows whose author is MarkTwain and display the one-row result.
spark.sql("SELECT count(*) FROM excerpts WHERE author = 'MarkTwain'").show()

+--------+
|count(1)|
+--------+
|    2284|
+--------+



SQL queries in Spark return DataFrames. Columns can then be selected through Spark's DataFrame syntax.

In [82]:
# Select every MarkTwain excerpt together with its length in characters.
sqlDF = spark.sql('''
        SELECT excerpt, length(excerpt) AS character_count
        FROM excerpts
        WHERE author = "MarkTwain"
        ''')

# Preview the first ten rows of the result.
sqlDF.show(10)

# Show the Python types of the query result and of one of its columns.
# Each print(...) takes a single pre-formatted string so the text is identical on
# Python 2 and Python 3; the two spaces after the colon reproduce the output of
# the original print statement.
print("type(sqlDF):  {}".format(type(sqlDF)))
print("type(sqlDF.character_count):  {}".format(type(sqlDF.character_count)))

+--------------------+---------------+
|             excerpt|character_count|
+--------------------+---------------+
|A CONNECTICUT YAN...|           1275|
|“You know about t...|           1529|
|HOW SIR LAUNCELOT...|           3034|
|And then they all...|           1574|
|As I laid the boo...|           1211|
|Well, a man like ...|           1224|
|“Fair sir, will y...|           1499|
|“Bridgeport?” sai...|           1197|
| || THE TALE OF T...|           1999|
|As we approached ...|           2457|
+--------------------+---------------+
only showing top 10 rows

type(sqlDF):  <class 'pyspark.sql.dataframe.DataFrame'>
type(sqlDF.character_count):  <class 'pyspark.sql.column.Column'>


In [102]:
# individual values seem a little clunky to access . . .

# head(5) returns a list of Row objects; take the first Row and read its
# character_count field (the second column of the SELECT) by name.
sqlDF.head(5)[0]['character_count']

1275