In [1]:
# list of marvel book vertices
booksPath = "file:///Users/delshawnkirksey/Development/spark/apps/tutorial_4_marvel/Books.txt"
# list of marvel character vertices
charactersPath = "file:///Users/delshawnkirksey/Development/spark/apps/tutorial_4_marvel/Characters.txt"
# list of relationships between characters and books
edgesPath = "file:///Users/delshawnkirksey/Development/spark/apps/tutorial_4_marvel/Edges.txt"

In [12]:
books = sc.textFile(booksPath)
characters = sc.textFile(charactersPath)
edges = sc.textFile(edgesPath)

In [18]:
# filters out the vertices and returns only edges
def edgeFilter(row):
    if '*' in row or '"' in row:
        return False
    else:
        return True
    
edgesFiltered = edges.filter(edgeFilter)

In [84]:
# maps each character to the list of books the character appears in
characterBookMap = edgesFiltered.map(lambda x: x.split())\
                                .map(lambda x: (x[0], x[1:]))

In [70]:
# parses the characters RDD to a Pair RDD -- (character ID, character Name)
# example row -- 'Vertex 7: ABBOTT, JACK'
# returns ('7', 'ABBOT, JACK')

def charParse(row):
    row = row.split(":")
    return (row[0][7:], row[1].strip())

# Pair RDDs have a function 'collectAsMap' that returns the RDD as a dictionary
characterLookup = characters.map(charParse).collectAsMap()

In [29]:
# Creates a Pair RDD of the character name and the # of books the character appears in and sorts in descendingly
characterStrength = characterBookMap.mapValues(lambda x: len(x))\
                                    .map(lambda x: (characterLookup[x[0]],x[1]))\
                                    .reduceByKey(lambda x, y: x+y)\
                                    .sortBy(lambda x: -x[1])

In [82]:
#view the top 10 characters in the Marvel Universe
characterStrength.take(10)

[('SPIDER-MAN/PETER PARKER', 1625),
 ('CAPTAIN AMERICA', 1367),
 ('IRON MAN/TONY STARK', 1168),
 ('THING/BENJAMIN J. GRIMM', 990),
 ('THOR/DR. DONALD BLAKE/SIGURD JARLSON II/JAKE OLSON/LOREN OLSON', 965),
 ('HUMAN TORCH/JOHNNY STORM', 908),
 ('MR. FANTASTIC/REED RICHARDS', 875),
 ('HULK/DR. ROBERT BRUCE BANNER', 841),
 ('WOLVERINE/LOGAN', 820),
 ('INVISIBLE WOMAN/SUE STORM RICHARDS', 782)]

In [32]:
# Co-occurence network

In [92]:
# reverse the characterBookMap (character is key, books are values) 
# to get bookCharacterMap (books are keys, characters are values)

# flatMapValues is a function that is used to apply a function on each record of an RDD
# and maps the resulting list (if any) into a flat list of records
bookCharacterMap = characterBookMap.flatMapValues(lambda x: x)\
                                    .map(lambda x: (x[1], x[0]))\
                                    .reduceByKey(lambda x,y: x+","+y)\
                                    .mapValues(lambda x: x.split(","))

In [88]:
# for each book we want to generate a complete list of all of the character pairs that appear in that book

# itertools combinations function can generate any number of combinations given a list and a number
import itertools
cooccurenceMap = bookCharacterMap.flatMap(lambda x: list(itertools.combinations(x[1], 2)))

In [89]:
# combines each character pair to get the number of times those characters appeared in a book together
# i.e. ((character1, character2), strength)
cooccurenceStrength = cooccurenceMap.map(lambda x: (x, 1))\
                                    .reduceByKey(lambda x,y: x+y)

In [93]:
# create a basic RDD of each record as 3 elements i.e. (character1, character2, strength)
coocurrenceEdges = cooccurenceStrength\
                    .map(lambda x: (x[0][0], x[0][1], x[1]))

In [44]:
# Find the most important relationships by understanding and modeling the relationships

In [63]:
sortedCooccurrence = coocurrenceEdges\
                    .sortBy(lambda x: x[2])\
                    .map(lambda x: (characterLookup[x[0]], characterLookup[x[1]], x[2]))

In [64]:
sortedCooccurrence.filter(lambda x: 'SPIDER-MAN/PETER PARKER' in x).take(10)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 130.0 failed 1 times, most recent failure: Lost task 0.0 in stage 130.0 (TID 102, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/delshawnkirksey/Development/spark/spark_env/lib/python3.6/site-packages/pyspark/rdd.py", line 1339, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-63-f463f0cd6b80>", line 1, in <lambda>
KeyError: ' 4228'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:455)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor50.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/delshawnkirksey/Development/spark/spark_env/lib/python3.6/site-packages/pyspark/rdd.py", line 1339, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-63-f463f0cd6b80>", line 1, in <lambda>
KeyError: ' 4228'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
