In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as func
from pyspark.sql.types import StructField, StructType, IntegerType, StringType


In [2]:
spark = SparkSession.builder.appName("PopularSuperheroes").getOrCreate()
spark


In [3]:
# schema for marvel names
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)])


In [4]:
df_names = spark.read.schema(schema).option("delimiter", " ").csv("Marvel-Names")
df_names.printSchema()


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [5]:
df_names.show(5)

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|24-HOUR MAN/EMMANUEL|
|  2|3-D MAN/CHARLES CHAN|
|  3|    4-D MAN/MERCURIO|
|  4|             8-BALL/|
|  5|                   A|
+---+--------------------+
only showing top 5 rows



In [6]:
df_lines = spark.read.text("Marvel-Graph")
df_lines.show(5)


+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
+--------------------+
only showing top 5 rows



In [13]:
df_connections = df_lines.withColumn("id", func.split("value", " ")[0]) \
    .withColumn("connections", func.size(func.split("value", " ")) - 1) \
    .groupBy("id").sum("connections").orderBy("sum(connections)", ascending=False)

In [14]:
df_connections.show(5)

+----+----------------+
|  id|sum(connections)|
+----+----------------+
| 859|            1937|
|5306|            1745|
|2664|            1532|
|5716|            1429|
|6306|            1397|
+----+----------------+
only showing top 5 rows



In [22]:
# join with df_names (superhero names)
df_results = df_connections.join(df_names, df_connections.id == df_names.id, "left") \
    .select(df_connections.id, "name", func.col("sum(connections)").alias("connections"))
df_results.show(5)


+----+--------------------+-----------+
|  id|                name|connections|
+----+--------------------+-----------+
| 859|     CAPTAIN AMERICA|       1937|
|5306|SPIDER-MAN/PETER PAR|       1745|
|2664|IRON MAN/TONY STARK |       1532|
|5716|THING/BENJAMIN J. GR|       1429|
|6306|    WOLVERINE/LOGAN |       1397|
+----+--------------------+-----------+
only showing top 5 rows



In [23]:
df_results.where(df_results.connections == 1).show()

+----+--------------------+-----------+
|  id|                name|connections|
+----+--------------------+-----------+
| 467|        BERSERKER II|          1|
| 577|              BLARE/|          1|
|3490|MARVEL BOY II/MARTIN|          1|
|3489|MARVEL BOY/MARTIN BU|          1|
|2139|      GIURESCU, RADU|          1|
|1089|       CLUMSY FOULUP|          1|
|1841|              FENRIS|          1|
|4517|              RANDAK|          1|
|5028|           SHARKSKIN|          1|
| 835|     CALLAHAN, DANNY|          1|
|1408|         DEATHCHARGE|          1|
|4784|                RUNE|          1|
|4945|         SEA LEOPARD|          1|
|4602|         RED WOLF II|          1|
|6411|              ZANTOR|          1|
|3014|JOHNSON, LYNDON BAIN|          1|
|3298|          LUNATIK II|          1|
|2911|                KULL|          1|
|2117|GERVASE, LADY ALYSSA|          1|
+----+--------------------+-----------+

