In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (created if none is active yet).
spark = (
    SparkSession.builder
    .appName("HogwartsAnalysis")
    .getOrCreate()
)

# Column names, in the same order as the values inside each row tuple.
columns = ["house", "year", "points", "student"]

# One tuple per record: (house, year, points, student).
data = [
    ("Gryffindor", 1, 80, "Harry Potter"),
    ("Slytherin", 1, 60, "Draco Malfoy"),
    ("Ravenclaw", 1, 45, "Luna Lovegood"),
    ("Hufflepuff", 1, 30, "Cedric Diggory"),
    ("Gryffindor", 2, 90, "Hermione Granger"),
    ("Slytherin", 2, 70, "Pansy Parkinson"),
    ("Ravenclaw", 2, 55, "Cho Chang"),
    ("Hufflepuff", 2, 65, "Hannah Abbott"),
    ("Gryffindor", 3, 20, "Ron Weasley"),
    ("Slytherin", 3, 85, "Blaise Zabini"),
]

# Column types are inferred from the Python values; only names are supplied.
df = spark.createDataFrame(data, schema=columns)
df.show()

+----------+----+------+----------------+
|     house|year|points|         student|
+----------+----+------+----------------+
|Gryffindor|   1|    80|    Harry Potter|
| Slytherin|   1|    60|    Draco Malfoy|
| Ravenclaw|   1|    45|   Luna Lovegood|
|Hufflepuff|   1|    30|  Cedric Diggory|
|Gryffindor|   2|    90|Hermione Granger|
| Slytherin|   2|    70| Pansy Parkinson|
| Ravenclaw|   2|    55|       Cho Chang|
|Hufflepuff|   2|    65|   Hannah Abbott|
|Gryffindor|   3|    20|     Ron Weasley|
| Slytherin|   3|    85|   Blaise Zabini|
+----------+----+------+----------------+



In [2]:
from pyspark.sql.functions import col, sum as spark_sum

# Rows must score strictly more than this to be included in the totals.
POINTS_THRESHOLD = 50

# Project down to the three columns the aggregation needs.
df1 = df.select(col("house"), col("year"), col("points"))

# Drop every row at or below the threshold before aggregating.
df2 = df1.where(col("points") > POINTS_THRESHOLD)

# Total the surviving points for each (house, year) pair.
df3 = (
    df2.groupBy("house", "year")
    .agg(spark_sum("points").alias("total_points"))
)

# Earliest year first; within a year, the largest total first.
df4 = df3.sort(col("year").asc(), col("total_points").desc())
df4.show()


+----------+----+------------+
|     house|year|total_points|
+----------+----+------------+
|Gryffindor|   1|          80|
| Slytherin|   1|          60|
|Gryffindor|   2|          90|
| Slytherin|   2|          70|
|Hufflepuff|   2|          65|
| Ravenclaw|   2|          55|
| Slytherin|   3|          85|
+----------+----+------------+



In [3]:
from pyspark.sql import SparkSession

# Obtain a Spark session; getOrCreate() hands back an existing session
# when one is already active instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Ring Battle Stats")
    .getOrCreate()
)

# Column names, in the same order as the values inside each row tuple.
columns = ["name", "race", "enemies_defeated", "injuries", "battle"]

# One tuple per fighter per battle:
# (name, race, enemies_defeated, injuries, battle).
data = [
    ("Aragorn", "Human", 10, 2, "Helms Deep"),
    ("Legolas", "Elf", 15, 0, "Helms Deep"),
    ("Gimli", "Dwarf", 8, 3, "Helms Deep"),
    ("Frodo", "Hobbit", 2, 1, "Moria"),
    ("Sam", "Hobbit", 4, 2, "Moria"),
    ("Gandalf", "Wizard", 12, 1, "Moria"),
    ("Boromir", "Human", 7, 4, "Amon Hen"),
    ("Legolas", "Elf", 20, 0, "Amon Hen"),
    ("Aragorn", "Human", 9, 2, "Amon Hen"),
]

# Column types are inferred from the Python values; only names are supplied.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+------+----------------+--------+----------+
|   name|  race|enemies_defeated|injuries|    battle|
+-------+------+----------------+--------+----------+
|Aragorn| Human|              10|       2|Helms Deep|
|Legolas|   Elf|              15|       0|Helms Deep|
|  Gimli| Dwarf|               8|       3|Helms Deep|
|  Frodo|Hobbit|               2|       1|     Moria|
|    Sam|Hobbit|               4|       2|     Moria|
|Gandalf|Wizard|              12|       1|     Moria|
|Boromir| Human|               7|       4|  Amon Hen|
|Legolas|   Elf|              20|       0|  Amon Hen|
|Aragorn| Human|               9|       2|  Amon Hen|
+-------+------+----------------+--------+----------+



In [4]:
from pyspark.sql.functions import col, avg

# Rows must record strictly more defeated enemies than this to be averaged.
ENEMIES_THRESHOLD = 5

# Project down to the fighter, their race, and the defeat count.
df5 = df.select(col("name"), col("race"), col("enemies_defeated"))

# Drop every row at or below the threshold before averaging.
df6 = df5.where(col("enemies_defeated") > ENEMIES_THRESHOLD)

# Mean defeat count per race over the remaining rows.
df7 = (
    df6.groupBy("race")
    .agg(avg(col("enemies_defeated")).alias("avg_enemies_defeated"))
)

# Highest average first.
df8 = df7.sort(col("avg_enemies_defeated").desc())
df8.show()


+------+--------------------+
|  race|avg_enemies_defeated|
+------+--------------------+
|   Elf|                17.5|
|Wizard|                12.0|
| Human|   8.666666666666666|
| Dwarf|                 8.0|
+------+--------------------+



In [5]:
from pyspark.sql import SparkSession

# Obtain a Spark session for the submarine-mission analysis.
# The app name previously read "HogwartsAnalysis" (copied from the first
# cell), which mislabelled this job whenever this cell created the session.
# getOrCreate() still hands back an existing session when one is active.
spark = (
    SparkSession.builder
    .appName("SubmarineMissionAnalysis")
    .getOrCreate()
)

# One tuple per mission:
# (submarine_id, mission_name, warheads_launched, status).
data = [
    ("SUB-01", "Pacific Strike", 5, "Success"),
    ("SUB-02", "Atlantic Surge", 2, "Failure"),
    ("SUB-01", "Arctic Blitz", 4, "Success"),
    ("SUB-03", "Indian Ocean", 6, "Success"),
    ("SUB-02", "Pacific Strike", 3, "Success"),
    ("SUB-01", "Coral Sea", 7, "Success"),
    ("SUB-03", "Arctic Blitz", 1, "Failure"),
    ("SUB-02", "Bering Strait", 5, "Success"),
]

# Column names, in the same order as the values inside each row tuple.
columns = ["submarine_id", "mission_name", "warheads_launched", "status"]

# Column types are inferred from the Python values; only names are supplied.
df = spark.createDataFrame(data, columns)
df.show()

+------------+--------------+-----------------+-------+
|submarine_id|  mission_name|warheads_launched| status|
+------------+--------------+-----------------+-------+
|      SUB-01|Pacific Strike|                5|Success|
|      SUB-02|Atlantic Surge|                2|Failure|
|      SUB-01|  Arctic Blitz|                4|Success|
|      SUB-03|  Indian Ocean|                6|Success|
|      SUB-02|Pacific Strike|                3|Success|
|      SUB-01|     Coral Sea|                7|Success|
|      SUB-03|  Arctic Blitz|                1|Failure|
|      SUB-02| Bering Strait|                5|Success|
+------------+--------------+-----------------+-------+



In [6]:
from pyspark.sql.functions import col, count, sum as spark_sum

# Missions must launch strictly more warheads than this to be counted.
WARHEADS_THRESHOLD = 3

# The two conditions a mission has to satisfy to be included.
launched_enough = col("warheads_launched") > WARHEADS_THRESHOLD
succeeded = col("status") == "Success"

# Keep qualifying missions, then project to the columns the summary needs.
df9 = (
    df.where(launched_enough & succeeded)
    .select("submarine_id", "mission_name", "warheads_launched")
)

# Per submarine: number of qualifying missions (non-null mission names)
# and the combined warhead count across them.
df10 = df9.groupBy("submarine_id").agg(
    count("mission_name").alias("total_missions"),
    spark_sum("warheads_launched").alias("total_warheads_launched"),
)

# Submarine with the most warheads launched first.
df11 = df10.sort(col("total_warheads_launched").desc())

df11.show()


+------------+--------------+-----------------------+
|submarine_id|total_missions|total_warheads_launched|
+------------+--------------+-----------------------+
|      SUB-01|             3|                     16|
|      SUB-03|             1|                      6|
|      SUB-02|             1|                      5|
+------------+--------------+-----------------------+

