In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func


In [2]:
# Start the Spark session for this notebook under the app name "FakeFriends";
# getOrCreate() hands back an existing session if one is already running.
spark = (
    SparkSession.builder
    .appName("FakeFriends")
    .getOrCreate()
)
spark


In [22]:
# Read the CSV with its first line treated as the header row and with column
# types inferred from the data, then print the resulting schema.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("fakefriends-header.csv")
)
df.printSchema()


root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [24]:
# Keep only the two columns used by the per-age aggregation below.
df_friends = df.select(df["age"], df["friends"])
df_friends.show(n=5)


+---+-------+
|age|friends|
+---+-------+
| 33|    385|
| 26|      2|
| 55|    221|
| 40|    465|
| 68|     21|
+---+-------+
only showing top 5 rows



In [27]:
# Average number of friends for each age, listed in ascending age order.
(
    df_friends
    .groupBy("age")
    .avg("friends")
    .orderBy("age")
    .show(10)
)

+---+------------------+
|age|      avg(friends)|
+---+------------------+
| 18|           343.375|
| 19|213.27272727272728|
| 20|             165.0|
| 21|           350.875|
| 22|206.42857142857142|
| 23|             246.3|
| 24|             233.8|
| 25|197.45454545454547|
| 26|242.05882352941177|
| 27|           228.125|
+---+------------------+
only showing top 10 rows



In [32]:
# Per-age average again, rounded to one decimal place; alias() renames the
# aggregated column to "friends_avg".
(
    df.groupBy("age")
    .agg(func.round(func.avg("friends"), 1).alias("friends_avg"))
    .sort("age")
    .show()
)

+---+-----------+
|age|friends_avg|
+---+-----------+
| 18|      343.4|
| 19|      213.3|
| 20|      165.0|
| 21|      350.9|
| 22|      206.4|
| 23|      246.3|
| 24|      233.8|
| 25|      197.5|
| 26|      242.1|
| 27|      228.1|
| 28|      209.1|
| 29|      215.9|
| 30|      235.8|
| 31|      267.3|
| 32|      207.9|
| 33|      325.3|
| 34|      245.5|
| 35|      211.6|
| 36|      246.6|
| 37|      249.3|
+---+-----------+
only showing top 20 rows



In [33]:
# select: project a single column out of the DataFrame
df.select(df["name"]).show(n=5)

+--------+
|    name|
+--------+
|    Will|
|Jean-Luc|
|    Hugh|
|  Deanna|
|   Quark|
+--------+
only showing top 5 rows



In [34]:
# filter: keep only the rows whose age is greater than 21
df.filter(df["age"] > 21).show(n=5)

+------+--------+---+-------+
|userID|    name|age|friends|
+------+--------+---+-------+
|     0|    Will| 33|    385|
|     1|Jean-Luc| 26|      2|
|     2|    Hugh| 55|    221|
|     3|  Deanna| 40|    465|
|     4|   Quark| 68|     21|
+------+--------+---+-------+
only showing top 5 rows



In [35]:
# groupBy: count the rows for each age (no ordering is requested here)
(
    df.groupBy("age")
    .count()
    .show(n=5)
)

+---+-----+
|age|count|
+---+-----+
| 31|    8|
| 65|    5|
| 53|    7|
| 34|    6|
| 28|   10|
+---+-----+
only showing top 5 rows



In [36]:
# Column arithmetic inside select: show each name next to age + 10.
df.select(df["name"], df["age"] + 10).show(n=5)

+--------+----------+
|    name|(age + 10)|
+--------+----------+
|    Will|        43|
|Jean-Luc|        36|
|    Hugh|        65|
|  Deanna|        50|
|   Quark|        78|
+--------+----------+
only showing top 5 rows

