In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the notebook's Spark session, running against a local master.
spark = (
    SparkSession.builder
    .appName('pyspark_exercise')
    .master('local[*]')
    # Arrow-backed conversion between Spark and pandas.
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    # Pin the session time zone to UTC.
    .config('spark.sql.session.timeZone', 'UTC')
    .config('spark.driver.memory','8G')
    # Eager evaluation: a bare DataFrame expression at the end of a cell renders as a table.
    .config('spark.sql.repl.eagerEval.enabled', True)
    .getOrCreate()
)

In [5]:
# Relative path to the occupation CSV that the next cell reads with a "|" delimiter.
file = "../data/occupation.csv"

In [7]:
# Load the pipe-delimited CSV; the first line is the header and column types are inferred.
df = (
    spark.read
    .options(delimiter="|", inferSchema="true", header="true")
    .csv(file)
)

1. print schema:

In [8]:
# List the column names of the loaded DataFrame.
df.columns

['user_id', 'age', 'gender', 'occupation', 'zip_code']

In [9]:
# Print each column's inferred data type and nullability.
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)



In [10]:
# Total number of rows in the DataFrame.
df.count()

943

In [11]:
# Preview the first 10 rows as a text table.
df.show(10)

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
+-------+---+------+-------------+--------+
only showing top 10 rows



In [12]:
# Project only the user_id and age columns.
df.select('user_id', 'age')

user_id,age
1,24
2,53
3,23
4,24
5,33
6,42
7,57
8,36
9,29
10,53


In [13]:
# Keep only users aged 30 or younger.
df.where(df.age <= 30)

user_id,age,gender,occupation,zip_code
1,24,M,technician,85711
3,23,M,writer,32067
4,24,M,technician,43537
9,29,M,student,1002
12,28,F,other,6405
16,21,M,entertainment,10309
17,30,M,programmer,6355
21,26,M,writer,30068
22,25,M,writer,40206
23,30,F,artist,48197


In [14]:
# Copy of df with zip_code exposed under the name postal_code; df itself is unchanged.
df2 = df.withColumnRenamed(existing="zip_code", new="postal_code")

In [15]:
# Display the renamed DataFrame (zip_code now appears as postal_code).
df2

user_id,age,gender,occupation,postal_code
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


In [16]:
import pyspark.sql.functions as f

In [17]:
# Append a constant string column named "new" to every row.
df.select("*", f.lit("unknown").alias("new"))

user_id,age,gender,occupation,zip_code,new
1,24,M,technician,85711,unknown
2,53,F,other,94043,unknown
3,23,M,writer,32067,unknown
4,24,M,technician,43537,unknown
5,33,F,other,15213,unknown
6,42,M,executive,98101,unknown
7,57,M,administrator,91344,unknown
8,36,M,administrator,5201,unknown
9,29,M,student,1002,unknown
10,53,M,lawyer,90703,unknown


In [18]:
# Add an "enriched" column holding the MD5 hex digest of each zip_code.
df.withColumn("enriched", f.md5(f.col("zip_code")))

user_id,age,gender,occupation,zip_code,enriched
1,24,M,technician,85711,0ef37031ca05ded69...
2,53,F,other,94043,6ae39b3f38de2240b...
3,23,M,writer,32067,32012097fe8ac018c...
4,24,M,technician,43537,10fa97233fb64af4b...
5,33,F,other,15213,05f7136bcb0b2ca2d...
6,42,M,executive,98101,468977905e16600cb...
7,57,M,administrator,91344,82b5f7f58bb61cf00...
8,36,M,administrator,5201,ad384f4dec7f2b218...
9,29,M,student,1002,45568b715fda02135...
10,53,M,lawyer,90703,b803a999ff8aafc51...


In [19]:
# Average age across all users (single-row result).
df.agg(f.avg("age"))

avg(age)
34.05196182396607


In [20]:
# Same average, printed explicitly as a text table.
df.agg(f.avg("age")).show()

+-----------------+
|         avg(age)|
+-----------------+
|34.05196182396607|
+-----------------+



In [21]:
# Oldest users first.
df.orderBy(f.desc("age"))

user_id,age,gender,occupation,zip_code
481,73,M,retired,37771
803,70,M,administrator,78212
767,70,M,engineer,0
860,70,F,retired,48322
559,69,M,executive,10022
585,69,M,librarian,98501
349,68,M,retired,61455
573,68,M,retired,48911
211,66,M,salesman,32605
651,65,M,retired,2903


In [22]:
# Most common occupation: count users per occupation, order by that count
# descending, and show only the top row.
# groupBy().count() already names its result column "count". The previous
# DataFrame.alias("count") call aliased the DataFrame, not the column, so it had
# no effect on the result and has been removed.
df.groupBy("occupation").count().orderBy(f.desc("count")).show(1)

+----------+-----+
|occupation|count|
+----------+-----+
|   student|  196|
+----------+-----+
only showing top 1 row



In [23]:
# Number of distinct zip_code values.
df.select("zip_code").dropDuplicates().count()

795

In [25]:
# Youngest and oldest age in the dataset (single-row result).
df.select(f.min("age"), f.max("age"))

min(age),max(age)
7,73


In [26]:
# Youngest users first, printed with show()'s default row limit.
df.orderBy("age").show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|     30|  7|     M|      student|   55436|
|    471| 10|     M|      student|   77459|
|    289| 11|     M|         none|   94619|
|    142| 13|     M|        other|   48118|
|    628| 13|     M|         none|   94306|
|    674| 13|     F|      student|   55337|
|    609| 13|     F|      student|   55106|
|    880| 13|     M|      student|   83702|
|    206| 14|     F|      student|   53115|
|    813| 14|     F|      student|   02136|
|    887| 14|     F|      student|   27249|
|    179| 15|     M|entertainment|   20755|
|    101| 15|     M|      student|   05146|
|    281| 15|     F|      student|   06059|
|    618| 15|     F|      student|   44212|
|    461| 15|     M|      student|   98102|
|    849| 15|     F|      student|   25652|
|     57| 16|     M|         none|   84010|
|    434| 16|     F|      student|   49705|
|    451| 16|     M|      studen

In [27]:
# Mean age for every (occupation, gender) pair.
df.groupBy("occupation", "gender").avg("age").show()


+-------------+------+------------------+
|   occupation|gender|          avg(age)|
+-------------+------+------------------+
|   technician|     M| 32.96153846153846|
|     educator|     F| 39.11538461538461|
|       lawyer|     F|              39.5|
|entertainment|     F|              31.0|
|       lawyer|     M|              36.2|
|      retired|     F|              70.0|
|      student|     F|             20.75|
|   healthcare|     F| 39.81818181818182|
|administrator|     M| 37.16279069767442|
|    marketing|     M|            37.875|
|     engineer|     F|              29.5|
|    homemaker|     F|34.166666666666664|
|       artist|     F|30.307692307692307|
|         none|     F|              36.5|
|       doctor|     M| 43.57142857142857|
|       writer|     F| 37.63157894736842|
|     educator|     M| 43.10144927536232|
|    scientist|     M| 36.32142857142857|
|   technician|     F|              38.0|
|       writer|     M| 35.34615384615385|
+-------------+------+------------

In [31]:
# Same grouping, after first discarding any row that contains a null value.
df.na.drop().groupBy("occupation", "gender").agg(f.avg("age")).show()

+-------------+------+------------------+
|   occupation|gender|          avg(age)|
+-------------+------+------------------+
|   technician|     M| 32.96153846153846|
|     educator|     F| 39.11538461538461|
|       lawyer|     F|              39.5|
|entertainment|     F|              31.0|
|       lawyer|     M|              36.2|
|      retired|     F|              70.0|
|      student|     F|             20.75|
|   healthcare|     F| 39.81818181818182|
|administrator|     M| 37.16279069767442|
|    marketing|     M|            37.875|
|     engineer|     F|              29.5|
|    homemaker|     F|34.166666666666664|
|       artist|     F|30.307692307692307|
|         none|     F|              36.5|
|       doctor|     M| 43.57142857142857|
|       writer|     F| 37.63157894736842|
|     educator|     M| 43.10144927536232|
|    scientist|     M| 36.32142857142857|
|   technician|     F|              38.0|
|       writer|     M| 35.34615384615385|
+-------------+------+------------

In [34]:
# Users whose occupation is the literal string "none" (a real value, not a null).
df.filter(df.occupation == "none")

user_id,age,gender,occupation,zip_code
57,16,M,none,84010
127,33,M,none,73439
130,20,M,none,60115
256,35,F,none,39042
289,11,M,none,94619
418,55,F,none,21206
628,13,M,none,94306
657,26,F,none,78704
756,30,F,none,90247
