In [3]:
run ./00_Load_Demo_Data.ipynb

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)

root
 |-- department_id: string (nullable = true)
 |-- department_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- budget: string (nullable = true)



In [4]:
from pyspark.sql.functions import col, expr

# Refer to a column via attribute access on the DataFrame
emp.salary
# or, equivalently, via item access:
# emp["salary"]

Column<'salary'>

In [5]:
# Several ways of referencing a column can be mixed in a single select():
emp.select(
    col("employee_id"),                 # col() helper
    expr("name"),                       # SQL expression string
    expr("cast(age as int) as age"),    # SQL expression with a cast and an alias
    emp.salary,                         # attribute access on the DataFrame
).schema

# Alternatively, selectExpr takes SQL expression strings directly:
# emp.selectExpr("employee_id", "cast(age as int) as age", "salary").schema

StructType([StructField('employee_id', StringType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('salary', StringType(), True)])

In [6]:
# Build a Spark schema from a schema string
# NOTE(review): _parse_datatype_string is underscore-prefixed, i.e. private by Python
# convention — confirm it is stable across the PySpark versions in use.
from pyspark.sql.types import _parse_datatype_string

# Comma-separated "column_name type" pairs
string_schema ="name string, age int"

# Parse the string into a StructType (see the output below)
spark_schema = _parse_datatype_string(string_schema)
spark_schema

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True)])

In [7]:
from pyspark.sql.functions import expr, cast, lit, when, date_format

# Column.cast converts the string-typed salary to double. This statement is not the
# cell's last expression, so its schema is not displayed.
emp.select(col("salary").cast("double")).schema

# Chain several column transformations. salary is loaded as a string, so cast it
# explicitly before the arithmetic instead of relying on implicit coercion.
(
    emp.withColumn("tax", col("salary").cast("double") * 0.2)
    .withColumn("One", lit(1))                # add a literal column ...
    .withColumnRenamed("One", "Col_One")      # ... rename it ...
    .drop("Col_One", "age")                   # ... and drop it together with age
    .withColumn(
        "new_gender",
        when(col("gender") == "Male", "M")
        .when(col("gender") == "Female", "F")
        .otherwise(None),                     # any other value becomes null
    )
    .withColumn("date_as_string", date_format(col("hire_date"), "dd/MM/yyyy"))
    .limit(7)
    .show()
)

+-----------+-------------+-------------+------+------+----------+-------+----------+--------------+
|employee_id|department_id|         name|gender|salary| hire_date|    tax|new_gender|date_as_string|
+-----------+-------------+-------------+------+------+----------+-------+----------+--------------+
|        001|          101|     John Doe|  Male| 50000|2015-01-01|10000.0|         M|    01/01/2015|
|        002|          101|   Jane Smith|Female| 45000|2016-02-15| 9000.0|         F|    15/02/2016|
|        003|          102|    Bob Brown|  Male| 55000|2014-05-01|11000.0|         M|    01/05/2014|
|        004|          102|    Alice Lee|Female| 48000|2017-09-30| 9600.0|         F|    30/09/2017|
|        005|          103|    Jack Chan|  Male| 60000|2013-04-01|12000.0|         M|    01/04/2013|
|        006|          103|    Jill Wong|Female| 52000|2018-07-01|10400.0|         F|    01/07/2018|
|        007|          101|James Johnson|  Male| 70000|2012-03-15|14000.0|         M|    15

In [8]:
# Handle multiple column transformations at once: withColumns takes a dict that maps
# each new column name to the Column expression that produces it.
columns = {
    # NOTE(review): an earlier cell computes "tax" as salary * 0.2; here the factor is 2 — confirm intended.
    'tax': col('salary') * 2,
    'one': lit(1)
}

emp.withColumns(columns).show(5)

+-----------+-------------+----------+---+------+------+----------+--------+---+
|employee_id|department_id|      name|age|gender|salary| hire_date|     tax|one|
+-----------+-------------+----------+---+------+------+----------+--------+---+
|        001|          101|  John Doe| 30|  Male| 50000|2015-01-01|100000.0|  1|
|        002|          101|Jane Smith| 25|Female| 45000|2016-02-15| 90000.0|  1|
|        003|          102| Bob Brown| 35|  Male| 55000|2014-05-01|110000.0|  1|
|        004|          102| Alice Lee| 28|Female| 48000|2017-09-30| 96000.0|  1|
|        005|          103| Jack Chan| 40|  Male| 60000|2013-04-01|120000.0|  1|
+-----------+-------------+----------+---+------+------+----------+--------+---+
only showing top 5 rows



In [9]:
#Window functions

from pyspark.sql.window import Window
from pyspark.sql.functions import max,col,desc, row_number

# salary is loaded as a string (see the schema printed at the top of the notebook), so
# cast it to a number first; otherwise ordering and max() compare text lexicographically
# (e.g. "9000" would sort above "70000").
# NOTE(review): "int" keeps the whole-number display; use double/decimal if salaries can be fractional.
window_spec = Window.partitionBy(col("department_id")).orderBy(
    col("salary").cast("int").desc()
)

# Highest salary per department. `max` here is pyspark.sql.functions.max (imported
# above), not the Python builtin.
max_func = max(col("salary").cast("int")).over(window_spec)

# Position of each employee within the department, 1 = highest salary.
row_num_func = row_number().over(window_spec)

(
    emp.withColumn("maxSalary", max_func)
    .withColumn("HighestEmployee", row_num_func)
    .show(3)
)



+-----------+-------------+-------------+---+------+------+----------+---------+---------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|maxSalary|HighestEmployee|
+-----------+-------------+-------------+---+------+------+----------+---------+---------------+
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|    70000|              1|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|    70000|              2|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|    70000|              3|
+-----------+-------------+-------------+---+------+------+----------+---------+---------------+
only showing top 3 rows

