In [13]:
from pyspark.sql import SparkSession


# Start (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .appName("00 SparkSession")
    .master("local[*]")
    .getOrCreate()
)

# Sample rows, one list per user, in the same order as the schema fields.
data = [
    [1, "Gav", 35, 100],
    [2, "Zac", 34, 150],
    [3, "Hr", 28, 120],
    [4, "Mar", 26, 210],
]

# DDL-style schema string: each column name is followed by its Spark SQL type.
schema = "id int, name string, age int, salary int"

df_users = spark.createDataFrame(data, schema=schema)

# Print the DataFrame as a text table.
df_users.show()


+---+----+---+------+
| id|name|age|salary|
+---+----+---+------+
|  1| Gav| 35|   100|
|  2| Zac| 34|   150|
|  3|  Hr| 28|   120|
|  4| Mar| 26|   210|
+---+----+---+------+



In [14]:
# Cast salary from int to double. `cast` here is the Column method, so only
# `col` has to be imported from pyspark.sql.functions.
from pyspark.sql.functions import col

# select() keeps only the listed columns, so `age` is not carried forward.
df_users = df_users.select("id", "name", col("salary").cast("double"))

In [15]:
# Add a derived column: tax is salary multiplied by 0.2.
tax_expr = col("salary") * 0.2
df_users = df_users.withColumn("tax", tax_expr)
df_users.show()

+---+----+------+----+
| id|name|salary| tax|
+---+----+------+----+
|  1| Gav| 100.0|20.0|
|  2| Zac| 150.0|30.0|
|  3|  Hr| 120.0|24.0|
|  4| Mar| 210.0|42.0|
+---+----+------+----+



In [16]:
# lit() wraps a Python constant in a Column, so every row gets the same value.
from pyspark.sql.functions import lit

title_value = lit("Mr ")  # note the trailing space in the literal
df_users = df_users.withColumn("title", title_value)
df_users.show()

+---+----+------+----+-----+
| id|name|salary| tax|title|
+---+----+------+----+-----+
|  1| Gav| 100.0|20.0|  Mr |
|  2| Zac| 150.0|30.0|  Mr |
|  3|  Hr| 120.0|24.0|  Mr |
|  4| Mar| 210.0|42.0|  Mr |
+---+----+------+----+-----+



In [17]:
# Rename a single column; the other columns are left as they are.
df_users = df_users.withColumnRenamed(existing="name", new="user_name")
df_users.show()

+---+---------+------+----+-----+
| id|user_name|salary| tax|title|
+---+---------+------+----+-----+
|  1|      Gav| 100.0|20.0|  Mr |
|  2|      Zac| 150.0|30.0|  Mr |
|  3|       Hr| 120.0|24.0|  Mr |
|  4|      Mar| 210.0|42.0|  Mr |
+---+---------+------+----+-----+



In [22]:
# Drop the columns that are no longer needed, then preview the first three rows.
unwanted_columns = ["title", "id"]
df_users = df_users.drop(*unwanted_columns)
df_users.limit(3).show()

+---------+------+----+
|user_name|salary| tax|
+---------+------+----+
|      Gav| 100.0|20.0|
|      Zac| 150.0|30.0|
|       Hr| 120.0|24.0|
+---------+------+----+



In [23]:
# Add several columns in one call by passing a {name: Column} mapping.
# NOTE(review): "doubletax" is computed from salary, not from tax — confirm
# that this is the intended value.
columns = dict(
    doubletax=col("salary") * 2,
    title=lit("Mr"),
)

# The result is only displayed; df_users itself is not reassigned here.
df_users.withColumns(columns).show()


+---------+------+----+---------+-----+
|user_name|salary| tax|doubletax|title|
+---------+------+----+---------+-----+
|      Gav| 100.0|20.0|    200.0|   Mr|
|      Zac| 150.0|30.0|    300.0|   Mr|
|       Hr| 120.0|24.0|    240.0|   Mr|
|      Mar| 210.0|42.0|    420.0|   Mr|
+---------+------+----+---------+-----+

