In [1]:
from pyspark.sql import Row

In [3]:
import datetime
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]

# the courses(list) column converted to Array in Spark DataFrame
# The phone_numbers(Row) column converted to Struct in Spark DataFrame

In [6]:
# Arrow is available as an optimization when converting Spark Data Frames to and from Pandas Data Frames.

import pandas as pd

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled",False)

users_df = spark.createDataFrame(pd.DataFrame(users))

In [7]:
users_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

                                                                                

In [8]:
users_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



## Overview of Narrow and wide transformations

In [9]:
# Reference - https://www.databricks.com/glossary/what-are-transformations

* Here are the functions related to narrow transformations. Narrow transformations doesn't result in shuffling. These are also known as row level transformations.
    * df.select
    * df.filter
    * df.withColumn
    * df.withColumnRenamed
    * df.drop
* Here are the functions related to wide transformations.
    * df.distinct
    * df.union or any set operation
    * df.join or any join operation
    * df.groupBy
    * df.sort or df.orderBy
* Any function that result in shuffling is wide transformation. For all the wide transformations, we have to deal with group of records based on a key.

## Select

In [12]:
help(users_df.select)

Help on method select in module pyspark.sql.dataframe:

select(*cols) method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of expressions and returns a new :class:`DataFrame`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, :class:`Column`, or list
        column names (string) or expressions (:class:`Column`).
        If one of the column names is '*', that column is expanded to include all columns
        in the current :class:`DataFrame`.
    
    Examples
    --------
    >>> df.select('*').collect()
    [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    >>> df.select('name', 'age').collect()
    [Row(name='Alice', age=2), Row(name='Bob', age=5)]
    >>> df.select(df.name, (df.age + 10).alias('age')).collect()
    [Row(name='Alice', age=12), Row(name='Bob', age=15)]



In [21]:
users_df.select("*").show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [22]:
users_df.select("id","first_name","last_name").show()

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+



In [23]:
# list of strings
users_df.select(["id","first_name","last_name"]).show()

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+



In [26]:
# Defining alias to dataframe

# we can use drop,select, join and other funcs with use of aliases 

users_df.alias("u").select("u.*").show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [30]:
users_df.alias("u").select("u.id","u.first_name","u.email").show(truncate=False)

+---+----------+-------------------------+
|id |first_name|email                    |
+---+----------+-------------------------+
|1  |Corrie    |cvandenoord0@etsy.com    |
|2  |Nikolaus  |nbrewitt1@dailymail.co.uk|
|3  |Orelie    |openney2@vistaprint.com  |
|4  |Ashby     |amaddocks3@home.pl       |
|5  |Kurt      |krome4@shutterfly.com    |
+---+----------+-------------------------+



In [32]:
from pyspark.sql.functions import col

# col - Returns a Column based on the given column name.

In [33]:

# We can use col(column name)[ this returns a column] with column strings .

users_df.select(col("id"),"first_name","email").show()

+---+----------+--------------------+
| id|first_name|               email|
+---+----------+--------------------+
|  1|    Corrie|cvandenoord0@etsy...|
|  2|  Nikolaus|nbrewitt1@dailyma...|
|  3|    Orelie|openney2@vistapri...|
|  4|     Ashby|  amaddocks3@home.pl|
|  5|      Kurt|krome4@shutterfly...|
+---+----------+--------------------+



In [44]:
from pyspark.sql.functions import col,concat,lit

# lit ( literal )- add the new content 
# lit Reference - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.lit.html?highlight=lit


users_df.select(
        col("id"),
        "first_name",
        "last_name",
        lit(col("id")),
         concat("first_name",lit(", "),col("last_name")).alias("full_name")

).show(truncate=False)

+---+----------+------------+---+--------------------+
|id |first_name|last_name   |id |full_name           |
+---+----------+------------+---+--------------------+
|1  |Corrie    |Van den Oord|1  |Corrie, Van den Oord|
|2  |Nikolaus  |Brewitt     |2  |Nikolaus, Brewitt   |
|3  |Orelie    |Penney      |3  |Orelie, Penney      |
|4  |Ashby     |Maddocks    |4  |Ashby, Maddocks     |
|5  |Kurt      |Rome        |5  |Kurt, Rome          |
+---+----------+------------+---+--------------------+



## SelectExpr

In [45]:
## here we use sql style syntax

In [47]:
help(users_df.selectExpr)

# selectExpr - uses only spark SQL expressions not the pyspark.sql functions

Help on method selectExpr in module pyspark.sql.dataframe:

selectExpr(*expr) method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of SQL expressions and returns a new :class:`DataFrame`.
    
    This is a variant of :func:`select` that accepts SQL expressions.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.selectExpr("age * 2", "abs(age)").collect()
    [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)]



In [48]:
users_df.selectExpr("*").show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [50]:

# alias with selectExpr

users_df.alias("u").selectExpr("u.*").show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [66]:

users_df.select(
    "id",
    "first_name",
    "last_name",
    concat("first_name",lit(", "),col("last_name")).alias("Full_Name")

).show()

# in selectExpr , we have to use SQL Style syntax

users_df.selectExpr(
    "id",
    "first_name",
    "last_name",
    'concat(first_name,", ",last_name) AS FULL_NAME'

).show()

+---+----------+------------+--------------------+
| id|first_name|   last_name|           Full_Name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+

+---+----------+------------+--------------------+
| id|first_name|   last_name|           FULL_NAME|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [62]:
# To create Temporary View

users_df.createOrReplaceTempView("users")

In [65]:

# SPARK SQL

spark.sql("""
    SELECT 
    id,
    first_name,
    last_name,
    concat(first_name,', ',last_name) AS full_name 
    FROM users
""").show()

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



## Referring columns using spark dataframe names

In [68]:
users_df["id"] == col("id")

# users_df["id"] - returns a column 

Column<'(id = id)'>

In [69]:
type(users_df["id"])

pyspark.sql.column.Column

In [152]:
users_df.select(users_df["id"],"first_name").show()

# 

+---+----------+
| id|first_name|
+---+----------+
|  1|    Corrie|
|  2|  Nikolaus|
|  3|    Orelie|
|  4|     Ashby|
|  5|      Kurt|
+---+----------+



In [153]:
# In PySpark, the alias method is used to assign an alias or alternative name to a DataFrame or column. However, 
# when using the indexing method (users_df["column_name"]), it returns a Column object instead of a DataFrame. 
# The Column object does not have the alias method available, which is why u["id"] or u["first_name"] within the users_df.alias("u").select(u["id"],"first_name") statement will not work.

# On the other hand, when you use the indexing method within the select statement with string literals ("u.id", "u.first_name"), PySpark treats them as column names rather than Column objects. 
# Thus, you can apply the alias method successfully on the column names, as shown in users_df.alias("u").select("u.id","u.first_name").show().

Column<'id'>

In [78]:
# users_df.alias("u").select(u["id"],"first_name").show() will not work
# aliases scope with indexing method ( u["id"] ) will not be present in select expression


users_df.alias("u").select("u.id","u.first_name").show()

+---+----------+
| id|first_name|
+---+----------+
|  1|    Corrie|
|  2|  Nikolaus|
|  3|    Orelie|
|  4|     Ashby|
|  5|      Kurt|
+---+----------+



In [74]:
users_df.selectExpr(col("id"),"first_name").show()
# it will not work as selectExpr accepts only the typical SQL Style syntaxs only
# col is pyspark.sql.function

TypeError: Column is not iterable

In [86]:
users_df.select(
    "id",
    "first_name",
    "last_name",
    concat(users_df["first_name"],lit(", "),col("last_name")).alias("Full_Name")
).show()

# u can use indexing method to use a particular column eg: users_df["first_name"]

+---+----------+------------+--------------------+
| id|first_name|   last_name|           Full_Name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [85]:
users_df.createOrReplaceTempView("users")

# u can not use indexing method to use a particular column eg: users_df["first_name"]
spark.sql("""
    SELECT 
    "id",
    "first_name",
    concat(users.first_name,", ",users.last_name) as full_name
    from users
""").show()

+---+----------+--------------------+
| id|first_name|           full_name|
+---+----------+--------------------+
| id|first_name|Corrie, Van den Oord|
| id|first_name|   Nikolaus, Brewitt|
| id|first_name|      Orelie, Penney|
| id|first_name|     Ashby, Maddocks|
| id|first_name|          Kurt, Rome|
+---+----------+--------------------+



## Understanding col type

In [87]:
users_df["id"]

Column<'id'>

In [90]:
col("id")

# passing any string to col - creates a column type 

Column<'id'>

In [89]:
cols = ["id","first_name","last_name"]
users_df.select(*cols).show()

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+



* There are quite a few functions available on top of column type

    * cast (can be used on all important data frame functions such as select, filter, groupBy, orderBy, etc)
    * asc, desc (typically used as part of sort or orderBy)
    * contains (typically used as part of filter or where)

In [99]:
from pyspark.sql.functions import date_format

# the date_format - returns a col type only

# alias,cast work on col types

users_df.select(
    col("id"),
    date_format("customer_from","yyyy-MM-dd").alias("customer")

).show()

users_df.select(
    col("id"),
    date_format("customer_from","yyyy-MM-dd").alias("customer")

).printSchema()

+---+----------+
| id|  customer|
+---+----------+
|  1|2021-01-15|
|  2|2021-02-14|
|  3|2021-01-21|
|  4|      null|
|  5|      null|
+---+----------+

root
 |-- id: long (nullable = true)
 |-- customer: string (nullable = true)



In [101]:
# Casting to int

users_df.select(
    col("id"),
    date_format("customer_from","yyyyMMdd").cast("int").alias("customer_from")

).show()

users_df.select(
    col("id"),
    date_format("customer_from","yyyyMMdd").cast("int").alias("customer_from")

).printSchema()



cols = [ col("id"),
    date_format("customer_from","yyyyMMdd").cast("int").alias("customer_from")
]

# the select work for both list of col/strings or strings
users_df.select(cols).show() # ------ Takes a list of column or strings  
users_df.select(*cols).show() # ----- Takes strings 



+---+-------------+
| id|customer_from|
+---+-------------+
|  1|     20210115|
|  2|     20210214|
|  3|     20210121|
|  4|         null|
|  5|         null|
+---+-------------+

root
 |-- id: long (nullable = true)
 |-- customer_from: integer (nullable = true)

+---+-------------+
| id|customer_from|
+---+-------------+
|  1|     20210115|
|  2|     20210214|
|  3|     20210121|
|  4|         null|
|  5|         null|
+---+-------------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  1|     20210115|
|  2|     20210214|
|  3|     20210121|
|  4|         null|
|  5|         null|
+---+-------------+



## Invoking functions using Spark Column Objects

In [103]:
# concatenate first name and last name to generate full_name

In [107]:
full_name = concat(col("first_name"),lit(", "),col("last_name")).alias("full_name")

print(type(full_name))

users_df.select("id",full_name).show()

<class 'pyspark.sql.column.Column'>
+---+--------------------+
| id|           full_name|
+---+--------------------+
|  1|Corrie, Van den Oord|
|  2|   Nikolaus, Brewitt|
|  3|      Orelie, Penney|
|  4|     Ashby, Maddocks|
|  5|          Kurt, Rome|
+---+--------------------+



In [109]:
formats = date_format("customer_from","yymmdd").cast("int").alias("date")

users_df.select(formats).show()

users_df.select(formats).printSchema()

+------+
|  date|
+------+
|210015|
|210014|
|210021|
|  null|
|  null|
+------+

root
 |-- date: integer (nullable = true)



## Understanding lit 

In [149]:
#  users_df.select("id","amount_paid"+lit(25)).show()   # returns Null
# operation of string with col type - doesnt work

In [110]:
from pyspark.sql.functions import col,concat,lit

# lit ( literal )- add the new content 
# lit Reference - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.lit.html?highlight=lit


In [150]:
type(col("amount_paid")+"25")

pyspark.sql.column.Column

In [134]:
# using Spark SQL

users_df.createOrReplaceTempView("users")

spark.sql("""
    select id,amount_paid +25
    from users

""").show()

users_df.selectExpr("id",'amount_paid+25 AS amount_paid').show()

users_df.select("id",users_df.amount_paid+lit(25)).show()
                
#  users_df.select("id","amount_paid"+lit(25)).show()   # returns Null

# lit - converts a literal to column type



users_df.select("id",col("amount_paid")+"25").show() ##- it works 
# the above works as col("amount_paid")+"25" is also a Col
print(type(col("amount_paid")+"25"))


users_df.select("id","amount_paid"+"25").show()

# the above one fails as there is no column named amount_paid25

+---+------------------+
| id|(amount_paid + 25)|
+---+------------------+
|  1|           1025.55|
|  2|             925.0|
|  3|            875.55|
|  4|               NaN|
|  5|               NaN|
+---+------------------+

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1025.55|
|  2|      925.0|
|  3|     875.55|
|  4|        NaN|
|  5|        NaN|
+---+-----------+

+---+------------------+
| id|(amount_paid + 25)|
+---+------------------+
|  1|           1025.55|
|  2|             925.0|
|  3|            875.55|
|  4|               NaN|
|  5|               NaN|
+---+------------------+

+---+------------------+
| id|(amount_paid + 25)|
+---+------------------+
|  1|           1025.55|
|  2|             925.0|
|  3|            875.55|
|  4|               NaN|
|  5|               NaN|
+---+------------------+

<class 'pyspark.sql.column.Column'>


AnalysisException: cannot resolve 'amount_paid25' given input columns: [amount_paid, courses, customer_from, email, first_name, id, is_customer, last_name, last_updated_ts, phone_numbers];
'Project [id#0L, 'amount_paid25]
+- LogicalRDD [id#0L, first_name#1, last_name#2, email#3, phone_numbers#4, courses#5, is_customer#6, amount_paid#7, customer_from#8, last_updated_ts#9], false


In [143]:
users_df.select("id","amount_paid"+lit("25.0")).show()

users_df.select("id",concat("amount_paid",lit(" 25.0"))).show()

+---+--------------------+
| id|(25.0 + amount_paid)|
+---+--------------------+
|  1|                null|
|  2|                null|
|  3|                null|
|  4|                null|
|  5|                null|
+---+--------------------+

+---+--------------------------+
| id|concat(amount_paid,  25.0)|
+---+--------------------------+
|  1|              1000.55 25.0|
|  2|                900.0 25.0|
|  3|               850.55 25.0|
|  4|                  NaN 25.0|
|  5|                  NaN 25.0|
+---+--------------------------+



## Renaming the Spark DataFrame Columns

* There are multiple ways to rename Spark Data Frame Columns or Expressions.

    * We can rename column or expression using alias as part of select
    * We can add or rename column or expression using withColumn on top of Data Frame.
    * We can rename one column at a time using withColumnRenamed on top of Data Frame.
    * We typically use withColumn to perform row level transformations and then to provide a name to the result. If we provide the same name as existing column, then the column will be replaced with new one.
    * If we want to just rename the column then it is better to use withColumnRenamed.
    * If we want to apply any transformation, we need to either use select or withColumn
    * We can rename bunch of columns using toDF.

## Naming Derived Columns using withColumn