In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

# Obtain the Spark session for this notebook (application name "functionsSQL").
spark = (
    SparkSession.builder
    .appName("functionsSQL")
    .getOrCreate()
)

In [0]:
# Load the Titanic dataset from Parquet and preview the first five rows.
titanic_path = "/FileStore/tables/Titanic.parquet"
parquet_df = spark.read.parquet(titanic_path)
parquet_df.show(5)

# Register the DataFrame as a temp view so it can be queried through spark.sql.
parquet_df.createOrReplaceTempView("titanic_table")

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [0]:
# SQL transformations:
# string functions: upper, lower, substr, concat
# arithmetic functions: +, -, *, /, %, ceil, round, floor
# date functions: dd-mm-yyyy to yyyy-mm-dd
#

In [0]:
# String functions: upper()
# Spark SQL version
upper_sql_df = spark.sql("select Name, upper(Name) upperName from titanic_table")
upper_sql_df.show(5)
# DataFrame API version: project Name alongside its upper-cased copy
parquet_df.select("Name", upper(col("Name")).alias("upperName")).show(5)

+--------------------+--------------------+
|                Name|           upperName|
+--------------------+--------------------+
|Braund, Mr. Owen ...|BRAUND, MR. OWEN ...|
|Cumings, Mrs. Joh...|CUMINGS, MRS. JOH...|
|Heikkinen, Miss. ...|HEIKKINEN, MISS. ...|
|Futrelle, Mrs. Ja...|FUTRELLE, MRS. JA...|
|Allen, Mr. Willia...|ALLEN, MR. WILLIA...|
+--------------------+--------------------+
only showing top 5 rows

+--------------------+--------------------+
|                Name|           upperName|
+--------------------+--------------------+
|Braund, Mr. Owen ...|BRAUND, MR. OWEN ...|
|Cumings, Mrs. Joh...|CUMINGS, MRS. JOH...|
|Heikkinen, Miss. ...|HEIKKINEN, MISS. ...|
|Futrelle, Mrs. Ja...|FUTRELLE, MRS. JA...|
|Allen, Mr. Willia...|ALLEN, MR. WILLIA...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
# String functions: lower()
# Spark SQL version
lower_sql_df = spark.sql("select Name, lower(Name) lowerName from titanic_table")
lower_sql_df.show(5)
# DataFrame API version: project Name alongside its lower-cased copy
parquet_df.select("Name", lower(col("Name")).alias("lowerName")).show(5)

+--------------------+--------------------+
|                Name|           lowerName|
+--------------------+--------------------+
|Braund, Mr. Owen ...|braund, mr. owen ...|
|Cumings, Mrs. Joh...|cumings, mrs. joh...|
|Heikkinen, Miss. ...|heikkinen, miss. ...|
|Futrelle, Mrs. Ja...|futrelle, mrs. ja...|
|Allen, Mr. Willia...|allen, mr. willia...|
+--------------------+--------------------+
only showing top 5 rows

+--------------------+--------------------+
|                Name|           lowerName|
+--------------------+--------------------+
|Braund, Mr. Owen ...|braund, mr. owen ...|
|Cumings, Mrs. Joh...|cumings, mrs. joh...|
|Heikkinen, Miss. ...|heikkinen, miss. ...|
|Futrelle, Mrs. Ja...|futrelle, mrs. ja...|
|Allen, Mr. Willia...|allen, mr. willia...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
# DataFrame API: join Sex and Name into one string, separated by "="
combined_name = concat_ws("=", col("Sex"), col("Name")).alias("CombinedName")
parquet_df.select("*", combined_name).show(5)


+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|        CombinedName|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|male=Braund, Mr. ...|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|female=Cumings, M...|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|female=Heikkinen,...|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|female=Futrelle, ...|
|          5|       0|     3|Allen, Mr. Willia..

In [0]:
# Append the character count of Name as NameLength.
name_length = length(col("Name")).alias("NameLength")
parquet_df.select("*", name_length).show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|NameLength|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|        23|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|        51|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|        22|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|        44|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|    

In [0]:
# Take 7 characters of Name starting at position 2 (positions are 1-based).
name_slice = substring(col("Name"), 2, 7).alias("SubStrName")
parquet_df.select("Name", name_slice).show(5)

+--------------------+----------+
|                Name|SubStrName|
+--------------------+----------+
|Braund, Mr. Owen ...|   raund, |
|Cumings, Mrs. Joh...|   umings,|
|Heikkinen, Miss. ...|   eikkine|
|Futrelle, Mrs. Ja...|   utrelle|
|Allen, Mr. Willia...|   llen, M|
+--------------------+----------+
only showing top 5 rows



In [0]:
# Arithmetic functions

# Cast Fare to an integer column. withColumn returns a new DataFrame and the
# result is not assigned here, so parquet_df itself is unchanged: the cells
# that follow still display Fare with its fractional part.
parquet_df.withColumn("Fare",col("Fare").cast(IntegerType()))

Out[30]: DataFrame[PassengerId: bigint, Survived: bigint, Pclass: bigint, Name: string, Sex: string, Age: double, SibSp: bigint, Parch: bigint, Ticket: string, Fare: int, Cabin: string, Embarked: string]

In [0]:
# Addition: Fare plus twice Fare.
add_expr = (col("Fare") + col("Fare") * 2).alias("add_col")
parquet_df.select("Fare", add_expr).show(5)

+-------+------------------+
|   Fare|           add_col|
+-------+------------------+
|   7.25|             21.75|
|71.2833|          213.8499|
|  7.925|            23.775|
|   53.1|             159.3|
|   8.05|24.150000000000002|
+-------+------------------+
only showing top 5 rows



In [0]:
# Subtraction: three times Fare minus Fare.
sub_expr = (col("Fare") * 3 - col("Fare")).alias("sub_col")
parquet_df.select("Fare", sub_expr).show(5)

+-------+------------------+
|   Fare|           sub_col|
+-------+------------------+
|   7.25|              14.5|
|71.2833|          142.5666|
|  7.925|15.849999999999998|
|   53.1|106.20000000000002|
|   8.05|              16.1|
+-------+------------------+
only showing top 5 rows



In [0]:
# Multiplication: Fare times three.
mul_expr = (col("Fare") * 3).alias("mul_col")
parquet_df.select("Fare", mul_expr).show(5)

+-------+------------------+
|   Fare|           mul_col|
+-------+------------------+
|   7.25|             21.75|
|71.2833|          213.8499|
|  7.925|            23.775|
|   53.1|             159.3|
|   8.05|24.150000000000002|
+-------+------------------+
only showing top 5 rows



In [0]:
# Division: half of Fare.
div_expr = (col("Fare") / 2).alias("div_col")
parquet_df.select("Fare", div_expr).show(5)

+-------+--------+
|   Fare| div_col|
+-------+--------+
|   7.25|   3.625|
|71.2833|35.64165|
|  7.925|  3.9625|
|   53.1|   26.55|
|   8.05|   4.025|
+-------+--------+
only showing top 5 rows



In [0]:
# Modulo: remainder of Fare divided by 2.
mod_expr = (col("Fare") % 2).alias("mod_col")
parquet_df.select("Fare", mod_expr).show(5)

+-------+-------------------+
|   Fare|            mod_col|
+-------+-------------------+
|   7.25|               1.25|
|71.2833|  1.283299999999997|
|  7.925| 1.9249999999999998|
|   53.1| 1.1000000000000014|
|   8.05|0.05000000000000071|
+-------+-------------------+
only showing top 5 rows



In [0]:
# Ceiling: round Fare up to the next whole number.
ceil_expr = ceil(col("Fare")).alias("ceil_col")
parquet_df.select("Fare", ceil_expr).show(5)

+-------+--------+
|   Fare|ceil_col|
+-------+--------+
|   7.25|       8|
|71.2833|      72|
|  7.925|       8|
|   53.1|      54|
|   8.05|       9|
+-------+--------+
only showing top 5 rows



In [0]:
# Round Fare to zero decimal places (this is the Spark column function
# brought in by the wildcard import, not Python's built-in round).
round_expr = round(col("Fare")).alias("round_col")
parquet_df.select("Fare", round_expr).show(5)

+-------+---------+
|   Fare|round_col|
+-------+---------+
|   7.25|      7.0|
|71.2833|     71.0|
|  7.925|      8.0|
|   53.1|     53.0|
|   8.05|      8.0|
+-------+---------+
only showing top 5 rows



In [0]:
# Floor: round Fare down to the previous whole number.
floor_expr = floor(col("Fare")).alias("floor_col")
parquet_df.select("Fare", floor_expr).show(5)

+-------+---------+
|   Fare|floor_col|
+-------+---------+
|   7.25|        7|
|71.2833|       71|
|  7.925|        7|
|   53.1|       53|
|   8.05|        8|
+-------+---------+
only showing top 5 rows



In [0]:
# Date functions
#
# NOTE(review): dummy_df is not defined in any cell visible here. These calls
# assume it has a string column "date_col" holding dates written as M/d/yyyy;
# confirm it is created before this cell runs.

# Parse the M/d/yyyy strings into a date column.
dummy_df.withColumn("std_date_format_col", to_date(col("date_col"), "M/d/yyyy")).show()

# Parse, then render the date back to a string using a custom output pattern.
# date_format takes (date, pattern): the pattern belongs inside the
# date_format(...) call. Passing it as a third argument to withColumn, as the
# earlier version did, raises a TypeError.
dummy_df.withColumn("std_date_format_col", date_format(to_date(col("date_col"), "M/d/yyyy"), "yyyy/MM/dd")).show()
dummy_df.withColumn("std_date_format_col", date_format(to_date(col("date_col"), "M/d/yyyy"), "yyyy*MM*dd")).show()

In [0]:
# Aggregate functions
#
# min/max/sum here are the Spark column functions pulled in by the wildcard
# import of pyspark.sql.functions, not Python's built-ins.
fare_stats = [
    min("Fare"),
    max("Fare"),
    sum("Fare"),
    avg("Fare"),
    count("Fare"),
]
parquet_df.agg(*fare_stats).show()

+---------+---------+------------------+----------------+-----------+
|min(Fare)|max(Fare)|         sum(Fare)|       avg(Fare)|count(Fare)|
+---------+---------+------------------+----------------+-----------+
|      0.0| 512.3292|28693.949299999967|32.2042079685746|        891|
+---------+---------+------------------+----------------+-----------+

