In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc=SparkContext()
spark=SparkSession(sc)

# Load the data

In [2]:
iris=spark.read.csv('Iris-data.csv',inferSchema=True,header=True)
iris.show(5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [6]:
prostate=spark.read.csv('prostate.csv',inferSchema=True,header=True)
prostate.show(10)

+---+-------+---+----+-----+-----+----+----+-------+
| ID|CAPSULE|AGE|RACE|DPROS|DCAPS| PSA| VOL|GLEASON|
+---+-------+---+----+-----+-----+----+----+-------+
|  1|      0| 65|   1|    2|    1| 1.4| 0.0|      6|
|  2|      0| 72|   1|    3|    2| 6.7| 0.0|      7|
|  3|      0| 70|   1|    1|    2| 4.9| 0.0|      6|
|  4|      0| 76|   2|    2|    1|51.2|20.0|      7|
|  5|      0| 69|   1|    1|    1|12.3|55.9|      6|
|  6|      1| 71|   1|    3|    2| 3.3| 0.0|      8|
|  7|      0| 68|   2|    4|    2|31.9| 0.0|      7|
|  8|      0| 61|   2|    4|    2|66.7|27.2|      7|
|  9|      0| 69|   1|    1|    1| 3.9|24.0|      7|
| 10|      0| 68|   2|    1|    2|13.0| 0.0|      6|
+---+-------+---+----+-----+-----+----+----+-------+
only showing top 10 rows



# SQL functions

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
import pandas as pd
import numpy as np

  return f(*args, **kwds)
  return f(*args, **kwds)


## abs

In [9]:
pdf=pd.DataFrame({'x1':[10,12,-9,32,-8,-15]})
df=spark.createDataFrame(pdf)
df.show()

+---+
| x1|
+---+
| 10|
| 12|
| -9|
| 32|
| -8|
|-15|
+---+



In [10]:
df.select(df.x1,abs(df.x1)).show()

+---+-------+
| x1|abs(x1)|
+---+-------+
| 10|     10|
| 12|     12|
| -9|      9|
| 32|     32|
| -8|      8|
|-15|     15|
+---+-------+



## acos

In [12]:
pdf=pd.DataFrame({'x1':list(-np.random.rand(5)+list(np.random.rand(5)))})
df=spark.createDataFrame(pdf)
df.show()

+--------------------+
|                  x1|
+--------------------+
|-0.45202763693223214|
|-0.06320562523758244|
| -0.7932405390516785|
| 0.14741580416877742|
|  0.2289112899417708|
+--------------------+



In [13]:
#Arc Cosine (inverse of cosine) is evaluated
df.select(df.x1,acos(df.x1)).show()

+--------------------+------------------+
|                  x1|          ACOS(x1)|
+--------------------+------------------+
|-0.45202763693223214|2.0398334856263953|
|-0.06320562523758244|1.6340441117648803|
| -0.7932405390516785| 2.486908910159746|
| 0.14741580416877742|1.4228413069611212|
|  0.2289112899417708|1.3398371980666264|
+--------------------+------------------+



## add_months

In [16]:
import datetime
base=datetime.date.today()
date_list=[base+datetime.timedelta(days=x) for x in list(range(0,10))*10]
pdf=pd.DataFrame({'dates':date_list})
df=spark.createDataFrame(pdf)
df.show()

+----------+
|     dates|
+----------+
|2020-05-12|
|2020-05-13|
|2020-05-14|
|2020-05-15|
|2020-05-16|
|2020-05-17|
|2020-05-18|
|2020-05-19|
|2020-05-20|
|2020-05-21|
|2020-05-12|
|2020-05-13|
|2020-05-14|
|2020-05-15|
|2020-05-16|
|2020-05-17|
|2020-05-18|
|2020-05-19|
|2020-05-20|
|2020-05-21|
+----------+
only showing top 20 rows



In [17]:
df.select('dates',add_months(df.dates,2)).alias('months').show()

+----------+--------------------+
|     dates|add_months(dates, 2)|
+----------+--------------------+
|2020-05-12|          2020-07-12|
|2020-05-13|          2020-07-13|
|2020-05-14|          2020-07-14|
|2020-05-15|          2020-07-15|
|2020-05-16|          2020-07-16|
|2020-05-17|          2020-07-17|
|2020-05-18|          2020-07-18|
|2020-05-19|          2020-07-19|
|2020-05-20|          2020-07-20|
|2020-05-21|          2020-07-21|
|2020-05-12|          2020-07-12|
|2020-05-13|          2020-07-13|
|2020-05-14|          2020-07-14|
|2020-05-15|          2020-07-15|
|2020-05-16|          2020-07-16|
|2020-05-17|          2020-07-17|
|2020-05-18|          2020-07-18|
|2020-05-19|          2020-07-19|
|2020-05-20|          2020-07-20|
|2020-05-21|          2020-07-21|
+----------+--------------------+
only showing top 20 rows



## approx_count_distinct

In [19]:
iris.select(approx_count_distinct(iris.Species)).show()

+------------------------------+
|approx_count_distinct(Species)|
+------------------------------+
|                             3|
+------------------------------+



## array

In [26]:
df_arr=iris.select('Species',array(['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']).alias('features'))
df_arr.show(5)

+-----------+--------------------+
|    Species|            features|
+-----------+--------------------+
|Iris-setosa|[5.1, 3.5, 1.4, 0.2]|
|Iris-setosa|[4.9, 3.0, 1.4, 0.2]|
|Iris-setosa|[4.7, 3.2, 1.3, 0.2]|
|Iris-setosa|[4.6, 3.1, 1.5, 0.2]|
|Iris-setosa|[5.0, 3.6, 1.4, 0.2]|
+-----------+--------------------+
only showing top 5 rows



## array_contains

In [27]:
df_arr_contains=df_arr.select('features',array_contains(df_arr.features,1.4).alias('new_features'))
df_arr_contains.show(5)

+--------------------+------------+
|            features|new_features|
+--------------------+------------+
|[5.1, 3.5, 1.4, 0.2]|        true|
|[4.9, 3.0, 1.4, 0.2]|        true|
|[4.7, 3.2, 1.3, 0.2]|       false|
|[4.6, 3.1, 1.5, 0.2]|       false|
|[5.0, 3.6, 1.4, 0.2]|        true|
+--------------------+------------+
only showing top 5 rows



## asc

In [31]:
#asc returns a sort expression,which can be used as an argument for pyspark.sql.DataFrame.sort() 
#or pyspark.sql.Dataframe.orderBy()
prostate.sort(prostate.PSA.asc()).show()

+---+-------+---+----+-----+-----+----+----+-------+
| ID|CAPSULE|AGE|RACE|DPROS|DCAPS| PSA| VOL|GLEASON|
+---+-------+---+----+-----+-----+----+----+-------+
| 71|      0| 68|   1|    2|    1| 0.3| 0.0|      6|
| 49|      0| 70|   1|    2|    1| 0.4|17.1|      5|
|357|      0| 63|   1|    1|    1| 0.5| 0.0|      0|
| 61|      0| 59|   1|    2|    1| 0.7|96.0|      5|
|323|      0| 63|   1|    3|    1| 0.7|18.6|      5|
|366|      0| 55|   1|    1|    1| 0.8|21.0|      6|
|131|      0| 73|   1|    1|    1| 1.0| 0.0|      5|
|352|      0| 75|   1|    4|    1| 1.0|13.3|      6|
| 37|      0| 54|   1|    2|    1| 1.0| 0.0|      6|
|220|      0| 74|   1|    2|    1| 1.2|21.6|      6|
|225|      0| 71|   1|    1|    1|1.29| 0.0|      7|
| 69|      0| 65|   1|    2|    1| 1.3| 6.8|      5|
|129|      0| 75|   1|    2|    1| 1.4| 0.0|      6|
|277|      0| 59|   1|    2|    1| 1.4| 0.0|      6|
|  1|      0| 65|   1|    2|    1| 1.4| 0.0|      6|
|339|      1| 72|   1|    2|    1| 1.4|24.2|  

## avg

In [34]:
prostate.select(avg('PSA')).show()

+------------------+
|          avg(PSA)|
+------------------+
|15.408631578947354|
+------------------+



## current_date

In [37]:
df=spark.createDataFrame([[1],[2],[3],[4]],['x'])
df.show()

+---+
|  x|
+---+
|  1|
|  2|
|  3|
|  4|
+---+



In [38]:
df.select('x',current_date()).show()

+---+--------------+
|  x|current_date()|
+---+--------------+
|  1|    2020-05-12|
|  2|    2020-05-12|
|  3|    2020-05-12|
|  4|    2020-05-12|
+---+--------------+



## current_timestamp

In [42]:
df.select('x',current_timestamp()).show(truncate=False)

+---+-----------------------+
|x  |current_timestamp()    |
+---+-----------------------+
|1  |2020-05-12 19:06:16.816|
|2  |2020-05-12 19:06:16.816|
|3  |2020-05-12 19:06:16.816|
|4  |2020-05-12 19:06:16.816|
+---+-----------------------+



## date_add

In [44]:
df=spark.createDataFrame([['2019-12-25'],['2019-12-26'],['2020-5-12'],['2020-3-3']],['x'])
df.show()

+----------+
|         x|
+----------+
|2019-12-25|
|2019-12-26|
| 2020-5-12|
|  2020-3-3|
+----------+



In [48]:
df.select('x',date_add(df.x,6)).show()

+----------+--------------+
|         x|date_add(x, 6)|
+----------+--------------+
|2019-12-25|    2019-12-31|
|2019-12-26|    2020-01-01|
| 2020-5-12|    2020-05-18|
|  2020-3-3|    2020-03-09|
+----------+--------------+



## date_format

In [53]:
df.select('x',date_format(df.x,'YYYY/MM/dd').alias('format')).show()

+----------+----------+
|         x|    format|
+----------+----------+
|2019-12-25|2019/12/25|
|2019-12-26|2019/12/26|
| 2020-5-12|2020/05/12|
|  2020-3-3|2020/03/03|
+----------+----------+

