## Date and Timestamp Functions

PySpark's default date format is `yyyy-MM-dd`.

In [0]:
# Removes Python state, but some libraries might not work without calling this command.
dbutils.library.restartPython()

#### Load libraries

In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType, DateType, StringType, StructType, StructField, ArrayType, MapType, DoubleType
from pyspark.sql.functions import lit, col, expr, when, sum, avg, max, min, mean, count, udf, explode, concat_ws
from pyspark.sql.functions import year, month, dayofmonth, dayofyear, dayofweek, current_date, date_format, to_date, datediff, months_between, trunc, add_months, date_add, date_sub, next_day, weekofyear, current_timestamp, to_timestamp, hour, minute, second, quarter

#### Create Spark session

In [0]:
# Obtain the Spark session for this notebook (getOrCreate), tagged with the app name below.
spark = (
    SparkSession.builder
    .appName('PySpark Date and Timestamp Functions')
    .getOrCreate()
)

#### Create Dataframe

In [0]:
# Sample rows as (id, date) pairs; both values are Python strings,
# with the date written as yyyy-MM-dd.
data = [
    ('1', '2020-02-01'),
    ('2', '2019-03-15'),
    ('3', '2021-09-28'),
]
column_names = ['id', 'input']

# Base DataFrame used by the examples that follow.
df = spark.createDataFrame(data, schema=column_names)
df.show()

#### current_date()

In [0]:
# Project a single column holding the current date, one value per row of df.
today_col = current_date().alias('current_date')
df.select(today_col).show()

#### date_format()

In [0]:
# Show the input next to the same date rendered with the MM-dd-yyyy pattern.
reformatted = date_format(col('input'), 'MM-dd-yyyy').alias('date_format')
df.select(col('input'), reformatted).show()

#### to_date()

In [0]:
# Parse the 'input' strings with the yyyy-MM-dd pattern into a 'to_date' column.
parsed_date = to_date(col('input'), 'yyyy-MM-dd').alias('to_date')
date_df = df.select(col('input'), parsed_date)

# Print the schema to see the resulting column types, then the rows.
date_df.printSchema()
date_df.show()

#### datediff()

In [0]:
# datediff(end, start): days from each input date up to the current date.
days_elapsed = datediff(current_date(), col('input')).alias('datediff')
df.select(col('input'), days_elapsed).show()

#### months_between()

In [0]:
# months_between(date1, date2): months from each input date up to the current date.
months_elapsed = months_between(current_date(), col('input')).alias('months_between')
df.select(col('input'), months_elapsed).show()

#### trunc()

In [0]:
# Truncate each input date at three granularities; every unit yields
# one column named '<unit>_Trunc'.
truncated_cols = [
    trunc(col('input'), unit).alias(f'{unit}_Trunc')
    for unit in ('Year', 'Quarter', 'Month')
]
df.select(col('input'), *truncated_cols).show()

#### add_months(), date_add(), date_sub()

In [0]:
# Date arithmetic on 'input'. Each function is called once with a positive
# and once with a negative offset to show that the sign flips the direction.
input_col = col('input')
shifted_cols = [
    add_months(input_col, 3).alias('add_months'),
    add_months(input_col, -3).alias('add_months_sub'),
    date_add(input_col, 4).alias('date_add'),
    date_add(input_col, -4).alias('date_add_sub'),
    date_sub(input_col, 4).alias('date_sub'),
    date_sub(input_col, -4).alias('date_sub_add'),
]
df.select(input_col, *shifted_cols).show()

#### year(), quarter(), month(), weekofyear()

In [0]:
# Map each output column name to the extractor that produces it;
# dict insertion order fixes the column order in the result.
extractors = {
    'year': year,
    'quarter': quarter,
    'month': month,
    'weekofyear': weekofyear,
}
part_cols = [fn(col('input')).alias(name) for name, fn in extractors.items()]
df.select(col('input'), *part_cols).show()

#### next_day()

In [0]:
# For each weekday, compute next_day() of the input date;
# columns are named 'next_<weekday in lower case>'.
next_day_cols = [
    next_day(col('input'), weekday).alias(f'next_{weekday.lower()}')
    for weekday in ('Sunday', 'Monday')
]
df.select(col('input'), *next_day_cols).show()

#### dayofweek(), dayofmonth(), dayofyear()

In [0]:
# Position of each input date within its week, month and year.
input_col = col('input')
day_cols = [
    dayofweek(input_col).alias('dayofweek'),
    dayofmonth(input_col).alias('dayofmonth'),
    dayofyear(input_col).alias('dayofyear'),
]
df.select(input_col, *day_cols).show()

#### current_timestamp()

In [0]:
# Display the current timestamp; only the first row is shown and the
# value is printed without truncation.
now_col = current_timestamp().alias('current_timestamp')
df.select(now_col).show(n=1, truncate=False)

#### to_timestamp()

In [0]:
# Pattern handed to to_timestamp().
timestamp_pattern = 'yyyy-MM-dd HH:mm:ss.SSS'

now_col = current_timestamp().alias('current_timestamp')
# NOTE(review): 'current_timestamp' is not a column of df. The reference
# below presumably resolves to the sibling alias above or to Spark's
# current_timestamp itself -- confirm on the target Spark version.
parsed_col = to_timestamp(col('current_timestamp'), timestamp_pattern).alias('to_timestamp')

# distinct() drops duplicate rows from the projection.
df2 = df.select(now_col, parsed_col).distinct()

df2.printSchema()
df2.show(truncate=False)

#### hour(), minute(), second()

In [0]:
# Break the timestamp in df2 into its hour, minute and second components.
ts_col = col('current_timestamp')
time_part_cols = [
    hour(ts_col).alias('hour'),
    minute(ts_col).alias('minute'),
    second(ts_col).alias('second'),
]
df2.select(ts_col, *time_part_cols).show(truncate=False)

#### The end of the notebook