In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Dates").getOrCreate()

df = spark.read.csv("appl_stock.csv", inferSchema = True, header = True)

df.printSchema()
df.show()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            21

In [8]:
from pyspark.sql.functions import (dayofmonth, hour,
                                    dayofyear, month,
                                    year, weekofyear,
                                    format_number,                                             date_format)
df.select("Date",dayofmonth(df["Date"]),
            year(df["Date"]),
            dayofyear(df["Date"]),
            weekofyear(df["Date"])).show(5)


+----------+----------------+----------+---------------+----------------+
|      Date|dayofmonth(Date)|year(Date)|dayofyear(Date)|weekofyear(Date)|
+----------+----------------+----------+---------------+----------------+
|2010-01-04|               4|      2010|              4|               1|
|2010-01-05|               5|      2010|              5|               1|
|2010-01-06|               6|      2010|              6|               1|
|2010-01-07|               7|      2010|              7|               1|
|2010-01-08|               8|      2010|              8|               1|
+----------+----------------+----------+---------------+----------------+
only showing top 5 rows



In [17]:
# get avg closing price per year
new_df = df.withColumn("Year",year(df["Date"]))

new_df.groupBy("Year").mean().select(["Year","avg(Close)"]).show()
result = new_df.groupBy("Year").mean().select(["Year","avg(Close)"])

result.withColumnRenamed("avg(Close)", "Average Closing Price").select("Year", format_number("Average Closing Price",2)).show()


+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
|2010| 259.8424600000002|
|2011|364.00432532142867|
+----+------------------+

+----+---------------------------------------+
|Year|format_number(Average Closing Price, 2)|
+----+---------------------------------------+
|2015|                                 120.04|
|2013|                                 472.63|
|2014|                                 295.40|
|2012|                                 576.05|
|2016|                                 104.60|
|2010|                                 259.84|
|2011|                                 364.00|
+----+---------------------------------------+

