In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one machine's directory layout; fall back to the
# original local install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/dangkhoa/spark-2.3.1-bin-hadoop2.7'))

## Session

In [2]:
from pyspark.sql import SparkSession

# Get the existing session, or create one with the application name "dates"
spark = (
    SparkSession.builder
    .appName("dates")
    .getOrCreate()
)

## Read dataset

In [3]:
# Read appl_stock.csv, treating the first row as the header and letting Spark
# infer each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("appl_stock.csv")
)

# Print the resulting column names and types
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [4]:
# Preview the first five rows of the dataset
df.show(n=5)

+-------------------+----------+----------+------------------+------------------+---------+------------------+
|               Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows

## Get day, month, year...

In [5]:
from pyspark.sql.functions import dayofmonth

# Day of the month taken from the 'Date' column, first five rows
df.select(dayofmonth(df.Date)).show(n=5)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
+----------------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import hour

# Hour component of the 'Date' column, first five rows
df.select(hour(df.Date)).show(n=5)

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 5 rows



In [7]:
from pyspark.sql.functions import month

# Month component of the 'Date' column, first five rows
df.select(month(df.Date)).show(n=5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import year

# Year component of the 'Date' column, first five rows
df.select(year(df.Date)).show(n=5)

+----------+
|year(Date)|
+----------+
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
+----------+
only showing top 5 rows



## Group

In [9]:
from pyspark.sql.functions import format_number

# Derive a 'Year' column from the 'Date' timestamp
newdf = df.withColumn("Year", year(df['Date']))

# Average 'Close' per year. Naming the column in mean() aggregates only 'Close'
# instead of every numeric column, so the result already has exactly the two
# columns 'Year' and 'avg(Close)' and needs no separate column selection.
result = newdf.groupBy("Year").mean("Close")

# Display the yearly mean of 'Close', formatted to 2 decimal places
result \
    .select(
        'Year',
        format_number('avg(Close)', 2).alias("Mean Close")) \
    .show()

+----+----------+
|Year|Mean Close|
+----+----------+
|2015|    120.04|
|2013|    472.63|
|2014|    295.40|
|2012|    576.05|
|2016|    104.60|
|2010|    259.84|
|2011|    364.00|
+----+----------+

