In [95]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Create the SparkSession used by the cells below, or reuse one that is
# already running in this kernel.
spark = SparkSession.builder.appName("dates").getOrCreate()

In [96]:
# Load the CSV: the first row supplies column names and column types are
# inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('apple_stock_data.csv')
)
# Preview a single row.
df.show(n=1)

+----------+------+------+------+------+--------+---------+
|      Date|  Open|  High|   Low| Close|  Volume|Adj Close|
+----------+------+------+------+------+--------+---------+
|2012-03-30|608.77|610.56|597.94|599.55|26050900|   599.55|
+----------+------+------+------+------+--------+---------+
only showing top 1 row


In [None]:
# Day of month for each row, sorted by 'Date' in descending order; print 5 rows.
(
    df
    .select(F.dayofmonth(df['Date']))
    .orderBy(F.desc('Date'))
    .show(5)
)

In [None]:
# Day of month exposed as a column named 'Day'; print the first 10 rows.
(
    df
    .select(F.dayofmonth(df['Date']).alias('Day'))
    .show(n=10)
)

In [None]:
# Day of month aliased as 'Day'; print the first 5 rows.
df.select(F.dayofmonth(df['Date']).alias('Day')).show(n=5)

In [None]:
# Extract the hour from a timestamp or date.
# Variant that reports only the largest hour value:
#df.select(hour(df['Date']).alias('Hour')).agg(F.max('Hour')).show()

df.select(F.hour(df['Date']).alias('Hour')).show()

In [None]:
# Extract the day-of-year number from a timestamp or date.
(
    df
    .select(F.dayofyear(df['Date']))
    .show()
)



In [None]:
# Day-of-year number for each row (default row limit of show()).
df.select(F.dayofyear(df['Date'])).show()

In [None]:
# Extract the month-of-year number from a timestamp or date.
(
    df
    .select(F.month(df['Date']).alias('Month'))
    .show(n=2)
)


In [None]:
# Month number for each row, without an alias; print the first 3 rows.
df.select(F.month(df['Date'])).show(n=3)

In [None]:
# List the distinct year values found in 'Date'.
df.select(F.year(df['Date']).alias('Year')).distinct().show()

In [None]:
# Use a date/time function while creating a new column.
(
    df
    .withColumn("Year", F.year(df['Date']))
    .show(n=1)
)

In [None]:
# Split 'Date' into separate Year / Month / Day columns and preview one row.
# `dayofmonth` is used for the day part, matching the other cells of this
# notebook: the `day` alias exists in pyspark.sql.functions only from
# Spark 3.5 onward and raises NameError on older versions.
(
    df
        .withColumn("Year", year(df['Date']))
        .withColumn("Month", month(df['Date']))
        .withColumn("Day", dayofmonth(df["Date"]))
).show(1)

In [97]:
# Data with an added 'Year' column, used for the per-year aggregation.
newdf = df.withColumn(
    "Year",
    F.year(df['Date']),
)

In [134]:
# Average every numeric column per year, then keep only the year and the mean
# closing price. 'Year' is taken from avg(Year), so it comes out as a double.
result = (
    newdf
    .groupBy("Year")
    .mean()
    .select('avg(Year)', 'avg(Close)')
    .withColumnRenamed("avg(Year)", "Year")
)
result.show()

+------+------------------+
|  Year|        avg(Close)|
+------+------------------+
|1990.0| 37.56268774703557|
|2003.0|18.544761904761902|
|2007.0| 128.2739043824701|
|2006.0| 70.81063745019918|
|1997.0|17.965849802371523|
|1988.0| 41.54007905138338|
|1994.0| 34.08134920634923|
|2004.0|35.526944444444446|
|1991.0| 52.49553359683792|
|1996.0|24.917559055118087|
|1989.0| 41.65976190476188|
|1998.0| 30.56511904761903|
|1985.0| 20.19367588932804|
|2012.0|503.67983870967726|
|1987.0| 53.88968379446637|
|2009.0|146.81412698412706|
|1995.0| 40.54210317460315|
|2001.0|20.219112903225806|
|1992.0| 54.80338582677165|
|2005.0|   52.401746031746|
+------+------------------+
only showing top 20 rows


In [135]:
# Bare expression: the notebook echoes the DataFrame's repr (column names and types).
result

DataFrame[Year: double, avg(Close): double]

In [117]:
# Render the mean close as text with two decimal places.
# Note: this rebinds `result`, which afterwards has no 'avg(Close)' column.
result = result.select(
    'Year',
    F.format_number('avg(Close)', 2).alias('Mean Close'),
)

In [125]:
# Print the first two rows of the formatted result.
result.show(n=2)

+------+----------+
|  Year|Mean Close|
+------+----------+
|1990.0|     37.56|
|2003.0|     18.54|
+------+----------+
only showing top 2 rows


In [136]:
# Mean close per year, rendered as text with one decimal place.
# Rebuilt from `newdf` instead of re-selecting from `result`: the earlier
# two-decimal formatting cell rebinds `result` to a frame that only has
# 'Year' and 'Mean Close', so selecting 'avg(Close)' from it fails when the
# notebook is run top to bottom.
result = (
    newdf
    .groupBy("Year")
    .mean()
    .select('avg(Year)', 'avg(Close)')
    .withColumnRenamed("avg(Year)", "Year")
    .select('Year', format_number('avg(Close)', 1).alias('Mean Close'))
)

In [137]:
# Print the formatted per-year result (show() prints at most 20 rows by default).
result.show()


+------+----------+
|  Year|Mean Close|
+------+----------+
|1990.0|      37.6|
|2003.0|      18.5|
|2007.0|     128.3|
|2006.0|      70.8|
|1997.0|      18.0|
|1988.0|      41.5|
|1994.0|      34.1|
|2004.0|      35.5|
|1991.0|      52.5|
|1996.0|      24.9|
|1989.0|      41.7|
|1998.0|      30.6|
|1985.0|      20.2|
|2012.0|     503.7|
|1987.0|      53.9|
|2009.0|     146.8|
|1995.0|      40.5|
|2001.0|      20.2|
|1992.0|      54.8|
|2005.0|      52.4|
+------+----------+
only showing top 20 rows
