# Exercise 2

## Setup

In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F

In [2]:
# Relative paths to the price-history CSV of each ticker (AGN, AINV, ALE).
agn_file_path, ainv_file_path, ale_file_path = (
    '../data/agn.us.csv',
    '../data/ainv.us.csv',
    '../data/ale.us.csv',
)

In [3]:
# getOrCreate() hands back the already-active session when one exists,
# so re-running this cell does not start a second one.
spark = (
    SparkSession.builder
    .appName('Exercise 2')
    .getOrCreate()
)

spark

## Data Loading

In [4]:
# Read each ticker's CSV with its header row and let Spark infer the column types.
agn_df, ainv_df, ale_df = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (agn_file_path, ainv_file_path, ale_file_path)
)

In [5]:
# Preview the first five AGN rows.
agn_df.show(n=5)

+----------+------+------+------+------+-------+-------+
|      Date|  Open|  High|   Low| Close| Volume|OpenInt|
+----------+------+------+------+------+-------+-------+
|2005-01-03| 32.31| 32.31|31.527|31.616|1027044|      0|
|2005-01-04|31.527|31.616| 31.22|31.338|1927762|      0|
|2005-01-05|30.971|31.051|30.714|30.843| 943399|      0|
|2005-01-06|30.843|31.398|30.764| 31.26| 662398|      0|
|2005-01-07| 31.26| 31.26|30.456|30.566|1087886|      0|
+----------+------+------+------+------+-------+-------+
only showing top 5 rows



In [6]:
# Preview the first five AINV rows.
ainv_df.show(n=5)

+----------+------+------+------+------+------+-------+
|      Date|  Open|  High|   Low| Close|Volume|OpenInt|
+----------+------+------+------+------+------+-------+
|2005-02-25|7.9331|8.0665|7.8034|8.0665|632096|      0|
|2005-02-28|8.0756|8.1248|7.8524|7.8615|750398|      0|
|2005-03-01|7.8673|8.1101|7.8524|8.0756|977229|      0|
|2005-03-02|8.0385|8.1335|8.0385|8.0625|837318|      0|
|2005-03-03|8.0625| 8.148|8.0287|8.0996|541465|      0|
+----------+------+------+------+------+------+-------+
only showing top 5 rows



In [7]:
# Preview the first five ALE rows.
ale_df.show(n=5)

+----------+------+------+------+------+------+-------+
|      Date|  Open|  High|   Low| Close|Volume|OpenInt|
+----------+------+------+------+------+------+-------+
|2005-02-25|30.717|31.104|  30.6|31.087|214817|      0|
|2005-02-28|31.073|31.087|30.652|  30.7|180957|      0|
|2005-03-01|30.659|30.946|30.606|30.946|223738|      0|
|2005-03-02|30.908|31.179|30.884|31.073|238730|      0|
|2005-03-03|31.073|31.211|30.987|31.195|224381|      0|
+----------+------+------+------+------+------+-------+
only showing top 5 rows



## Queries

### Q1 - The monthly averages of the opening prices, closing prices and trading volumes for each stock

In [8]:
def monthly_avg(df):
    """Return per-month averages of Open, Close and Volume, rounded to 2 decimals.

    Rows are grouped by the calendar year and month extracted from ``Date``.
    The result has the columns ``Year``, ``Month``, ``Average Open``,
    ``Average Close`` and ``Average Volume``, sorted by year and then month.
    """
    def rounded_mean(source_col, label):
        # Average the column within each group, keep two decimals, name the result.
        return F.round(F.avg(source_col), 2).alias(label)

    with_period = df.withColumn('Month', F.month('Date')).withColumn('Year', F.year('Date'))

    monthly = with_period.groupBy('Year', 'Month').agg(
        rounded_mean('Open', 'Average Open'),
        rounded_mean('Close', 'Average Close'),
        rounded_mean('Volume', 'Average Volume'),
    )

    return monthly.sort('Year', 'Month')

In [9]:
# Monthly averages for AGN; 20 rows is show()'s default page size.
agn_monthly_avg = monthly_avg(df=agn_df)
agn_monthly_avg.show(n=20)

+----+-----+------------+-------------+--------------+
|Year|Month|Average Open|Average Close|Average Volume|
+----+-----+------------+-------------+--------------+
|2005|    1|       30.35|        30.23|      895649.0|
|2005|    2|       29.78|        29.77|    1457348.32|
|2005|    3|       31.22|        31.21|     914340.27|
|2005|    4|       30.48|        30.43|     825136.57|
|2005|    5|       29.64|        29.67|    1181446.76|
|2005|    6|       30.37|        30.34|     576798.73|
|2005|    7|       29.65|        29.75|     861111.35|
|2005|    8|       34.28|        34.29|    1071394.91|
|2005|    9|       34.64|        34.72|     951379.29|
|2005|   10|       35.05|        34.98|     933635.67|
|2005|   11|       32.99|        32.99|     902174.81|
|2005|   12|        32.6|        32.56|     828961.05|
|2006|    1|       33.18|        33.22|     807458.65|
|2006|    2|       31.29|        31.25|    1200371.05|
|2006|    3|       29.06|        29.04|     996914.43|
|2006|    

In [10]:
# Monthly averages for AINV; 20 rows is show()'s default page size.
ainv_monthly_avg = monthly_avg(df=ainv_df)
ainv_monthly_avg.show(n=20)

+----+-----+------------+-------------+--------------+
|Year|Month|Average Open|Average Close|Average Volume|
+----+-----+------------+-------------+--------------+
|2005|    2|         8.0|         7.96|      691247.0|
|2005|    3|        8.16|         8.16|     667090.86|
|2005|    4|        7.96|         7.95|     815881.05|
|2005|    5|        7.73|         7.76|     365223.57|
|2005|    6|        8.44|         8.49|     662943.73|
|2005|    7|        8.72|         8.76|     661470.85|
|2005|    8|        9.12|         9.17|     873391.13|
|2005|    9|        9.43|         9.44|     563145.14|
|2005|   10|        8.96|         8.94|     677502.76|
|2005|   11|        9.13|         9.12|     481479.24|
|2005|   12|        8.91|         8.87|     685912.71|
|2006|    1|        8.75|         8.74|     533957.05|
|2006|    2|        9.11|          9.1|     761886.63|
|2006|    3|        8.73|          8.7|    2146854.35|
|2006|    4|        8.78|         8.79|     1262171.0|
|2006|    

In [11]:
# Monthly averages for ALE; 20 rows is show()'s default page size.
ale_monthly_avg = monthly_avg(df=ale_df)
ale_monthly_avg.show(n=20)

+----+-----+------------+-------------+--------------+
|Year|Month|Average Open|Average Close|Average Volume|
+----+-----+------------+-------------+--------------+
|2005|    2|        30.9|        30.89|      197887.0|
|2005|    3|       32.55|        32.62|     282829.86|
|2005|    4|       32.23|        32.23|     201695.62|
|2005|    5|       35.06|        35.26|     241937.19|
|2005|    6|       38.18|        38.25|     244023.91|
|2005|    7|       38.01|        37.99|     176326.95|
|2005|    8|       35.74|        35.66|     237297.39|
|2005|    9|       34.27|        34.29|     215680.19|
|2005|   10|       33.84|        33.76|     200767.43|
|2005|   11|       34.24|        34.22|     117670.52|
|2005|   12|       35.51|        35.41|     119306.43|
|2006|    1|       34.99|        34.99|      99518.55|
|2006|    2|       34.62|        34.71|     159199.53|
|2006|    3|       35.72|        35.63|      147331.3|
|2006|    4|       35.88|        35.86|     151082.79|
|2006|    

### Q2 - The number of days the opening price of each stock was higher than $35

In [12]:
def days_higher_than(df: DataFrame, price: float, stock_name: str):
    """Print how many rows of ``df`` have an ``Open`` value strictly above ``price``.

    ``stock_name`` only labels the printed sentence. Nothing is returned.
    """
    opened_above = df.where(df['Open'] > price)
    count = opened_above.count()
    print(f'The opening price of {stock_name} was higher than ${price} for {count} days.')

In [13]:
# Days on which AGN opened above the $35 threshold from the Q2 statement.
days_higher_than(agn_df, price=35, stock_name='AGN')

The opening price of AGN was higher than $35 for 2071 days.


In [14]:
# Days on which AINV opened above the $35 threshold from the Q2 statement.
days_higher_than(ainv_df, price=35, stock_name='AINV')

The opening price of AINV was higher than $35 for 0 days.


In [15]:
# Days on which ALE opened above the $35 threshold from the Q2 statement.
days_higher_than(ale_df, price=35, stock_name='ALE')

The opening price of ALE was higher than $35 for 1667 days.


### Q3 - The days each stock had the highest opening price and the highest volume

In [16]:
def highest_opening_day(df: DataFrame, stock_name: str):
    """Build a sentence naming the date on which ``Open`` was at its maximum.

    Sorts by ``Open`` descending and reads the top row, so when several rows
    share the maximum only one of them is reported.
    """
    by_open_desc = df.sort(F.desc('Open'))
    row = by_open_desc.select('Date', 'Open').head()
    return f"The highest opening price of {stock_name} was ${row['Open']} on {row['Date']}."

def highest_volume_day(df: DataFrame, stock_name: str):
    """Build a sentence naming the date on which ``Volume`` was at its maximum.

    Sorts by ``Volume`` descending and reads the top row, so when several rows
    share the maximum only one of them is reported.
    """
    by_volume_desc = df.sort(F.desc('Volume'))
    row = by_volume_desc.select('Date', 'Volume').head()
    return f"The highest trading volume of {stock_name} was {row['Volume']} on {row['Date']}."

In [17]:
# Q3 for AGN: day of the highest open, then day of the highest volume.
print(highest_opening_day(df=agn_df, stock_name='AGN'))
print(highest_volume_day(df=agn_df, stock_name='AGN'))

The highest opening price of AGN was $334.08 on 2015-07-30.
The highest trading volume of AGN was 36807460 on 2016-04-05.


In [18]:
# Q3 for AINV: day of the highest open, then day of the highest volume.
print(highest_opening_day(df=ainv_df, stock_name='AINV'))
print(highest_volume_day(df=ainv_df, stock_name='AINV'))

The highest opening price of AINV was $11.474 on 2007-02-20.
The highest trading volume of AINV was 57522365 on 2014-02-28.


In [19]:
# Q3 for ALE: day of the highest open, then day of the highest volume.
print(highest_opening_day(df=ale_df, stock_name='ALE'))
print(highest_volume_day(df=ale_df, stock_name='ALE'))

The highest opening price of ALE was $80.0 on 2017-11-01.
The highest trading volume of ALE was 2118295 on 2014-02-27.


### Q4 - The years each stock had the highest opening price and the lowest closing price

In [20]:
def highest_opening_year(df: DataFrame, stock_name: str):
    """Print the calendar year of the row with the largest ``Open`` value.

    The year is extracted from ``Date`` on the top row after sorting by
    ``Open`` descending. Nothing is returned.
    """
    year_and_open = df.sort(F.desc('Open')).select(F.year('Date').alias('Year'), 'Open')
    row = year_and_open.head()
    print(f"The highest opening price of {stock_name} was ${row['Open']} in {row['Year']}.")

def lowest_closing_year(df: DataFrame, stock_name: str):
    """Print the calendar year of the row with the smallest non-null ``Close``.

    Args:
        df: Price history with at least ``Date`` and ``Close`` columns.
        stock_name: Label used in the printed sentence.

    Prints a "no data" message instead of failing when ``df`` is empty or
    every ``Close`` value is null. Nothing is returned.
    """
    # Spark places NULLs first in an ascending sort, so a single missing Close
    # would otherwise be picked as the "lowest" price. Push NULLs to the end.
    row = (
        df.orderBy(F.col('Close').asc_nulls_last())
        .select(F.year('Date').alias('Year'), 'Close')
        .first()
    )

    # first() yields None for an empty frame; a null Close here means that
    # every row was null, so there is no price to report.
    if row is None or row['Close'] is None:
        print(f"No closing prices available for {stock_name}.")
        return

    print(f"The lowest closing price of {stock_name} was ${row['Close']} in {row['Year']}.")

In [21]:
# Q4 for AGN: year of the highest open, then year of the lowest close.
highest_opening_year(df=agn_df, stock_name='AGN')
lowest_closing_year(df=agn_df, stock_name='AGN')

The highest opening price of AGN was $334.08 in 2015.
The lowest closing price of AGN was $20.575 in 2008.


In [22]:
# Q4 for AINV: year of the highest open, then year of the lowest close.
highest_opening_year(df=ainv_df, stock_name='AINV')
lowest_closing_year(df=ainv_df, stock_name='AINV')

The highest opening price of AINV was $11.474 in 2007.
The lowest closing price of AINV was $0.9797 in 2009.


In [23]:
# Q4 for ALE: year of the highest open, then year of the lowest close.
highest_opening_year(df=ale_df, stock_name='ALE')
lowest_closing_year(df=ale_df, stock_name='ALE')

The highest opening price of ALE was $80.0 in 2017.
The lowest closing price of ALE was $18.29 in 2009.


## Stop

In [24]:
# Stop the Spark session created in the Setup section.
spark.stop()