In [1]:
from pyspark.sql import SparkSession
from datetime import date, timedelta, datetime
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, udf, collect_list, dayofyear
from pyspark.sql.types import *


bucket_name = "web-app-project"
spark = SparkSession\
    .builder\
    .appName("price_prediction")\
    .getOrCreate()

today = date.today().isoformat()
prices_path = "s3://" + bucket_name + "/price-data-" + today + ".csv" 
df = spark.read.csv(prices_path, header=True)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4,application_1604345130372_0005,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# we have minute by minute data
df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+-------------------+------+-------+------+-------+------+
|symbol|          timeframe|  open|   high|   low|  close|volume|
+------+-------------------+------+-------+------+-------+------+
|  FLIR|2020-04-01 13:29:00| 30.38|30.3871|30.355|30.3871|  1897|
|  FLIR|2020-04-01 13:28:00| 30.38|  30.38|  30.3|  30.33|  2698|
|  FLIR|2020-04-01 13:27:00| 30.43|  30.44| 30.36|  30.38|  7775|
|  FLIR|2020-04-01 13:26:00| 30.43|  30.46|  30.4|  30.42|  2630|
|  FLIR|2020-04-01 13:25:00|30.325|  30.42|30.325|  30.39|  2309|
|  FLIR|2020-04-01 13:24:00| 30.25|  30.35| 30.25|  30.35|  3129|
|  FLIR|2020-04-01 13:23:00|30.215|  30.25| 30.21|  30.25|  3731|
|  FLIR|2020-04-01 13:22:00| 30.12|  30.19| 30.12|  30.19|  2970|
|  FLIR|2020-04-01 13:21:00| 30.14|  30.15| 30.12|  30.12|  1333|
|  FLIR|2020-04-01 13:20:00| 30.11|  30.13| 30.11|  30.11|  1565|
|  FLIR|2020-04-01 13:19:00| 30.02| 30.125| 30.01| 30.125|  3383|
|  FLIR|2020-04-01 13:18:00| 30.09|  30.12| 30.02|  30.03|  7629|
|  FLIR|20

In [3]:
# over a year for 150 stocks
df.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

46148447

In [4]:
# convert column types
df = df.withColumn("timeframe", col("timeframe").cast(TimestampType()))
for col_name in df.columns[2:]:
    df = df.withColumn(col_name, col(col_name).cast(FloatType()))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# extract day of year from timestamp
df = df.withColumn("dayofyear", dayofyear("timeframe"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# aggregate by day
df = df.groupBy("symbol", "dayofyear") \
.agg({"open": "avg"}) \
.orderBy("symbol", "dayofyear", ascending=[1, 1])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# remove null rows
df = df.filter(df.dayofyear.isNotNull())

# create a features column : list of open prices averaged by day
output = df.groupby('symbol').agg(collect_list('avg(open)').alias("features"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# add a yearly average column
yearly_avg = udf(lambda x: sum(x)/len(x), DoubleType())
output = output.withColumn("yearly_average", yearly_avg("features"))

# convert to vectors for the linear regression model
array_to_vector = udf(lambda x: Vectors.dense(x[0]), VectorUDT())
output = output.withColumn("features", array_to_vector("features"))

output.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+--------------------+------------------+
|symbol|            features|    yearly_average|
+------+--------------------+------------------+
|   ALL|[112.12088938554128]|101.45785856936234|
|   BMY| [63.42082037948107]| 60.46307769685911|
|  CSCO|  [48.1446674601237]| 43.62785568062368|
|  JNPR|[24.663368762447146]|23.243451761090988|
|  SPGI| [276.2612751805505]| 308.9092829158356|
|   TEL| [96.19516385494893]| 87.21687016986051|
|  TTWO|[122.43402517029128]|136.51316238148243|
|   TSN| [90.23361413604967]| 68.99426036304392|
|  ABMD|[169.75242100591245]|215.77113403068725|
|   AXP|[125.81960952491092]|105.83465311852946|
|   CMG| [850.7590249551309]| 960.8799267647728|
|   DGX|[105.55475975016701]|108.70560963625907|
|   GIS|[52.645226509038004]|57.425679998283464|
|   FLT|[288.08307684656563]|260.24610334331174|
|   FRT|[125.75166911776104]|  97.7972130926789|
|   HAS|[104.68668922932943]| 83.62468778130545|
|  INFO|  [75.2559786602893]| 73.81775680953817|
|   OXY|[42.27503182

In [9]:
lr = LinearRegression(labelCol="yearly_average", maxIter=5, regParam=0.2, elasticNetParam=0.8)

train_data, test_data = output.randomSplit([0.8, 0.2])

model = lr.fit(train_data)

results = model.evaluate(test_data)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
results.predictions.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+--------------------+------------------+------------------+
|symbol|            features|    yearly_average|        prediction|
+------+--------------------+------------------+------------------+
|   ALL|[112.12088938554128]|101.45785856936234|107.60294385865248|
|   CMG| [850.7590249551309]| 960.8799267647728| 891.4916065289343|
|  JNPR|[24.663368762447146]|23.243451761090988| 14.78759141434742|
|   TSN| [90.23361413604967]| 68.99426036304392| 84.37480805222025|
|   EQR| [79.89796229771206]| 67.69198634982793|  73.4059724245656|
|  INTC|  [60.7035345922141]|56.410302546541864| 53.03565367141665|
|  MKTX| [379.5526673453195]|433.43359019498274| 391.4181555099817|
|   NOV|[24.983040930044773]|15.388677077269975| 15.12684737187766|
|  DXCM| [217.2046298174791]| 317.2740200902268|219.12433544245056|
|   SHW| [563.9035714571593]| 587.4116650986135| 587.0627840869374|
|  ATVI|[58.944528227870904]| 67.92506071942839|  51.1688868742891|
|   CAG| [33.67942974100283]| 32.96103989589461|

In [11]:
model_path = "s3://" + bucket_name + "/lr_model"
model.save(model_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…