In [1]:
import pandas as pd
import pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
spark=SparkSession.builder.config("spark.driver.memory","12g").config("spark.memory.offHeap.enabled","true") .config("spark.memory.offHeap.size","10g").appName('task3_regression_ml_with_spark').getOrCreate()

23/04/30 10:33:00 WARN Utils: Your hostname, Adedayos-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.2.27 instead (on interface en0)
23/04/30 10:33:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/30 10:33:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/30 10:33:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
spark

In [5]:
file_path = './ml_input_data.parquet/*.parquet'
pyspark_df = spark.read.parquet(file_path, header=True, inferSchema=True)
pyspark_df.show(3)

[Stage 1:>                                                          (0 + 1) / 1]

+------+----------+--------------+---------------------+
|Symbol|Volume_New|vol_moving_avg|adj_close_rolling_med|
+------+----------+--------------+---------------------+
|    AB|  471200.0|          null|                  NaN|
|    AB|  430000.0|      471200.0|           0.03649914|
|    AB|  430000.0|      450600.0|           0.03694973|
+------+----------+--------------+---------------------+
only showing top 3 rows



                                                                                

In [6]:
pyspark_df = pyspark_df.na.drop(how='any')

In [7]:
pyspark_df.show(5)

+------+----------+-----------------+---------------------+
|Symbol|Volume_New|   vol_moving_avg|adj_close_rolling_med|
+------+----------+-----------------+---------------------+
|    AB|  430000.0|         471200.0|           0.03649914|
|    AB|  430000.0|         450600.0|           0.03694973|
|    AB|  430000.0|443733.3333333333|          0.037400328|
|    AB|  376400.0|         440300.0|          0.037400328|
|    AB|  364400.0|         427520.0|          0.037400328|
+------+----------+-----------------+---------------------+
only showing top 5 rows



In [119]:
from pyspark.ml.feature import StringIndexer, StringIndexerModel

In [9]:
# build indexer
string_indexer = StringIndexer(inputCol='Symbol', outputCol='Indexed_Symbol')

In [10]:
# learn the model
string_indexer_model = string_indexer.fit(pyspark_df)

                                                                                

In [11]:
# transform the data
pyspark_df_stringindexer = string_indexer_model.transform(pyspark_df)

In [12]:
pyspark_df_stringindexer.show(5)

+------+----------+-----------------+---------------------+--------------+
|Symbol|Volume_New|   vol_moving_avg|adj_close_rolling_med|Indexed_Symbol|
+------+----------+-----------------+---------------------+--------------+
|    AB|  430000.0|         471200.0|           0.03649914|         874.0|
|    AB|  430000.0|         450600.0|           0.03694973|         874.0|
|    AB|  430000.0|443733.3333333333|          0.037400328|         874.0|
|    AB|  376400.0|         440300.0|          0.037400328|         874.0|
|    AB|  364400.0|         427520.0|          0.037400328|         874.0|
+------+----------+-----------------+---------------------+--------------+
only showing top 5 rows



In [15]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=["Indexed_Symbol","vol_moving_avg", "adj_close_rolling_med"],
                                 outputCol="Independent Features")

In [16]:
new_pyspark_df=featureassembler.transform(pyspark_df_stringindexer)

In [17]:
new_pyspark_df.show(5)

+------+----------+-----------------+---------------------+--------------+--------------------+
|Symbol|Volume_New|   vol_moving_avg|adj_close_rolling_med|Indexed_Symbol|Independent Features|
+------+----------+-----------------+---------------------+--------------+--------------------+
|    AB|  430000.0|         471200.0|           0.03649914|         874.0|[874.0,471200.0,0...|
|    AB|  430000.0|         450600.0|           0.03694973|         874.0|[874.0,450600.0,0...|
|    AB|  430000.0|443733.3333333333|          0.037400328|         874.0|[874.0,443733.333...|
|    AB|  376400.0|         440300.0|          0.037400328|         874.0|[874.0,440300.0,0...|
|    AB|  364400.0|         427520.0|          0.037400328|         874.0|[874.0,427520.0,0...|
+------+----------+-----------------+---------------------+--------------+--------------------+
only showing top 5 rows



In [19]:
ml_input_data=new_pyspark_df.select("Independent Features","Volume_New")

In [20]:
ml_input_data.show(5)

+--------------------+----------+
|Independent Features|Volume_New|
+--------------------+----------+
|[874.0,471200.0,0...|  430000.0|
|[874.0,450600.0,0...|  430000.0|
|[874.0,443733.333...|  430000.0|
|[874.0,440300.0,0...|  376400.0|
|[874.0,427520.0,0...|  364400.0|
+--------------------+----------+
only showing top 5 rows



In [None]:
"""
Linear regression modelling
"""

In [22]:
from pyspark.ml.regression import LinearRegression
##train test split
train_data,test_data=ml_input_data.randomSplit([0.8,0.2])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='Volume_New')
regressor=regressor.fit(train_data)

23/04/30 10:47:47 WARN Instrumentation: [ff485157] regParam is zero, which might cause numerical instability and overfitting.
23/04/30 10:48:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/30 10:48:18 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [23]:
### Coefficients
regressor.coefficients

DenseVector([-8.8942, 0.9632, 0.0])

In [24]:
### Prediction
pred_results=regressor.evaluate(test_data)

                                                                                

In [25]:
pred_results.predictions.show(5)

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------+----------+------------------+
|Independent Features|Volume_New|        prediction|
+--------------------+----------+------------------+
|[9.0,1026630.0,0....|  938400.0|1047515.1484261984|
|[9.0,1031623.3333...|  878500.0|1052324.8031515302|
|[9.0,1048916.6666...|  668900.0| 1068982.005230984|
|[9.0,1049250.0,0....| 1108200.0| 1069303.076974998|
|[9.0,1050256.6666...|  998400.0|1070272.7136419206|
+--------------------+----------+------------------+
only showing top 5 rows



                                                                                

In [26]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError, pred_results.rootMeanSquaredError

(436367.8819334006, 38987836190980.22, 6244024.038308967)

In [32]:
test_y_actual_mean = pred_results.predictions.agg({'Volume_New':'mean'})

In [47]:
test_y_actual_mean.select('avg(Volume_New)').show()



+------------------+
|   avg(Volume_New)|
+------------------+
|1107345.1757088697|
+------------------+



                                                                                

In [48]:
(pred_results.rootMeanSquaredError / 1107345.1757088697 ) * 100

563.8733229060072

In [None]:
"""
Random Forest Regression
"""

In [76]:
from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [65]:
RR_Regressor = RandomForestRegressor(featuresCol='Independent Features', labelCol='Volume_New', maxBins=8043)

In [66]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featureassembler, RR_Regressor])

In [67]:
rr_train_data,rr_test_data=pyspark_df_stringindexer.randomSplit([0.8,0.2])

In [68]:
RR_Regressor=pipeline.fit(rr_train_data)

23/04/30 11:37:28 WARN MemoryStore: Not enough space to cache rdd_137_1 in memory! (computed 493.2 MiB so far)
23/04/30 11:37:28 WARN BlockManager: Persisting block rdd_137_1 to disk instead.
23/04/30 11:37:28 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 493.2 MiB so far)
23/04/30 11:37:28 WARN BlockManager: Persisting block rdd_137_0 to disk instead.
23/04/30 11:37:38 WARN MemoryStore: Not enough space to cache rdd_137_7 in memory! (computed 493.2 MiB so far)
23/04/30 11:37:38 WARN MemoryStore: Not enough space to cache rdd_137_3 in memory! (computed 493.2 MiB so far)
23/04/30 11:37:38 WARN BlockManager: Persisting block rdd_137_3 to disk instead.
23/04/30 11:37:38 WARN BlockManager: Persisting block rdd_137_7 to disk instead.
23/04/30 11:37:39 WARN MemoryStore: Not enough space to cache rdd_137_6 in memory! (computed 493.2 MiB so far)
23/04/30 11:37:39 WARN BlockManager: Persisting block rdd_137_6 to disk instead.
23/04/30 11:37:39 WARN MemoryStore: Not 

In [69]:
#Make predictions
predictions = RR_Regressor.transform(rr_test_data)

In [71]:
predictions.show(5)

[Stage 44:>                                                         (0 + 1) / 1]

+------+----------+------------------+---------------------+--------------+--------------------+------------------+
|Symbol|Volume_New|    vol_moving_avg|adj_close_rolling_med|Indexed_Symbol|Independent Features|        prediction|
+------+----------+------------------+---------------------+--------------+--------------------+------------------+
|    AB|    9600.0|           84320.0|            0.8204644|         874.0|[874.0,84320.0,0....| 297903.3446872417|
|    AB|   11000.0|28486.666666666668|            1.3839588|         874.0|[874.0,28486.6666...| 297903.3446872417|
|    AB|   16400.0|43706.666666666664|           0.06333716|         874.0|[874.0,43706.6666...|359376.43049111945|
|    AB|   17200.0| 79093.33333333333|           0.62116843|         874.0|[874.0,79093.3333...| 297903.3446872417|
|    AB|   17400.0|31353.333333333332|            1.3806239|         874.0|[874.0,31353.3333...| 297903.3446872417|
+------+----------+------------------+---------------------+------------

                                                                                

In [80]:
evaluator = RegressionEvaluator()
evaluator.setPredictionCol("prediction")
evaluator.setLabelCol("Volume_New")
evaluator.setMetricName("rmse")
#(PredictionCol='prediction', labelCol='Volume_New')

RegressionEvaluator_93eb9389999f

In [81]:
evaluator2 = RegressionEvaluator()
evaluator2.setPredictionCol("prediction")
evaluator2.setLabelCol("Volume_New")
evaluator2.setMetricName("mae")

RegressionEvaluator_afbe64d17dbb

In [82]:
rmse = evaluator.evaluate(predictions)
rmse

                                                                                

6469058.527134548

In [83]:
(rmse / 1107345.1757088697 ) * 100

584.1953050450926

In [84]:
mae = evaluator2.evaluate(predictions)
mae

                                                                                

723967.8856936437

In [None]:
#MAE: {LR: 436,367.8819334006 , RR: 723,967.8856936437}

In [86]:
import pickle

In [120]:
string_indexer_model.save("./string_indexer/model/mySI")

In [121]:
load_mystringindexer = StringIndexerModel.load("./string_indexer/model/mySI")

In [122]:
data = [('AB',10000,600)]
dataColumns = ["Symbol","vol_moving_avg", "adj_close_rolling_med"]
dataDF = spark.createDataFrame(data=data, schema = dataColumns)
dataDF.printSchema()
dataDF.show(truncate=False)

root
 |-- Symbol: string (nullable = true)
 |-- vol_moving_avg: long (nullable = true)
 |-- adj_close_rolling_med: long (nullable = true)

+------+--------------+---------------------+
|Symbol|vol_moving_avg|adj_close_rolling_med|
+------+--------------+---------------------+
|AB    |10000         |600                  |
+------+--------------+---------------------+



In [123]:
test_df = load_mystringindexer.transform(dataDF)
test_df.show()

+------+--------------+---------------------+--------------+
|Symbol|vol_moving_avg|adj_close_rolling_med|Indexed_Symbol|
+------+--------------+---------------------+--------------+
|    AB|         10000|                  600|         874.0|
+------+--------------+---------------------+--------------+



In [126]:
featureassembler.save('./vector_assembler/model/myVA')


In [127]:
loadedAssembler = VectorAssembler.load('./vector_assembler/model/myVA')

In [129]:
assembler_df = loadedAssembler.transform(test_df)
assembler_df.show()

+------+--------------+---------------------+--------------+--------------------+
|Symbol|vol_moving_avg|adj_close_rolling_med|Indexed_Symbol|Independent Features|
+------+--------------+---------------------+--------------+--------------------+
|    AB|         10000|                  600|         874.0|[874.0,10000.0,60...|
+------+--------------+---------------------+--------------+--------------------+



In [130]:
test_data = assembler_df.select('Independent Features')

In [104]:

regressor.save("./linear_regression/model/myLR")
 

In [95]:
from pyspark.ml.regression import LinearRegressionModel

In [105]:
savedModel= LinearRegressionModel.load("./linear_regression/model/myLR")

In [106]:
savedModel

LinearRegressionModel: uid=LinearRegression_4399efe7fa28, numFeatures=3

In [131]:
import numpy as np
y_pred=savedModel.transform(test_data)
y_pred.show()

+--------------------+-----------------+
|Independent Features|       prediction|
+--------------------+-----------------+
|[874.0,10000.0,60...|60588.14570159758|
+--------------------+-----------------+



In [141]:
prediction = y_pred.select('prediction').toJSON().first()

In [144]:
import json
prediction_json = json.loads(prediction)

In [146]:
prediction_json['prediction'] 

60588.14570159758