In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, signum

spark = SparkSession.builder.master("local[1]").getOrCreate()


In [2]:
#Need to carry over from day to day
#Dollar bars
df=spark.read.parquet("SPYcandlestick.parquet")
df=df.withColumn("Ticker",f.lit("SPY"))
df.show(5)
print(df.count())

+------+--------+-------+------+--------+----------+------+------+
|  Open|    High|    Low| Close|  Volume|      Time|Status|Ticker|
+------+--------+-------+------+--------+----------+------+------+
|326.65|  326.67| 326.63|326.63|791327.0|1578622260|    ok|   SPY|
|295.99|  295.99| 295.99|295.99|   780.0|1571102520|    ok|   SPY|
|306.67|306.6735|306.355|306.45|352821.0|1591139280|    ok|   SPY|
|310.05|  310.09| 310.01|310.02| 56166.0|1574295780|    ok|   SPY|
|308.88|  308.93| 308.87|308.91| 71974.0|1575415920|    ok|   SPY|
+------+--------+-------+------+--------+----------+------+------+
only showing top 5 rows

200255


In [3]:
import sys
df=df.withColumn('DolExch',col('Close')*col('Volume'))
df.select("DolExch").distinct().show(5)
df=df.withColumn("CumDolExch",f.sum("DolExch").over(Window.partitionBy('Ticker').orderBy("Time").rowsBetween(-sys.maxsize,0)))
df.show()
df.select("CumDolExch").distinct().show(5)
df=df.withColumn('DolBars', col('CumDolExch')%(1E7))#Take data pt every 10M dollars exchanged per stock
df=df.withColumn('Mark',f.when(df.DolBars<=df.DolExch,1).otherwise(0))
df.select("Mark").distinct().show()

+-------------+
|      DolExch|
+-------------+
|    526530.09|
|1.232265061E7|
| 6.16455684E7|
|4.233734953E7|
|3.878613142E7|
+-------------+
only showing top 5 rows

+------+------+------+------+-------+----------+------+------+------------------+------------------+
|  Open|  High|   Low| Close| Volume|      Time|Status|Ticker|           DolExch|        CumDolExch|
+------+------+------+------+-------+----------+------+------+------------------+------------------+
|297.53|297.53|297.53|297.53|  100.0|1562167500|    ok|   SPY|29752.999999999996|29752.999999999996|
| 297.5|297.56| 297.5|297.56| 1500.0|1562167620|    ok|   SPY|          446340.0|          476093.0|
|297.51|297.51|297.51|297.51|  100.0|1562167680|    ok|   SPY|           29751.0|          505844.0|
| 297.5| 297.5|297.47| 297.5| 1438.0|1562167740|    ok|   SPY|          427805.0|          933649.0|
| 297.5|297.51|297.47|297.49| 1634.0|1562167800|    ok|   SPY|486098.66000000003|1419747.6600000001|
|297.46|297.47|297.46|2

In [4]:
df=df.withColumn('CumMark',f.sum("Mark").over(Window.partitionBy('Ticker').orderBy("Time").rowsBetween(-sys.maxsize,0)))
df.show(20)
df.select("CumMark").distinct().show(10)

+------+------+------+------+-------+----------+------+------+------------------+------------------+------------------+----+-------+
|  Open|  High|   Low| Close| Volume|      Time|Status|Ticker|           DolExch|        CumDolExch|           DolBars|Mark|CumMark|
+------+------+------+------+-------+----------+------+------+------------------+------------------+------------------+----+-------+
|297.53|297.53|297.53|297.53|  100.0|1562167500|    ok|   SPY|29752.999999999996|29752.999999999996|29752.999999999996|   1|      1|
| 297.5|297.56| 297.5|297.56| 1500.0|1562167620|    ok|   SPY|          446340.0|          476093.0|          476093.0|   0|      1|
|297.51|297.51|297.51|297.51|  100.0|1562167680|    ok|   SPY|           29751.0|          505844.0|          505844.0|   0|      1|
| 297.5| 297.5|297.47| 297.5| 1438.0|1562167740|    ok|   SPY|          427805.0|          933649.0|          933649.0|   0|      1|
| 297.5|297.51|297.47|297.49| 1634.0|1562167800|    ok|   SPY|486098.

In [5]:
df=df.withColumn("Volume", f.sum("Volume").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))

df=df.withColumn("Low",f.min("Low").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))
df.show()
print(df.head())
df=df.withColumn("Open",f.first("Open").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))
print(df.head())
df=df.withColumn("Close",f.last("Close").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))
df=df.filter(df.Mark==1)

#Add target vector(2 for +change, 1 for no change 0 for -change)
df=df.withColumn("next_val", f.lead(col("Close"),1).over(Window.partitionBy('Ticker').orderBy(df["Time"])))#Need to add partitionBy()
df=df.withColumn("label", (col("next_val")-col("Close"))/col("Close"))
print(df.head())
df=df.select("Ticker","Time","Close","Volume","label")#.withColumn("Target", 100*(col("next_val")-col("Close"))/col("Close"))
#df=df.drop("next_val").withColumn("Target",1+signum(col("Target")))
df.head()

+--------+--------+--------+--------+--------+----------+------+------+--------------------+--------------------+------------------+----+-------+
|    Open|    High|     Low|   Close|  Volume|      Time|Status|Ticker|             DolExch|          CumDolExch|           DolBars|Mark|CumMark|
+--------+--------+--------+--------+--------+----------+------+------+--------------------+--------------------+------------------+----+-------+
|  297.24|  297.32|  297.23|  297.27|131122.0|1562175840|    ok|   SPY|       3.897863694E7|    9.201479921487E8|147992.14869999886|   1|     26|
|  297.34|  297.43|  297.33| 297.415| 86366.0|1562176020|    ok|   SPY|       2.568654389E7|    9.941301698587E8| 4130169.858700037|   1|     29|
|  297.79| 297.795|  297.74|297.7515|146296.0|1562360220|    ok|   SPY|4.3559853444000006E7|1.701835819697521...| 8358196.975212097|   1|    474|
|  296.38|  296.43|  296.35|  296.43| 79853.0|1562624820|    ok|   SPY|       2.367082479E7|3.144494718092561...| 4947180.92

Row(Ticker='SPY', Time=1562167500, Close=297.53, Volume=100.0, label=0.0)

In [6]:
#Convert Ticker value to int to include in ML algo, dictionary present (tick_dict) to convert back if needed
tickers=df.select("Ticker").distinct().rdd.flatMap(lambda x: x).collect()

tick_dict = {val : str(idx + 1) for idx, val in enumerate(tickers)} 
print(tick_dict)

from pyspark.sql.types import IntegerType
df=df.replace(to_replace=tick_dict, subset=['Ticker'])

df.printSchema()
df=df.withColumn("Ticker",col("Ticker").cast(IntegerType()))
df.printSchema()

{'SPY': '1'}
root
 |-- Ticker: string (nullable = false)
 |-- Time: long (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- label: double (nullable = true)

root
 |-- Ticker: integer (nullable = true)
 |-- Time: long (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- label: double (nullable = true)



In [7]:
from pyspark.ml.feature import VectorAssembler
#Create input features
df_cols=df.columns
df_cols=[ elem for elem in df_cols if elem not in ["Time","label"]]
#Move features to a single vector
assembler=VectorAssembler(inputCols=df_cols,outputCol="features")
df=assembler.transform(df)
df=df.na.drop()
df=df.dropDuplicates()
print(df.count())
df.show(10)

118183
+------+----------+-------+--------+--------------------+--------------------+
|Ticker|      Time|  Close|  Volume|               label|            features|
+------+----------+-------+--------+--------------------+--------------------+
|     1|1562344620| 297.97| 14200.0|-3.35604255462035...|[1.0,297.97,14200.0]|
|     1|1562365800| 298.14| 53824.0|2.180183806265436...|[1.0,298.14,53824.0]|
|     1|1562696880| 296.26| 47518.0|3.037872139338143E-4|[1.0,296.26,47518.0]|
|     1|1562721000| 297.08| 57480.0|1.346438669719283...|[1.0,297.08,57480.0]|
|     1|1562781000| 299.34|238514.0|8.351707088940372E-5|[1.0,299.34,23851...|
|     1|1562870520| 298.73| 83874.0|-3.34750443544413...|[1.0,298.73,83874.0]|
|     1|1562961660| 299.97| 28375.0|1.000100010000090...|[1.0,299.97,28375.0]|
|     1|1562970600| 300.18|193117.0|1.332533813046187...|[1.0,300.18,19311...|
|     1|1563299640| 300.65| 78073.0|1.663063362714497...|[1.0,300.65,78073.0]|
|     1|1563395400|298.895| 43447.0|-8.364141

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(df)

# Split the data into training and test sets (last 20% held out for testing)
df=df.sort("Time")
trainingData=df.limit(int(df.count()*0.8))
maxTime=trainingData.agg({'Time':'max'}).collect()[0]
maxTime=maxTime["max(Time)"]
testData=df.filter(df.Time>maxTime)
testData.show(10)


# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [3, 10, 20]) \
                              .addGrid(rf.numTrees, [5, 10]) \
                              .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=5)

# Train model.  This also runs the indexer.
model = crossval.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)




KeyboardInterrupt: 

In [None]:
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("R Squared on test data = %g" % r2)

#rfModel = model.stages[1]
#print(rfModel)  # summary only


In [None]:
#r2=0.000213893 for SPY_candlestick random split 80/20%
#r2=-0.000156503 for SPY_candlestick with first 80% being train data, last 20% is test
#r2=9.93879e-05 for "" with first 80% being train data, use 5-fold CV

In [None]:
#RMSE test> RMSE train -> likely overfitting the data

In [None]:
ranFor=RandomForestRegressor(labelCol="label", featuresCol="features")
modelRf=ranFor.fit(trainingData)
print(modelRf.featureImportances) #[Ticker, Close, Volume]


In [None]:
predictions=predictions.repartition(100)
predictions.write.parquet('SPYpred.parquet')

In [None]:
print(dir(rf))

In [None]:
print(df.agg({"Time":'max'}).collect()[0])

In [None]:
int(df.count()*0.8)

In [None]:
df.count()

In [None]:
testData.count()

In [None]:
predictions.filter(predictions.prediction<0).count()

In [None]:
predictions.show(10)

In [None]:
#IDEA: Add length of bar as a feature

In [30]:
#Pred pandas df for timeseriescv
pdDf=df.toPandas()
pdDf=pdDf.rename(columns={"Time": "pred_times"})
pdDf['eval_times']=pdDf['pred_times']+60;
display(pdDf)

Unnamed: 0,Ticker,pred_times,Close,Volume,label,features,eval_times
0,1,1562344620,297.97,14200.0,-0.000336,"[1.0, 297.97, 14200.0]",1562344680
1,1,1562365800,298.14,53824.0,0.000218,"[1.0, 298.14, 53824.0]",1562365860
2,1,1562696880,296.26,47518.0,0.000304,"[1.0, 296.26, 47518.0]",1562696940
3,1,1562721000,297.08,57480.0,0.000135,"[1.0, 297.08, 57480.0]",1562721060
4,1,1562781000,299.34,238514.0,0.000084,"[1.0, 299.34, 238514.0]",1562781060
...,...,...,...,...,...,...,...
118178,1,1592504040,311.65,156630.0,-0.000417,"[1.0, 311.65, 156630.0]",1592504100
118179,1,1592513280,311.21,99791.0,0.000161,"[1.0, 311.21, 99791.0]",1592513340
118180,1,1592587680,314.03,209341.0,0.000446,"[1.0, 314.03, 209341.0]",1592587740
118181,1,1592857380,310.17,70228.0,0.000161,"[1.0, 310.17, 70228.0]",1592857440


In [46]:
#USING MLFLOW library instead
from timeseriescv import cross_validation as cv
crossval=cv.PurgedWalkForwardCV() #Default settings: n_splits=10, n_test_splits=1, min_train_splits=2, max_train_splits=None
crossval=crossval.split(pdDf, pdDf['label'],pdDf['pred_times'],pdDf['eval_times'],True)
print(next(crossval))


(array([    0,     1,     2, ..., 98829, 98830, 98831]), array([98832, 98833, 98834, 98835, 98836, 98837, 98838, 98839, 98840,
       98841, 98842, 98843, 98844, 98845, 98846, 98847, 98848, 98849,
       98850, 98851, 98852, 98853, 98854, 98855, 98856, 98857, 98858,
       98859, 98860, 98861, 98862, 98863, 98864, 98865, 98866, 98867,
       98868, 98869, 98870, 98871, 98872, 98873, 98874, 98875, 98876,
       98877, 98878, 98879, 98880, 98881, 98882, 98883, 98884, 98885,
       98886, 98887, 98888, 98889, 98890, 98891, 98892, 98893, 98894,
       98895, 98896, 98897, 98898, 98899, 98900, 98901, 98902, 98903,
       98904, 98905, 98906, 98907, 98908, 98909, 98910, 98911, 98912,
       98913, 98914, 98915, 98916, 98917, 98918, 98919, 98920, 98921,
       98922, 98923, 98924, 98925, 98926, 98927, 98928, 98929, 98930,
       98931, 98932, 98933, 98934, 98935, 98936, 98937, 98938, 98939,
       98940, 98941, 98942, 98943, 98944, 98945, 98946, 98947, 98948,
       98949, 98950, 98951, 98952

In [65]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
feat_rows=df.select("features").collect()
feat_list=[row.features for row in feat_rows]
label_rows=df.select("label").collect()
label_list=[row.label for row in label_rows]

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rfr=RandomForestRegressor()
rf_cv=GridSearchCV(estimator=rfr, param_grid=random_grid, cv=crossval)
rf_cv.fit(feat_list, label_list)

ValueError: No fits were performed. Was the CV iterator empty? Were there no candidates?

In [78]:
print(next(crossval))

StopIteration: 

In [76]:
print(label_list[2])

0.0003037872139338143


In [79]:
df.write.parquet("CleanedSpy.parquet")