In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, signum

# One shared Spark session for every cell below, using the "local[1]" master.
session_builder = SparkSession.builder.master("local[1]")
spark = session_builder.getOrCreate()


In [2]:
# Dollar bars: resample the time-ordered candlesticks so that each output row
# covers about BAR_DOLLARS of traded value (Close * Volume) instead of one
# input candle. The running total spans the whole file, so it carries over
# from day to day.
BAR_DOLLARS = 1E9

df = spark.read.parquet("SPYcandlestick.parquet").cache()
df = df.repartition(10)
#from datetime import datetime
#df=df.withColumn("Date", f.to_date(f.from_unixtime("Time")))

# Frame "first row .. current row" in Time order. The unbounded sentinel gives
# the same frame as rowsBetween(-df.count(), 0) without running a count() job
# (over an ever-growing lineage) every time a window is defined.
running_total = Window.orderBy("Time").rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)

df = df.withColumn('DolExch', col('Close') * col('Volume'))
df = df.withColumn('CumDolExch', f.sum('DolExch').over(running_total))
df = df.withColumn('DolBars', col('CumDolExch') % BAR_DOLLARS)

# A row is marked when the remainder of the running total is no larger than the
# row's own dollar volume, i.e. this row pushed the total across a multiple of
# BAR_DOLLARS.
# NOTE(review): with "<=" the very first row is marked whenever its own dollar
# volume is below BAR_DOLLARS (its running total equals its own volume), which
# yields a one-row first bar — confirm that this is wanted.
df = df.withColumn('Mark', f.when(col('DolBars') <= col('DolExch'), 1).otherwise(0))
df = df.withColumn('CumMark', f.sum('Mark').over(running_total))

# CumMark already counts the marked row itself, so partitioning on it would put
# the marked row FIRST in its group and the running aggregates below would see
# only that single row. Subtracting Mark makes the marked row the LAST row of
# its bar, so the aggregates at that row cover the whole bar.
df = df.withColumn('BarId', col('CumMark') - col('Mark'))
bar_so_far = (
    Window.partitionBy('BarId')
    .orderBy('Time')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df = df.withColumn("Volume", f.sum("Volume").over(bar_so_far))
df = df.withColumn("High", f.max("High").over(bar_so_far))
df = df.withColumn("Low", f.min("Low").over(bar_so_far))
df = df.withColumn("Open", f.first("Open").over(bar_so_far))
df = df.withColumn("Close", f.last("Close").over(bar_so_far))

# Keep only the closing row of each bar; rows after the last mark belong to an
# unfinished bar and are discarded.
df = df.filter(col('Mark') == 1)

# Target source: percentage move from this row's Close to the next row's Close
# in Time order. (Encoded afterwards as 2 for +change, 1 for no change,
# 0 for -change.)
by_time = Window.orderBy(df["Time"])
df = df.withColumn("next_val", f.lead(col("Close"), 1).over(by_time))
df = df.select("Time", "Open", "Low", "High", "Close", "Volume", "next_val")
df = df.withColumn("Target", 100 * (col("next_val") - col("Close")) / col("Close"))

In [4]:
# Collapse Target to its sign shifted by one:
# 0.0 = negative change, 1.0 = no change, 2.0 = positive change.
df = df.drop("next_val").withColumn("Target", 1 + signum(col("Target")))
# Rows whose Target is null (at least the final row, which has no following
# Close to compare against) cannot be used as labels: Spark ML estimators
# reject null labels, so remove them before any model is fitted.
df = df.na.drop(subset=["Target"])
print(df.take(2))

KeyboardInterrupt: 

In [None]:
from pyspark.ml.feature import VectorAssembler
# Make the cell safe to re-run: a "features" column left over from a previous
# run would be picked up as an input AND collide with the assembler's output
# column. drop() is a no-op when the column does not exist.
df = df.drop("features")
#Create input features: every column except the timestamp and the label
df_cols = [name for name in df.columns if name not in ("Time", "Target")]
#Move features to a single vector
assembler = VectorAssembler(inputCols=df_cols, outputCol="features")
df = assembler.transform(df)
print(df.count())

In [None]:
# DecisionTreeClassifier is the estimator actually used below; without this
# import the cell fails with NameError on a fresh kernel.
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.regression import RandomForestRegressor
import time

# 80/20 random split with a fixed seed so the split is repeatable.
train, test = df.randomSplit([0.8, 0.2], seed=1)

print('repartitioning')
train = train.repartition(10)
test = test.repartition(10)

start_time = time.time()

# Decision tree over the assembled "features" vector, predicting the Target column.
dt = DecisionTreeClassifier(featuresCol='features',
                            labelCol='Target',
                            maxDepth=30,
                            minInstancesPerNode=2)

dtModel = dt.fit(train)
predictions = dtModel.transform(test)
end_time = time.time()

delta_time = end_time - start_time

# Wall-clock time between start_time and end_time, reported in minutes.
print(f'run-time: {round(delta_time/60.0, 2)}')

predictions.printSchema()

#predictions.select('Time','Target','probability','prediction').show()

In [None]:
#predictions.select('Target','prediction').show()
# Accuracy: share of scored rows whose prediction equals the Target label,
# followed by the number of scored rows.
is_hit = col('Target') == col('prediction')
correct_preds = predictions.where(is_hit)
pc = predictions.count()
accuracy = correct_preds.count() / pc
print(accuracy)
print(pc)

In [None]:
#from pyspark.ml.tuning import CrossValidator

