In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, signum

spark = SparkSession.builder.master("local[1]").getOrCreate()


In [2]:
#Need to carry over from day to day
#Dollar bars
df=spark.read.parquet("Allcandlestick.parquet")
df.show(5)

+------+-------+------+-------+-------+------+----------+------+
|Ticker|   Open|  High|    Low|  Close|Volume|      Time|Status|
+------+-------+------+-------+-------+------+----------+------+
|   AAT|  35.82| 35.82|  35.82|  35.82| 304.0|1583965200|    ok|
|    AB|  35.88| 35.88|  35.88|  35.88| 100.0|1582235820|    ok|
|  AAAU|  14.89| 14.89|  14.88|  14.88| 868.0|1568210640|    ok|
|   AAP|157.964|158.58|157.964|158.105|6226.0|1573846380|    ok|
|  AACG|   1.28|  1.34|   1.26|   1.34|5370.0|1591882500|    ok|
+------+-------+------+-------+-------+------+----------+------+
only showing top 5 rows



In [3]:
import sys
df=df.withColumn('DolExch',col('Close')*col('Volume'))
df.select("DolExch").distinct().show(5)
df=df.withColumn("CumDolExch",f.sum("DolExch").over(Window.partitionBy('Ticker').orderBy("Time").rowsBetween(-sys.maxsize,0)))
df.show()
df.select("CumDolExch").distinct().show(5)
df=df.withColumn('DolBars', col('CumDolExch')%(1E8))#Take data pt every 10M dollars exchanged per stock
df=df.withColumn('Mark',f.when(df.DolBars<=df.DolExch,1).otherwise(0))
df.select("Mark").distinct().show()

+-------------+
|      DolExch|
+-------------+
|      231.235|
|  1232713.635|
|   1312601.32|
|1.603866854E7|
|     33301.68|
+-------------+
only showing top 5 rows

+------+-----+-----+-----+-------+------+----------+------+------------------+-----------------+
|Ticker| Open| High|  Low|  Close|Volume|      Time|Status|           DolExch|       CumDolExch|
+------+-----+-----+-----+-------+------+----------+------+------------------+-----------------+
|   AAT|47.64|47.64|47.64|  47.64|1621.0|1562607000|    ok|          77224.44|         77224.44|
|   AAT| 47.8| 47.8| 47.8|   47.8| 200.0|1562607600|    ok|            9560.0|         86784.44|
|   AAT| 47.8|47.97|47.74|  47.97|3076.0|1562607900|    ok|         147555.72|        234340.16|
|   AAT|47.96|47.97|47.89|47.8925|2097.0|1562607960|    ok|       100430.5725|      334770.7325|
|   AAT|47.89|47.91|47.88|  47.88|1744.0|1562608020|    ok|          83502.72|      418273.4525|
|   AAT|47.83|47.83|47.82|  47.82| 800.0|1562608080|   

In [4]:
df=df.withColumn('CumMark',f.sum("Mark").over(Window.partitionBy('Ticker').orderBy("Time").rowsBetween(-sys.maxsize,0)))
df=df.na.drop()
df=df.dropDuplicates()
df.show(20)
df.select("CumMark").distinct().show(10)

+------+-----+-----+-----+-------+------+----------+------+------------------+-----------------+-----------------+----+-------+
|Ticker| Open| High|  Low|  Close|Volume|      Time|Status|           DolExch|       CumDolExch|          DolBars|Mark|CumMark|
+------+-----+-----+-----+-------+------+----------+------+------------------+-----------------+-----------------+----+-------+
|   AAT|47.64|47.64|47.64|  47.64|1621.0|1562607000|    ok|          77224.44|         77224.44|         77224.44|   1|      1|
|   AAT| 47.8| 47.8| 47.8|   47.8| 200.0|1562607600|    ok|            9560.0|         86784.44|         86784.44|   0|      1|
|   AAT| 47.8|47.97|47.74|  47.97|3076.0|1562607900|    ok|         147555.72|        234340.16|        234340.16|   0|      1|
|   AAT|47.96|47.97|47.89|47.8925|2097.0|1562607960|    ok|       100430.5725|      334770.7325|      334770.7325|   0|      1|
|   AAT|47.89|47.91|47.88|  47.88|1744.0|1562608020|    ok|          83502.72|      418273.4525|      41

In [None]:
df=df.withColumn("Volume", f.sum("Volume").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))

df=df.withColumn("Low",f.min("Low").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))
df.show()
print(df.head())
df=df.withColumn("Open",f.first("Open").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))
print(df.head())
df=df.withColumn("Close",f.last("Close").over(Window.partitionBy("CumMark").orderBy("Time").rowsBetween(-sys.maxsize,0)))
df=df.filter(df.Mark==1)

#Add target vector(2 for +change, 1 for no change 0 for -change)
df=df.withColumn("next_val", f.lead(col("Close"),1).over(Window.partitionBy('Ticker').orderBy(df["Time"])))#Need to add partitionBy()
df=df.withColumn("Target", (col("next_val")-col("Close"))/col("Close"))
print(df.head())
df=df.select("Ticker","Time","Open","Low","High","Close","Volume","Target")#.withColumn("Target", 100*(col("next_val")-col("Close"))/col("Close"))
#df=df.drop("next_val").withColumn("Target",1+signum(col("Target")))
df.head()

+------+--------+--------+--------+--------+--------+----------+------+--------------------+--------------------+--------------------+----+-------+
|Ticker|    Open|    High|     Low|   Close|  Volume|      Time|Status|             DolExch|          CumDolExch|             DolBars|Mark|CumMark|
+------+--------+--------+--------+--------+--------+----------+------+--------------------+--------------------+--------------------+----+-------+
|  AAPL|  200.08|  200.16|  200.07|  200.16| 84569.0|1562614140|    ok|       1.692733104E7|   2.6003686751554E9|  368675.15539979935|   1|     26|
|  AAPL|200.1401|200.1401|  199.82|  199.95|183251.0|1562614200|    ok|        1.97314659E7|   2.6201001410554E9|2.0100141055399895E7|   0|     26|
|  AAPL|  199.95|  199.99|  199.82|199.9293|216765.0|1562614260|    ok|        6700430.5602|   2.6268005716156E9| 2.680057161560011E7|   0|     26|
|  AAPL|199.9146|  200.01|  199.82|  199.97|262265.0|1562614320|    ok|           9098635.0|   2.6358992066156E9

In [None]:
from pyspark.ml.feature import VectorAssembler
#Create input features
df_cols=df.columns
df_cols=[ elem for elem in df_cols if elem not in ["Time","Target"]]
#Move features to a single vector
assembler=VectorAssembler(inputCols=df_cols,outputCol="features")
df=assembler.transform(df)
print(df.count())

In [None]:
from pyspark.ml.regression  import RandomForestRegressor
import time




train,test=df.randomSplit([0.5,0.5],seed=1)

print('repartitioning')
train=train.repartition(10)
test=test.repartition(10)
print('End repartition')

start_time=time.time()

dt=RandomForestRegressor(featuresCol='features',
                         labelCol='Target',
                         maxDepth=30,
                         minInstancesPerNode=2)
print('Start training')
dtModel=dt.fit(train)
print('End training')
predictions=dtModel.transform(test)
end_time=time.time()

delta_time = end_time - start_time

# 5. print total run time 
print(f'run-time: {round(delta_time/60.0, 2)}')

predictions.printSchema()

#predictions.select('Time','Target','probability','prediction').show()

In [None]:
#predictions.select('Target','prediction').show()
correct_preds=predictions.filter(col('Target')==col('prediction'))
pc=predictions.count()
print(correct_preds.count()/pc)
print(pc)

In [None]:
#from pyspark.ml.tuning import CrossValidator


In [None]:
toDisplay=predictions.select("Time","Target","prediction")
toDisplay=toDisplay.withColumn("Target",f.round(100*col("Target"),5))
toDisplay=toDisplay.withColumn("prediction",f.round(100*col("prediction"),5)).show(20)
