In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f
from pyspark.sql.functions import col, signum

# Single-threaded local Spark session; reuses an existing session if one is already running.
spark = (
    SparkSession.builder
    .master("local[1]")
    .getOrCreate()
)
# Load the pre-cleaned dataset as a Spark DataFrame.
df = spark.read.parquet("CleanedSpy.parquet")

In [2]:
# Build the pandas frame handed to timeseriescv:
#   pred_times - the original "Time" column, renamed
#   eval_times - pred_times shifted forward by 60 (change to 300 for 5 min, for example)
pdDf = (
    df.toPandas()
    .rename(columns={"Time": "pred_times"})
    .assign(eval_times=lambda frame: frame['pred_times'] + 60)
)
display(pdDf)

Unnamed: 0,Ticker,pred_times,Close,Volume,label,features,eval_times
0,1,1562611080,296.7300,48808.0,0.000067,"[1.0, 296.73, 48808.0]",1562611140
1,1,1562612340,296.7100,91917.0,0.000337,"[1.0, 296.71, 91917.0]",1562612400
2,1,1562698140,296.6969,88602.0,-0.000023,"[1.0, 296.6969, 88602.0]",1562698200
3,1,1562771880,296.5000,10000.0,-0.000270,"[1.0, 296.5, 10000.0]",1562771940
4,1,1562800800,298.4100,40558.0,0.000005,"[1.0, 298.41, 40558.0]",1562800860
...,...,...,...,...,...,...,...
118178,1,1592847780,307.4000,253004.0,0.000065,"[1.0, 307.4, 253004.0]",1592847840
118179,1,1592857800,310.1900,53208.0,-0.000032,"[1.0, 310.19, 53208.0]",1592857860
118180,1,1592859660,310.5900,69776.0,0.000451,"[1.0, 310.59, 69776.0]",1592859720
118181,1,1592862540,310.5654,57509.0,-0.000372,"[1.0, 310.5654, 57509.0]",1592862600


In [3]:
#Using the timeseriescv library for purged walk-forward cross-validation
from timeseriescv import cross_validation as cv
from sklearn.model_selection import train_test_split
from itertools import tee

# Hold out the last 20% of rows (shuffle=False keeps row order) as the final test set.
train, test=train_test_split(pdDf, test_size=0.2, shuffle=False)
splits=4
# Author's note on the library defaults: n_splits=10, n_test_splits=1, min_train_splits=2, max_train_splits=None
# NOTE(review): n_splits is splits+2, presumably so that `splits` folds remain after min_train_splits - confirm.
splitter=cv.PurgedWalkForwardCV(n_splits=splits+2)
# Materialise the folds once. The original passed a one-shot iterator to the search;
# once a fit (even an interrupted one) consumed it, re-fitting found no folds.
# A list can be iterated again, so the search can be re-fitted without re-running this cell.
# The last positional argument is False here; per the author's note, True would mean
# identical time intervals as opposed to identical sample splits.
crossval=list(splitter.split(train, train['label'],train['pred_times'],train['eval_times'],False))
# Separate iterator over the same folds, consumed with next() by the fold-layout plot.
crossval_backup=iter(crossval)

In [4]:
# Candidate hyper-parameter values for the random forest; the search below samples from these lists.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[1, 2, 3, 4],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Create the base model (seeded so the forest itself is reproducible)
rf = RandomForestRegressor(random_state=1)
# Instantiate the randomized search over param_grid.
# random_state pins which n_iter candidates are sampled, so a fresh
# Restart & Run All evaluates the same hyper-parameter combinations.
rndm_search = RandomizedSearchCV(
    estimator = rf, param_distributions = param_grid, n_iter=2, cv = crossval,
    scoring='r2', n_jobs = -1, random_state=1)

In [6]:
import numpy as np
from sklearn.metrics import r2_score
def evaluate(model, test_features, test_labels):
    """Score a fitted model on the given data, print the result and return it.

    Args:
        model: Object exposing ``predict(test_features)``.
        test_features: Features passed to ``model.predict``.
        test_labels: True targets compared against the predictions.

    Returns:
        The value of ``r2_score(test_labels, predictions)``.
    """
    score = r2_score(test_labels, model.predict(test_features))
    print('Model Performance', 'r2: {:0.4f}.'.format(score), sep='\n')
    return score

In [7]:
import joblib 
from dask.distributed import Client, progress
# Start a Dask client with processes=False: 4 workers x 4 threads, 8GB memory limit.
client = Client(
    processes=False,
    n_workers=4,
    threads_per_worker=4,
    memory_limit='8GB',
)
client

# Run the hyper-parameter search with joblib's work routed through the 'dask' backend.
search_features = train[['Ticker','pred_times','Close','Volume']]
search_target = train['label']
with joblib.parallel_backend('dask'):
    rndm_search.fit(search_features, search_target)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/grantjensen/.local/share/virtualenvs/grantjensen-LUZRbGIG/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-c3c4d1013bdf>", line 8, in <module>
    rndm_search.fit(train[['Ticker','pred_times','Close','Volume']],train['label'])
  File "/Users/grantjensen/.local/share/virtualenvs/grantjensen-LUZRbGIG/lib/python3.7/site-packages/sklearn/utils/validation.py", line 73, in inner_f
    return f(**kwargs)
  File "/Users/grantjensen/.local/share/virtualenvs/grantjensen-LUZRbGIG/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 736, in fit
    self._run_search(evaluate_candidates)
  File "/Users/grantjensen/.local/share/virtualenvs/grantjensen-LUZRbGIG/lib/python3.7/site-packages/sklearn/model_selection/_search.py", line 1531, in _run_search
    random_state=self.random_state))
  File "/Users/grantjensen/.local/s

KeyboardInterrupt: 

In [None]:
import pandas as pd
# Tabulate the search's cv_results_ and show the first rows.
cv_results = pd.DataFrame(rndm_search.cv_results_)
cv_results.head()

In [None]:
# Show the best hyper-parameters found, then score the best estimator on the held-out test rows.
print(rndm_search.best_params_)
best_grid = rndm_search.best_estimator_
holdout_features = test[['Ticker','pred_times','Close','Volume']]
holdout_labels = test['label']
grid_accuracy = evaluate(best_grid, holdout_features, holdout_labels)

In [None]:
import matplotlib.pyplot as plt

#Obtain matrix with rows = fold #, cols = [train end, purge width, test end],
#each expressed as a fraction of the full dataset (pdDf).
#NOTE(review): assumes every fold is a pair of ascending positional index arrays - confirm against timeseriescv.
total_rows=len(pdDf)
data=[]
for i in range(splits):
    train_temp,test_temp=next(crossval_backup)
    #Purge gap runs from the last training index to the FIRST test index (index 0, not 1)
    data.append([train_temp[-1]/total_rows,(test_temp[0]-train_temp[-1])/total_rows,test_temp[-1]/total_rows])
#Need to take transpose of data: rows become Train / Purge / Test, columns become folds
data=list(map(list,zip(*data)))
#Turn the test END position into a bar HEIGHT: test_end - train_end - purge_width.
#(Adding the purge here, as before, made every bar overshoot by twice the purge width.)
import operator
data[2]=list(map(operator.sub, data[2], data[0]))
data[2]=list(map(operator.sub, data[2], data[1]))

#Row names in the same bottom-to-top order as the stacked bars (data[0], data[1], data[2])
rows=('Train','Purge','Test')
columns=['Fold %d'%x for x in range(1,splits+1)]
#columns.append("Holdout")
values=np.arange(0,1,0.1)

colors = plt.cm.BuPu([0.25,0,0.5])
n_rows = len(data)

index = np.arange(len(columns)) + 0.3
bar_width = 0.4

# Initialize the vertical-offset for the stacked bar chart.
y_offset = np.zeros(len(columns))

# Plot bars and create text labels for the table.
# Each table row holds the cumulative height reached after stacking that segment.
cell_text = []
for row in range(n_rows):
    plt.bar(index, data[row], bar_width, bottom=y_offset, color=colors[row])
    y_offset = y_offset + data[row]
    cell_text.append(['%1f' % x for x in y_offset])
# Reverse colors and text labels to display the last value at the top.
colors = colors[::-1]
cell_text.reverse()

# Add a table at the bottom of the axes.
# The row labels are reversed as well, so each label sits next to its own
# colour and values (otherwise 'Train' would label the Test row and vice versa).
the_table = plt.table(cellText=cell_text,
                      rowLabels=rows[::-1],
                      rowColours=colors,
                      colLabels=columns,
                      loc='bottom')

# Adjust layout to make room for the table:
plt.subplots_adjust(left=0.2, bottom=0.2)

plt.ylabel("Cross Validation %")
plt.yticks(values, ['%1.1f' % val for val in values])
plt.xticks([])
plt.title('Cross Validation Train/Test split')

plt.show()

In [None]:
#TODO:
#Add more stocks
#Compare via non k-fold? (or other k-fold types?)