In [2]:
# Base Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure as fgr
import seaborn as sns

In [3]:
df = pd.read_csv("data/dow_jones_index.data", sep=',')
print(df.shape)
print(df.head(5))

(750, 16)
   quarter stock       date    open    high     low   close     volume  \
0        1    AA   1/7/2011  $15.82  $16.72  $15.78  $16.42  239655616   
1        1    AA  1/14/2011  $16.71  $16.71  $15.64  $15.97  242963398   
2        1    AA  1/21/2011  $16.19  $16.38  $15.60  $15.79  138428495   
3        1    AA  1/28/2011  $15.87  $16.63  $15.82  $16.13  151379173   
4        1    AA   2/4/2011  $16.18  $17.39  $16.18  $17.14  154387761   

   percent_change_price  percent_change_volume_over_last_wk  \
0               3.79267                                 NaN   
1              -4.42849                            1.380223   
2              -2.47066                          -43.024959   
3               1.63831                            9.355500   
4               5.93325                            1.987452   

   previous_weeks_volume next_weeks_open next_weeks_close  \
0                    NaN          $16.71           $15.97   
1            239655616.0          $16.19    

In [4]:
df['stock'] = df['stock'].astype('category')
df['date'] = pd.to_datetime(df['date'])

currency_features = df.select_dtypes(include=['object']).columns
df[currency_features] = df[currency_features].replace('[\$,]', '', regex=True).astype(float)

print(df.dtypes)

quarter                                        int64
stock                                       category
date                                  datetime64[ns]
open                                         float64
high                                         float64
low                                          float64
close                                        float64
volume                                         int64
percent_change_price                         float64
percent_change_volume_over_last_wk           float64
previous_weeks_volume                        float64
next_weeks_open                              float64
next_weeks_close                             float64
percent_change_next_weeks_price              float64
days_to_next_dividend                          int64
percent_return_next_dividend                 float64
dtype: object


In [5]:
from sklearn.preprocessing import LabelEncoder

labelencoder=LabelEncoder()
df["stock"]=labelencoder.fit_transform(df["stock"])
df

Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,1,0,2011-01-07,15.82,16.72,15.78,16.42,239655616,3.79267,,,16.71,15.97,-4.428490,26,0.182704
1,1,0,2011-01-14,16.71,16.71,15.64,15.97,242963398,-4.42849,1.380223,239655616.0,16.19,15.79,-2.470660,19,0.187852
2,1,0,2011-01-21,16.19,16.38,15.60,15.79,138428495,-2.47066,-43.024959,242963398.0,15.87,16.13,1.638310,12,0.189994
3,1,0,2011-01-28,15.87,16.63,15.82,16.13,151379173,1.63831,9.355500,138428495.0,16.18,17.14,5.933250,5,0.185989
4,1,0,2011-02-04,16.18,17.39,16.18,17.14,154387761,5.93325,1.987452,151379173.0,17.33,17.37,0.230814,97,0.175029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2,29,2011-05-27,80.22,82.63,80.07,82.63,68230855,3.00424,-21.355713,86758820.0,83.28,81.18,-2.521610,75,0.568801
746,2,29,2011-06-03,83.28,83.75,80.18,81.18,78616295,-2.52161,15.221032,68230855.0,80.93,79.78,-1.420980,68,0.578960
747,2,29,2011-06-10,80.93,81.87,79.72,79.78,92380844,-1.42098,17.508519,78616295.0,80.00,79.02,-1.225000,61,0.589120
748,2,29,2011-06-17,80.00,80.82,78.33,79.02,100521400,-1.22500,8.811952,92380844.0,78.65,76.78,-2.377620,54,0.594786


In [8]:
df=df.dropna() #removing the missing values since they appear in an important attribute
df.shape # The size of the dataset after removing the 30 missing values 

(720, 16)

In [9]:
# Definindo a variável target
target = 'percent_return_next_dividend'
Y = np.asarray(df[target])

In [10]:
features = df.columns.drop([target])
X = np.asarray(df[features])

In [11]:
X

array([[1, 0, Timestamp('2011-01-14 00:00:00'), ..., 15.79, -2.47066, 19],
       [1, 0, Timestamp('2011-01-21 00:00:00'), ..., 16.13, 1.63831, 12],
       [1, 0, Timestamp('2011-01-28 00:00:00'), ..., 17.14, 5.93325, 5],
       ...,
       [2, 29, Timestamp('2011-06-10 00:00:00'), ..., 79.02, -1.225, 61],
       [2, 29, Timestamp('2011-06-17 00:00:00'), ..., 76.78, -2.37762,
        54],
       [2, 29, Timestamp('2011-06-24 00:00:00'), ..., 82.01, 6.67274, 47]],
      dtype=object)

In [None]:
# Standarize features
scaler = StandardScaler()
predictor = scaler.fit_transform(predictor)

#Let us now split the dataset into train & test
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(predictor, target, test_size = 0.30, random_state=0)
print("x_train ",x_train.shape)
print("x_test ",x_test.shape)
print("y_train ",y_train.shape)
print("y_test ",y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import sklearn.tree
from sklearn import tree

In [None]:
import datetime as dt
x_num = pd.to_datetime(x['date'], format="%m/%d/%Y")
x_num=x_num.map(dt.datetime.toordinal)
print (x_num)

In [19]:
##
AAPL=df.filter(df.stock == 1)
print(AAPL)
AAPL=AAPL[["date","open","high","low","volume","close"]]
def RF(AAPL,grid_search=False):
  trainingData, testData = AAPL.randomSplit([0.8, 0.2])
  va = VectorAssembler().setInputCols(trainingData.columns[1:5]).setOutputCol('features')
  rf = RandomForestRegressor(labelCol="Close", featuresCol="features",maxDepth=9,numTrees=200)
  pipe = Pipeline(stages=[va, rf])
  evaluator_rf = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName='rmse')
  if grid_search:
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [2,10,25,50,100,200,500,1000]) \
        .addGrid(rf.maxDepth, [2,4,6,8,9,10,11,12]) \
        .build()


    crossval = CrossValidator(estimator=pipe,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rf,
                              numFolds=3)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters to Train a GBT model.
    model_rf=crossval.fit(trainingData)
    hyperparams = model_rf.getEstimatorParamMaps()[np.argmax(model_rf.avgMetrics)]
    print(hyperparams)
    model_rf = model_rf.bestModel
  else:
    model_rf = pipe.fit(trainingData)

  # Make predictions.
  predictions = model_rf.transform(testData)

  #RMSE 
  rmse = evaluator_rf.evaluate(predictions)
  print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
  # print("Feature importance as of column index:",model_rf.stages[-1].featureImportances)
  ##
  return predictions,testData,rmse
predictions,testData,rmse = RF(AAPL)

#Line plot comparing Prediction versus reality in the test set of AAPL
pred= predictions.toPandas()
test=testData.toPandas()
plt.figure(figsize=(20,7))
plt.plot(np.arange(len(pred)), pred.prediction, alpha=0.7, label='prediction')
plt.plot(np.arange(len(test)), test.Close, alpha=0.7, label='Close' );
plt.title('Prediction versus reality in the test set of AAPL')
plt.legend()

Empty DataFrame
Columns: []
Index: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 109, ...]

[720 rows x 0 columns]


KeyError: "None of [Index(['date', 'open', 'high', 'low', 'volume', 'close'], dtype='object')] are in the [columns]"

In [20]:
from imblearn.over_sampling import SMOTE

ImportError: cannot import name '_OneToOneFeatureMixin' from 'sklearn.base' (c:\Users\p059043\Anaconda3\lib\site-packages\sklearn\base.py)

In [22]:
from sklearn.base import OneToOneFeatureMixin

ImportError: cannot import name 'OneToOneFeatureMixin' from 'sklearn.base' (c:\Users\p059043\Anaconda3\lib\site-packages\sklearn\base.py)