#### Import Libraries

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [90]:
!pip install pyspark==2.2.3
from pyspark.sql import types
from pyspark.sql.functions import col
from pyspark import SparkConf, SparkContext



In [0]:
import requests, pandas as pd, numpy as np
from pandas import DataFrame
from io import StringIO
import time, json
from datetime import date
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error
import matplotlib.pylab as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

#### Create Spark Session

In [0]:
# Obtain the SparkContext via getOrCreate() so that an already-running context is
# reused instead of a second one being constructed. Background on this choice:
# http://stackoverflow.com/questions/28999332/how-to-access-sparkcontext-in-pyspark-script
sc = SparkContext.getOrCreate()

#### Load the Apple stock data with features created

In [93]:
from pyspark.sql import SparkSession
# Get (or reuse) the session for this analysis.
spark = (
    SparkSession.builder
    .appName('Stock_price_prediction')
    .getOrCreate()
)

# Load the pre-processed AAPL feature file; the first row is the header and
# column types are inferred from the data. Print the schema for inspection.
df = spark.read.csv("AAPL_processed.csv", inferSchema=True, header=True)
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj_Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- O-L: double (nullable = true)
 |-- dummy_column: string (nullable = true)
 |-- Next_Adj_Close: string (nullable = true)
 |-- DATEFORMAT: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAYOFMONTH: integer (nullable = true)
 |-- DAYOFWEEK: integer (nullable = true)
 |-- WEEKOFYEAR: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- Avg_Close_20: double (nullable = true)
 |-- Avg_Close_10: double (nullable = true)
 |-- Avg_Close_5: double (nullable = true)
 |-- Avg_Close_80: double (nullable = true)
 |-- Avg_Close_0_1_8_15: double (nullable = true)
 |-- Avg_Close_0_1_3_5: double (nullable = true)
 |-- Avg_Close_0_1_5_20_80: double (nullable = true

In [94]:
# Preview the first two rows of the loaded data.
df.show(n=2)

+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+-----------+-----------+--------+--------+------+
|               Date|    Open|    High|     Low|   Close|Adj_Close|   Volume|     O-L|dummy_column|Next_Adj_Close|DATEFORMAT|YEAR|MONTH|DAYOFMONTH|DAYOFWEEK|WEEKOFYEAR|week|Avg_Close_20|Avg_Close_10|Avg_Close_5|Avg_Close_80|Avg_Close_0_1_8_15|Avg_Close_0_1_3_5|Avg_Close_0_1_5_20_80|Avg_Close_3|Avg_Close_2|     O-C|     H-L|Target|
+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+-----------+-----------+--------+--------+------+
|

#### Select features for the Classification Model

In [0]:
# Input columns that are assembled into the model's feature vector.
cols = [
    "Adj_Close",
    "Volume",
    "O-L",
    "Avg_Close_20",
    "Avg_Close_10",
    "Avg_Close_5",
    "Avg_Close_80",
    "Avg_Close_0_1_8_15",
    "Avg_Close_0_1_3_5",
    "Avg_Close_0_1_5_20_80",
    "Avg_Close_3",
    "Avg_Close_2",
    "O-C",
    "H-L",
]

#### Create Vector Assembler for building the model

In [0]:
from pyspark.ml.feature import VectorAssembler
# Transformer that packs the selected columns into a single "features" vector column.
df_assembler = VectorAssembler(outputCol="features", inputCols=cols)

In [118]:
# Apply the assembler to the loaded frame (adds the "features" column) and
# preview the first two rows of the result.
df_final = df_assembler.transform(df)
df_final.show(n=2)

+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+-----------+-----------+--------+--------+------+--------------------+
|               Date|    Open|    High|     Low|   Close|Adj_Close|   Volume|     O-L|dummy_column|Next_Adj_Close|DATEFORMAT|YEAR|MONTH|DAYOFMONTH|DAYOFWEEK|WEEKOFYEAR|week|Avg_Close_20|Avg_Close_10|Avg_Close_5|Avg_Close_80|Avg_Close_0_1_8_15|Avg_Close_0_1_3_5|Avg_Close_0_1_5_20_80|Avg_Close_3|Avg_Close_2|     O-C|     H-L|Target|            features|
+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+---------

#### Splitting the Training and Testing Data

In [119]:
# Test split: rows whose YEAR is after 2017 (i.e. 2018 onward).
# YEAR is an integer column, so compare it with an integer literal instead of
# the string '2017' and avoid depending on Spark's implicit string-to-number cast.
test_df = df_final.where(df_final.YEAR > 2017)
test_df.show(2)

+-------------------+----------+----------+----------+----------+----------+--------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+-----------+-----------+--------+--------+------+--------------------+
|               Date|      Open|      High|       Low|     Close| Adj_Close|  Volume|     O-L|dummy_column|Next_Adj_Close|DATEFORMAT|YEAR|MONTH|DAYOFMONTH|DAYOFWEEK|WEEKOFYEAR|week|Avg_Close_20|Avg_Close_10|Avg_Close_5|Avg_Close_80|Avg_Close_0_1_8_15|Avg_Close_0_1_3_5|Avg_Close_0_1_5_20_80|Avg_Close_3|Avg_Close_2|     O-C|     H-L|Target|            features|
+-------------------+----------+----------+----------+----------+----------+--------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+-------

In [120]:
# Training split: rows whose YEAR is 2017 or earlier (2017 itself is included).
# YEAR is an integer column, so compare it with an integer literal instead of
# the string '2017' and avoid depending on Spark's implicit string-to-number cast.
train_df = df_final.where(df_final.YEAR <= 2017)
train_df.show(2)

+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+-----------+-----------+--------+--------+------+--------------------+
|               Date|    Open|    High|     Low|   Close|Adj_Close|   Volume|     O-L|dummy_column|Next_Adj_Close|DATEFORMAT|YEAR|MONTH|DAYOFMONTH|DAYOFWEEK|WEEKOFYEAR|week|Avg_Close_20|Avg_Close_10|Avg_Close_5|Avg_Close_80|Avg_Close_0_1_8_15|Avg_Close_0_1_3_5|Avg_Close_0_1_5_20_80|Avg_Close_3|Avg_Close_2|     O-C|     H-L|Target|            features|
+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+---------

#### Logistic Regression  Model

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
# Fit a logistic-regression classifier on the training split, using the
# 'Target' column as the label.
log_reg = (
    LogisticRegression(labelCol='Target')
    .fit(train_df)
)

In [123]:
# Evaluate the fitted model on the training split and keep its predictions
# frame; preview the first two rows.
train_results = (
    log_reg
    .evaluate(train_df)
    .predictions
)
train_results.show(n=2)

+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+------------+-----------+------------+------------------+-----------------+---------------------+-----------+-----------+--------+--------+------+--------------------+--------------------+--------------------+----------+
|               Date|    Open|    High|     Low|   Close|Adj_Close|   Volume|     O-L|dummy_column|Next_Adj_Close|DATEFORMAT|YEAR|MONTH|DAYOFMONTH|DAYOFWEEK|WEEKOFYEAR|week|Avg_Close_20|Avg_Close_10|Avg_Close_5|Avg_Close_80|Avg_Close_0_1_8_15|Avg_Close_0_1_3_5|Avg_Close_0_1_5_20_80|Avg_Close_3|Avg_Close_2|     O-C|     H-L|Target|            features|       rawPrediction|         probability|prediction|
+-------------------+--------+--------+--------+--------+---------+---------+--------+------------+--------------+----------+----+-----+----------+---------+----------+----+------------+

In [129]:
# Training rows labelled 1 that the model predicted as 0 (false negatives),
# shown with their class probabilities and without truncating the output.
(
    train_results
    .filter(train_results['Target'] == 1)
    .filter(train_results['prediction'] == 0)
    .select(['Target', 'prediction', 'probability'])
    .show(10, False)
)

+------+----------+----------------------------------------+
|Target|prediction|probability                             |
+------+----------+----------------------------------------+
|1     |0.0       |[0.5110017042184938,0.4889982957815062] |
|1     |0.0       |[0.5093105569654581,0.4906894430345418] |
|1     |0.0       |[0.5061284211516204,0.4938715788483796] |
|1     |0.0       |[0.5266262369583081,0.47337376304169176]|
|1     |0.0       |[0.5329370398658472,0.4670629601341528] |
|1     |0.0       |[0.5386959394046814,0.4613040605953186] |
|1     |0.0       |[0.540610120799119,0.45938987920088104] |
|1     |0.0       |[0.5491281804522494,0.45087181954775063]|
|1     |0.0       |[0.55087914759862,0.44912085240138]     |
|1     |0.0       |[0.5674952642318193,0.43250473576818077]|
+------+----------+----------------------------------------+
only showing top 10 rows



#### Model Evaluation

In [133]:
# Test-set results: evaluate the fitted model on the test split, keep the
# predictions frame, and preview actual vs. predicted labels.
results = (
    log_reg
    .evaluate(test_df)
    .predictions
)
results.select('Target', 'prediction').show(10, truncate=False)

+------+----------+
|Target|prediction|
+------+----------+
|0     |1.0       |
|1     |1.0       |
|1     |1.0       |
|0     |1.0       |
|0     |1.0       |
|0     |1.0       |
|1     |1.0       |
|1     |1.0       |
|0     |1.0       |
|1     |1.0       |
+------+----------+
only showing top 10 rows



In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Confusion-matrix cell counts on the test predictions (1 is the positive class).
# The spelling of `true_postives` is kept because later cells reference that name.
true_postives = results.filter(
    (results['Target'] == 1) & (results['prediction'] == 1)
).count()
true_negatives = results.filter(
    (results['Target'] == 0) & (results['prediction'] == 0)
).count()
false_positives = results.filter(
    (results['Target'] == 0) & (results['prediction'] == 1)
).count()
false_negatives = results.filter(
    (results['Target'] == 1) & (results['prediction'] == 0)
).count()

In [149]:
# Recall = TP / (TP + FN): the share of actual positives that were predicted positive.
recall = true_postives / float(true_postives + false_negatives)
print(recall)

0.7750677506775068


In [151]:
# Precision = TP / (TP + FP): the share of predicted positives that were actually positive.
precision = true_postives / float(true_postives + false_positives)
print(precision)

0.5618860510805501


In [152]:
# Accuracy = (TP + TN) / number of rows in the test predictions.
# Convert the numerator to float *before* dividing so this is true division on
# every Python version; converting after the division would truncate the ratio
# to 0.0 under integer division.
accuracy = float(true_postives + true_negatives) / results.count()
print(accuracy)

0.5342465753424658
