### covert forext data and save it to csv by pandas

In [None]:
import pandas as pd
df=pd.read_csv('EURUSD1.csv', header = None, sep="\t")
df.columns = ['time','open','high','low','close','volume']
df = df.drop('time', axis=1)
df.to_csv("forex.csv",sep=",",index=False,header=True)

### load data by spark

In [56]:
#create sparksession object
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('lin_reg').getOrCreate()

In [57]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [89]:
#Load the dataset
df=spark.read.csv('forex.csv',inferSchema=True,header=True)

In [96]:
#explore the data
df.printSchema()

root
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)



In [97]:
#view statistical measures of data 
df.describe().show(5,False)

+-------+-------------------+--------------------+--------------------+--------------------+------------------+
|summary|open               |high                |low                 |close               |volume            |
+-------+-------------------+--------------------+--------------------+--------------------+------------------+
|count  |50000              |50000               |50000               |50000               |50000             |
|mean   |1.1621056084000059 |1.1621945742000055  |1.1620166916        |1.1621050774        |271.81078         |
|stddev |0.02222208413534274|0.022230970074184387|0.022212394925094324|0.022221542503365454|1352.8702800252788|
|min    |1.12198            |1.12222             |1.12192             |1.12199             |1                 |
|max    |1.19632            |1.19656             |1.19624             |1.19633             |122732            |
+-------+-------------------+--------------------+--------------------+--------------------+------------

In [100]:
#import vectorassembler to create dense vectors
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [98]:
df.columns

['open', 'high', 'low', 'close', 'volume']

In [101]:
#create the vector assembler 
vec_assmebler=VectorAssembler(inputCols=['open', 'high', 'low', 'close', 'volume'],outputCol='features')

In [102]:
#transform the values
features_df=vec_assmebler.transform(df)
#validate the presence of dense vectors 
features_df.printSchema()

root
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- features: vector (nullable = true)



In [103]:
#view the details of dense vector
features_df.select('features').show(5,False)

+------------------------------------------------------------+
|features                                                    |
+------------------------------------------------------------+
|[1.12456,1.1246399999999999,1.1245399999999999,1.1246,101.0]|
|[1.12461,1.12461,1.1245100000000001,1.12459,68.0]           |
|[1.12458,1.1246,1.12445,1.12447,72.0]                       |
|[1.12447,1.12448,1.12436,1.12438,71.0]                      |
|[1.12437,1.12439,1.12436,1.12438,24.0]                      |
+------------------------------------------------------------+
only showing top 5 rows



In [105]:
#create data containing input features and output column
model_df=features_df.select('features','volume')
model_df.show(5,False)

+------------------------------------------------------------+------+
|features                                                    |volume|
+------------------------------------------------------------+------+
|[1.12456,1.1246399999999999,1.1245399999999999,1.1246,101.0]|101   |
|[1.12461,1.12461,1.1245100000000001,1.12459,68.0]           |68    |
|[1.12458,1.1246,1.12445,1.12447,72.0]                       |72    |
|[1.12447,1.12448,1.12436,1.12438,71.0]                      |71    |
|[1.12437,1.12439,1.12436,1.12438,24.0]                      |24    |
+------------------------------------------------------------+------+
only showing top 5 rows



### Split Data - Train & Test sets

In [106]:
#split the data into 70/30 ratio for train test purpose
train_df,test_df=model_df.randomSplit([0.7,0.3])
print((train_df.count(), len(train_df.columns)))
print((test_df.count(), len(test_df.columns)))

(34844, 2)
(15156, 2)


In [107]:
train_df.describe().show()

+-------+------------------+
|summary|            volume|
+-------+------------------+
|  count|             34844|
|   mean|269.44564343932956|
| stddev|1275.0272174087052|
|    min|                 1|
|    max|            102123|
+-------+------------------+



## Build Linear Regression Model 

In [109]:
#Build Linear Regression model 
lin_Reg=LinearRegression(labelCol='volume')
#fit the linear regression model on training data set 
lr_model=lin_Reg.fit(train_df)
lr_model.intercept

-1.966648742397079e-10

In [110]:
print(lr_model.coefficients)

[4.130356841987603e-10,-3.0030382118057333e-09,2.4681944514026168e-09,2.9140645566448707e-10,0.9999999999999999]


In [111]:
training_predictions=lr_model.evaluate(train_df)

In [124]:
print(training_predictions.meanSquaredError)
print(training_predictions.r2)

1.3928561812065817e-23
1.0


In [114]:
#make predictions on test data 
test_results=lr_model.evaluate(test_df)

In [115]:
#view the residual errors based on predictions 
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
|6.764366844436154...|
|7.105427357601002...|
|6.52278231427772E-12|
|6.437517185986508...|
|6.73594513500575E-12|
|6.764366844436154...|
|6.52278231427772E-12|
|6.764366844436154...|
|6.73594513500575E-12|
|6.693312570860144...|
+--------------------+
only showing top 10 rows



In [123]:
#coefficient of determination value for model
print(test_results.r2)
print(test_results.rootMeanSquaredError)
print(test_results.meanSquaredError)

1.0
3.73197938133693e-12
1.3927670102723974e-23
