## Machine Learning with Spark

In [1]:
from pyspark.sql import SparkSession
# Start a Spark session named 'practice' (or reuse one that already exists),
# then display it as the cell output.
spark = (
    SparkSession.builder
    .appName('practice')
    .getOrCreate()
)
spark

In [26]:
# Load the CSV: the first row supplies the column names and Spark infers
# each column's type from the data.
df_spark = spark.read.csv(
    'simple.csv',
    header=True,
    inferSchema=True,
)
df_spark.show()

+----+---+----------+------+
|Name|age|Experience|Salary|
+----+---+----------+------+
|  x1| 31|        10| 30000|
|  x2| 30|         8| 25000|
|  x3| 29|         4| 22000|
|  x4| 24|         3| 20000|
|  x5| 21|         1| 15000|
|  x6| 23|         2| 18000|
|  x7| 20|         1| 14000|
|  x8| 27|         3| 21000|
+----+---+----------+------+



In [27]:
# Print each column's name, type and nullability to confirm the schema read from the CSV.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [28]:
# List the DataFrame's column names.
df_spark.columns

['Name', 'age', 'Experience', 'Salary']

In [29]:
# In Spark ML the independent features must be combined into a single vector
# column; VectorAssembler does that grouping.
# Categorical variables need to be converted to numeric data before assembling.
from pyspark.ml.feature import VectorAssembler
featureAssembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol="Independent Features",
)

In [30]:
# Transform the dataset with the feature assembler; this adds the assembled vector column.
output = featureAssembler.transform(df_spark)

In [31]:
# Inspect the result: the original columns plus the new 'Independent Features' vector column.
output.show()

+----+---+----------+------+--------------------+
|Name|age|Experience|Salary|Independent Features|
+----+---+----------+------+--------------------+
|  x1| 31|        10| 30000|         [31.0,10.0]|
|  x2| 30|         8| 25000|          [30.0,8.0]|
|  x3| 29|         4| 22000|          [29.0,4.0]|
|  x4| 24|         3| 20000|          [24.0,3.0]|
|  x5| 21|         1| 15000|          [21.0,1.0]|
|  x6| 23|         2| 18000|          [23.0,2.0]|
|  x7| 20|         1| 14000|          [20.0,1.0]|
|  x8| 27|         3| 21000|          [27.0,3.0]|
+----+---+----------+------+--------------------+



In [32]:
# Keep only the independent-feature vector column and the output (target) column.
finalData = output.select('Independent Features', 'Salary')

In [33]:
# Check the modelling DataFrame: the feature vector column next to the Salary column.
finalData.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 22000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
|          [20.0,1.0]| 14000|
|          [27.0,3.0]| 21000|
+--------------------+------+



In [34]:
#import model
from pyspark.ml.regression import LinearRegression

In [35]:
# Train/test split (75% / 25%).
# A fixed seed makes the random split -- and therefore the fitted coefficients
# and the error metrics computed from it -- repeatable when the notebook is
# re-run on the same input. Without it every run produces different numbers.
# NOTE: the dataset has only 8 rows, so either side of the split can be very
# small; choose a different seed if the test set comes out empty.
train_data, test_data = finalData.randomSplit([0.75, 0.25], seed=42)

In [36]:
# Train the model.
# The unfitted estimator gets its own name so that `regressor` only ever refers
# to the fitted model, which is what the later cells inspect and evaluate.
lr_estimator = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
)
regressor = lr_estimator.fit(train_data)

In [37]:
# Fitted weights of the linear model, one per entry of the feature vector (age, Experience).
regressor.coefficients

DenseVector([688.1288, 565.3924])

In [38]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

-182.0925553324607

In [39]:
# Prediction: evaluate the fitted model on the held-out test data.
pred_results = regressor.evaluate(test_data)

In [40]:
# Show the test rows with the model's prediction alongside the actual Salary.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [23.0,2.0]| 18000|16775.653923541253|
|          [24.0,3.0]| 20000| 18029.17505030181|
|          [27.0,3.0]| 21000| 20093.56136820932|
|         [31.0,10.0]| 30000|26803.822937625682|
+--------------------+------+------------------+





In [41]:
# Mean absolute error and mean squared error of the predictions on the test data.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(1824.446680080484, 4105088.2761357725)