### PySpark ML: linear regression of salary on age and experience

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that the later cells read data through.
spark = (
    SparkSession.builder
    .appName("Project 5")
    .getOrCreate()
)


In [2]:
# Load the CSV, treating the first row as column names and letting Spark
# infer each column's type from the data.
training = spark.read.csv(
    path="proj5.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Print the loaded rows as a text table to sanity-check the CSV contents.
training.show()

+-----+---+---+------+
| name|age|exp|salary|
+-----+---+---+------+
|feras| 32| 10|  2000|
|milad| 65| 40|  3000|
| sara| 24|  3|   700|
|rahaf| 33| 10|  1500|
+-----+---+---+------+



In [4]:
# Print each column's name, inferred type and nullability.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
# List the column names of the loaded DataFrame.
training.columns

['name', 'age', 'exp', 'salary']

In [6]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric predictor columns into one vector column; that column is
# what the regression cell later passes as featuresCol.
assembler = VectorAssembler(
    inputCols=["age", "exp"],
    outputCol="Independent Features",
)

In [7]:
# Append the assembled feature-vector column to the original rows.
output = assembler.transform(training)

In [8]:
# Print the rows with the new "Independent Features" vector column appended.
output.show()

+-----+---+---+------+--------------------+
| name|age|exp|salary|Independent Features|
+-----+---+---+------+--------------------+
|feras| 32| 10|  2000|         [32.0,10.0]|
|milad| 65| 40|  3000|         [65.0,40.0]|
| sara| 24|  3|   700|          [24.0,3.0]|
|rahaf| 33| 10|  1500|         [33.0,10.0]|
+-----+---+---+------+--------------------+



In [9]:
# List the column names after assembly (original columns plus the vector column).
output.columns

['name', 'age', 'exp', 'salary', 'Independent Features']

In [11]:
# Keep only what the model needs: the feature vector and the label.
finalized_data = output.select(
    "Independent Features",
    "salary",
)

In [12]:
# Print the two-column (features, label) frame used for modelling.
finalized_data.show()

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [32.0,10.0]|  2000|
|         [65.0,40.0]|  3000|
|          [24.0,3.0]|   700|
|         [33.0,10.0]|  1500|
+--------------------+------+



In [13]:
from pyspark.ml.regression import LinearRegression

# Fixed seed so the train/test split, and therefore the fitted model and the
# metrics computed from it, is the same on every Restart & Run All. Without a
# seed, randomSplit draws a fresh random partition on each run.
# NOTE: randomSplit assigns rows probabilistically, so on a dataset this small
# the split sizes are not guaranteed to match the 75/25 weights exactly; if
# either side comes out empty, choose a different seed.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Keep the unfitted estimator under its own name. `regressor` is bound only to
# the fitted model, which is what the later cells read
# (.coefficients, .intercept, .evaluate).
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='salary')
regressor = lr_estimator.fit(train_data)

In [14]:
# Fitted coefficients, one per entry of the feature vector (age, exp).
regressor.coefficients

DenseVector([3555.5556, -3877.7778])

In [15]:
# Intercept term of the fitted linear model.
regressor.intercept

-73000.0000031392

In [16]:
# Evaluate the fitted model on the held-out split; the returned summary
# provides the predictions and error metrics read in the following cells.
pred_results = regressor.evaluate(test_data)

In [17]:
# Print the held-out rows next to the model's prediction for each.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|salary|       prediction|
+--------------------+------+-----------------+
|         [33.0,10.0]|  1500|5555.555555728148|
+--------------------+------+-----------------+



In [18]:
# Mean absolute error and mean squared error on the held-out split.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(4055.5555557281477, 16447530.865597446)