# **PySpark ile Lineer Regresyon**

## Gerekli Kütüphaneler

In [1]:
# findspark.init() must run before pyspark is imported.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Create a session (or reuse an already running one) with a local master
# that uses all available cores ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Read the CSV with the first row as the header and with column types
# inferred from the data rather than defaulting to strings.
dataset = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('BostonHousing.csv')
)

In [3]:
# Print each column's name, inferred type and nullability for the loaded DataFrame.
dataset.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [4]:
# Predictor columns that get packed into a single vector column.
# 'medv' is deliberately absent: it is the value being predicted.
feature_columns = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Attributes')
output = assembler.transform(dataset)

# Keep only the assembled feature vector and the target column.
finalized_data = output.select("Attributes", "medv")
finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



### Verimizi %80 eğitim / %20 test olarak bölüp lineer regresyonu uygulayalım

In [5]:
# Split the rows into roughly 80% training / 20% test. The fixed seed makes the
# split repeatable for the same input data, so the fitted coefficients and the
# evaluation metrics do not change from one run of the notebook to the next.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

# Configure the estimator: features are read from 'Attributes', the label from 'medv'.
lr_estimator = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit on the training set. `regressor` is the fitted model (not the estimator);
# the estimator keeps its own name so this cell can be re-run safely.
regressor = lr_estimator.fit(train_data)

# Evaluate the fitted model on the held-out test set. The returned summary
# exposes the scored rows through `.predictions`.
pred = regressor.evaluate(test_data)

# Show the test rows with the model's prediction next to the actual 'medv'.
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.01381,80.0,0.4...|50.0|41.087749356547604|
|[0.01439,60.0,2.9...|29.1| 31.53021045127438|
|[0.01965,80.0,1.7...|20.1|20.162344681278622|
|[0.02543,55.0,3.7...|23.9|27.833605697448494|
|[0.02731,0.0,7.07...|21.6|24.714295168027167|
|[0.03615,80.0,4.9...|27.9| 32.39602620970229|
|[0.03768,80.0,1.5...|34.6|34.444274961423915|
|[0.04011,80.0,1.5...|33.3| 35.88609237452471|
|[0.0459,52.5,5.32...|22.3|27.141284279505932|
|[0.04666,80.0,1.5...|30.3|32.363110281095906|
|[0.05023,35.0,6.0...|17.1|19.798878044844606|
|[0.05083,0.0,5.19...|22.2| 22.36393094937121|
|[0.05515,33.0,2.1...|36.1|33.718060131619254|
|[0.05735,0.0,4.49...|26.6| 27.75243568091222|
|[0.06127,40.0,6.4...|33.1|  35.4076807492924|
|[0.06211,40.0,1.2...|22.9|20.454348248314652|
|[0.06911,45.0,3.4...|30.5| 30.16427864855687|
|[0.07503,33.0,2.1...|33.4| 35.96323486331119|
|[0.07896,0.0

In [6]:
# Read the learned parameters off the fitted model: the coefficient vector
# (one weight per input feature) and the constant intercept term.
coeff, intr = regressor.coefficients, regressor.intercept

# %a formats the vector with ascii(); %f prints the intercept with six decimals.
print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.1121, 0.0514, 0.0069, 3.0204, -16.5758, 3.9326, -0.0052, -1.5281, 0.3376, -0.0131, -0.8589, 0.0105, -0.5196])
The Intercept of the model is : 33.541287


# Metrikler

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so that Python's built-in eval() is not
# shadowed for the rest of the kernel session.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error (the evaluator's configured metric)
rmse = evaluator.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)

# The remaining metrics reuse the same evaluator and override metricName per call.
# Mean Square Error
mse = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(pred.predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 4.366
MSE: 19.064
MAE: 3.381
r2: 0.759


# Faydalı olması dileğiyle