# Implementación de regresión lineal en PySpark

In [1]:
from pyspark.sql import SparkSession

# Get (or reuse) a local Spark session running on 4 threads, named 'linear_reg'.
spark = SparkSession.builder.appName('linear_reg').master('local[4]').getOrCreate()

# Bare expression so the notebook displays the session.
spark

22/10/02 09:28:00 WARN Utils: Your hostname, notebook resolves to a loopback address: 127.0.1.1; using 192.168.0.18 instead (on interface wlp9s0)
22/10/02 09:28:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/10/02 09:28:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Load data

In [6]:
# load
import os

# Default CSV location; set LINEAR_REG_DATA_PATH to point at the file on
# machines where this absolute path does not exist.
path_input = os.environ.get(
    "LINEAR_REG_DATA_PATH",
    "/home/walter/Documents/serie-notas/z_data/20221002_linear_regresion_dataset/data.csv",
)

# header=True takes the column names from the first row of the file.
# No schema is supplied here; the numeric types are set in the "Change types" cell.
data = spark.read.csv(path_input, header = True)

# Preview the first 5 rows without truncating cell contents.
data.show(5, False)

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|734  |688  |81   |0.328|0.259|0.418 |
|700  |600  |94   |0.32 |0.247|0.389 |
|712  |705  |93   |0.311|0.247|0.417 |
|734  |806  |69   |0.315|0.26 |0.415 |
|613  |759  |61   |0.302|0.24 |0.378 |
+-----+-----+-----+-----+-----+------+
only showing top 5 rows



# Exploratory

In [31]:
# Dataset shape: number of rows, then number of columns.
row_count, column_count = data.count(), len(data.columns)
print(row_count, column_count)

1232 6


In [32]:
# Print each column's name, type and nullability as a tree.
# NOTE(review): the recorded output lists numeric types and a `label` column,
# so it was produced after the "Change types" cell (In [25]) had been run;
# on a fresh kernel that cell must be executed before this one to match.
data.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)



In [25]:
# Change types
from pyspark.sql.types import IntegerType, DoubleType

integer_type = ['var_1', 'var_2', 'var_3']
float_type = ['var_4', 'var_5', 'output']

# Target type per column name. 'label' is listed as well so that re-running
# this cell after 'output' has already been renamed does not fail.
cast_types = {c: IntegerType() for c in integer_type}
cast_types.update({c: DoubleType() for c in float_type})
cast_types['label'] = DoubleType()

# Cast and rename in a single projection instead of one withColumn call per
# column. Column order is preserved; columns without a target type pass
# through unchanged, and 'output' comes out as 'label'.
data = data.select([
    data[c].cast(cast_types[c]).alias('label' if c == 'output' else c)
    if c in cast_types else data[c]
    for c in data.columns
])

In [34]:
# Summary statistics per column, rendered as a table
summary_stats = data.describe()
summary_stats.show()


+-------+-----------------+-----------------+------------------+--------------------+-------------------+--------------------+
|summary|            var_1|            var_2|             var_3|               var_4|              var_5|               label|
+-------+-----------------+-----------------+------------------+--------------------+-------------------+--------------------+
|  count|             1232|             1232|              1232|                1232|               1232|                1232|
|   mean|715.0819805194806|715.0819805194806| 80.90422077922078| 0.32633116848573285|  0.259272727111427| 0.39734172100177056|
| stddev| 91.5342940441652|93.07993263118064|11.458139049993724|0.015012772053329796|0.01290722968511499|0.033266899150425924|
|    min|              463|              472|                40|  0.2770000100135803|0.21400000154972076|  0.3009999990463257|
|    max|             1009|             1103|               116| 0.37299999594688416| 0.2939999997615814|  0.49

In [35]:
# Show Correlation

import pyspark.sql.functions as F

# Correlation between var_1 and the target column, shown as a one-row table.
corr_var_1_label = F.corr('var_1', 'label')
data.select(corr_var_1_label).show()

+------------------+
|corr(var_1, label)|
+------------------+
|0.9187399601657321|
+------------------+



# Prep for Model in pyspark

In [46]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [50]:
# Build model_df: a single vector column of inputs plus the target column.
features = [f'var_{idx}' for idx in range(1, 6)]  # var_1 ... var_5

# Pack the five input columns into one vector column named 'features'.
vec_assembler = VectorAssembler(outputCol='features', inputCols=features)
features_df = vec_assembler.transform(data)

# Keep only the two columns the regression needs and preview them untruncated.
model_df = features_df.select(['features', 'label'])
model_df.show(n=5, truncate=False)

+---------------------------------------------------------+-------------------+
|features                                                 |label              |
+---------------------------------------------------------+-------------------+
|[734.0,688.0,81.0,0.328000009059906,0.2590000033378601]  |0.4180000126361847 |
|[700.0,600.0,94.0,0.3199999928474426,0.24699999392032623]|0.3889999985694885 |
|[712.0,705.0,93.0,0.3109999895095825,0.24699999392032623]|0.4169999957084656 |
|[734.0,806.0,69.0,0.3149999976158142,0.25999999046325684]|0.41499999165534973|
|[613.0,759.0,61.0,0.3019999861717224,0.23999999463558197]|0.3779999911785126 |
+---------------------------------------------------------+-------------------+
only showing top 5 rows



In [51]:
# build train-test
# 70/30 random split. The fixed seed makes the split, and so the metrics
# reported below, repeatable across re-runs on the same input data.
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

# Model

In [52]:
# model
from pyspark.ml.regression import LinearRegression

# Ordinary linear regression on the assembled 'features' vector against 'label'
# (both column names spelled out; they match the estimator's defaults).
lr = LinearRegression(featuresCol='features', labelCol='label')

# Fit on the training split only; the test split is held out for evaluation.
lr_fitted = lr.fit(train_df)

22/10/02 09:57:44 WARN Instrumentation: [9c15752e] regParam is zero, which might cause numerical instability and overfitting.
22/10/02 09:57:44 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/02 09:57:44 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/10/02 09:57:44 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/10/02 09:57:44 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


# Eval

In [55]:
# print coef
# Coefficient vector of the fitted model; the recorded output has five
# entries, matching the five input columns listed in `features`.
print(lr_fitted.coefficients)

[0.0003369587017994129,5.639903812680925e-05,0.00025517302924750744,-0.7229177737958294,0.5803927116672541]


In [56]:
# print intercept
# Constant term of the fitted linear model.
print(lr_fitted.intercept)

0.1803452510883159


In [57]:
# r2 in training (eval in training)
# Scores the fitted model on the same rows it was trained on; compare with
# the test-set value in the next cell to see how well the fit generalises.
eval_train = lr_fitted.evaluate(train_df)
print(eval_train.r2)

0.876249599311955


In [59]:
# r2 in testing (eval in testing)
# Scores the fitted model on the held-out split; eval_test is reused by the
# RMSE cell that follows.
eval_test = lr_fitted.evaluate(test_df)
print(eval_test.r2)

0.8493379732302191


In [61]:
# RMSE in testing (eval in testing)
# Reads from eval_test, so the previous cell must have been run first.
print(eval_test.rootMeanSquaredError)

0.01258615918477524


Lo anterior muestra los pasos esenciales del proceso. A continuación se repite el mismo flujo, pero usando un pipeline.

# Model with Pipeline

In [70]:
from pyspark.sql.types import IntegerType, DoubleType

# load
import os

# Default CSV location; set LINEAR_REG_DATA_PATH to point at the file on
# machines where this absolute path does not exist.
path_input = os.environ.get(
    "LINEAR_REG_DATA_PATH",
    "/home/walter/Documents/serie-notas/z_data/20221002_linear_regresion_dataset/data.csv",
)

# header=True takes the column names from the first row of the file.
new_df = spark.read.csv(path_input, header = True)

# Change types

integer_type = ['var_1', 'var_2', 'var_3']
float_type = ['var_4', 'var_5', 'output']

# Target type per column name.
cast_types = {c: IntegerType() for c in integer_type}
cast_types.update({c: DoubleType() for c in float_type})

# Cast and rename in a single projection instead of one withColumn call per
# column. Column order is preserved; columns without a target type pass
# through unchanged, and 'output' comes out as 'label'.
new_df = new_df.select([
    new_df[c].cast(cast_types[c]).alias('label' if c == 'output' else c)
    if c in cast_types else new_df[c]
    for c in new_df.columns
])

# split train test
# 70/30 random split. The fixed seed makes the split, and so the pipeline
# metrics below, repeatable across re-runs on the same input data.
# Note: this rebinds train_df / test_df from the first part of the notebook.
train_df, test_df = new_df.randomSplit([0.7, 0.3], seed=42)

In [71]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline

In [72]:
# Build the pipeline: assemble -> scale -> regress.
features = [f'var_{idx}' for idx in range(1, 6)]  # var_1 ... var_5

# Stage 1: pack the raw input columns into a vector column 'out_features'.
stage_1 = VectorAssembler(outputCol='out_features', inputCols=features)
# Stage 2: scale 'out_features' and write the result to 'features'.
stage_2 = StandardScaler(outputCol='features', inputCol='out_features')
# Stage 3: linear regression with default settings, so it reads the
# 'features' and 'label' columns.
stage_3 = LinearRegression()

stages = [stage_1, stage_2, stage_3]
pipeline = Pipeline(stages=stages)

In [73]:
# model fit
# Fits the pipeline stages on the training split. train_df here is the
# one produced by the randomSplit of new_df in the pipeline section (In [70]),
# not the earlier split of model_df; make sure that cell ran last.
model = pipeline.fit(train_df)

22/10/02 10:23:50 WARN Instrumentation: [713894c8] regParam is zero, which might cause numerical instability and overfitting.


In [75]:
# predict (test)
# Run the fitted pipeline over the held-out rows; the result keeps the input
# columns and adds 'out_features', 'features' and 'prediction'.
pred_result = model.transform(test_df)
pred_result.show(n=5, truncate=False)

+-----+-----+-----+-----+-----+-----+------------------------------+----------------------------------------------------------------------------------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|out_features                  |features                                                                                      |prediction         |
+-----+-----+-----+-----+-----+-----+------------------------------+----------------------------------------------------------------------------------------------+-------------------+
|463  |527  |67   |0.284|0.228|0.311|[463.0,527.0,67.0,0.284,0.228]|[4.978771040576806,5.470461860169901,5.77128718287567,18.54647676989447,17.284202823257868]   |0.3116664074117967 |
|468  |746  |52   |0.285|0.225|0.329|[468.0,746.0,52.0,0.285,0.225]|[5.032537466500961,7.743765745136141,4.479207962828879,18.61178126556311,17.05677910189921]   |0.31972134553581627|
|498  |672  |61   |0.288|0.238|0.325|[498.0,672.0,61.0,0.288,0.238]|[5.355136022

In [76]:
# evaluate (test)
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing 'prediction' against 'label'; its own metric is RMSE.
reg_eval = RegressionEvaluator(
    metricName='rmse',
    labelCol='label',
    predictionCol='prediction',
)

# r2 (eval)
# Override the metric for this single call only; reg_eval itself stays on 'rmse'.
acc = reg_eval.evaluate(pred_result, params={reg_eval.metricName: 'r2'})
print(acc)

0.8605285911118769


In [78]:
# rmse (eval)
# No metric override is passed here, so reg_eval uses the metricName it was
# constructed with ('rmse').
rmse = reg_eval.evaluate(pred_result)
print(rmse)

0.012031547226028371
