## Machine Learning en Spark (Pipeline)

In [1]:
from pyspark.context import SparkContext

# Obtain the SparkContext explicitly instead of relying on the `sc` global
# that only the PySpark shell injects. getOrCreate() hands back the context
# that is already running when there is one, so shell sessions are unaffected.
sc = SparkContext.getOrCreate()
print("Running Spark Version %s" % (sc.version))

Running Spark Version 2.4.3


In [2]:
from pyspark.conf import SparkConf

# Build a SparkConf object and print its debug dump of the settings it holds.
conf = SparkConf()
conf_dump = conf.toDebugString()
print(conf_dump)

spark.app.name=PySparkShell
spark.eventLog.dir=file:///G:/spark_outputs/logs/spark-events/
spark.eventLog.enabled=true
spark.history.fs.logDirectory=file:///G:/spark_outputs/logs/spark-events/
spark.master=local[*]
spark.submit.deployMode=client
spark.ui.showConsoleProgress=true


In [3]:
# Read the CSV using its header row and inferred column types, then drop
# the rows with missing values via na.drop().
cars_csv_reader = spark.read.option("header", True).option("inferSchema", True)
df_cars = cars_csv_reader.csv('car-milage.csv').na.drop()

In [6]:
# Features: mpg is modelled from the remaining columns of the data set
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Split the rows with 0.8 / 0.2 weights; the fixed seed keeps the split repeatable.
train_df, test_df = df_cars.randomSplit([0.8, 0.2], seed=42)

# Stage 1: pack the predictor columns into a single "features" vector column.
feature_columns = [
    "displacement", "hp", "torque", "CRatio", "RARatio", "CarbBarrells",
    "NoOfSpeed", "length", "width", "weight", "automatic",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Stage 2: regularized linear regression with "mpg" as the label column.
lr = LinearRegression(
    labelCol="mpg",
    maxIter=1000,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Chain both stages and fit them on the training split.
pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train_df)


In [7]:
from pyspark.ml.evaluation import RegressionEvaluator

# Run the fitted pipeline over the held-out split to get a "prediction" column.
predictions = model.transform(test_df)

# Compare "prediction" against the "mpg" label using the mean squared error.
evaluator = RegressionEvaluator(
    metricName="mse",
    labelCol="mpg",
    predictionCol="prediction",
)
mse = evaluator.evaluate(predictions)

print("mse = %g" % mse)


mse = 5.18619


In [8]:
# Persist the fitted pipeline; overwrite() allows replacing an earlier save
# at the same path.
model_writer = model.write().overwrite()
model_writer.save("models/lr")

In [9]:
# Inspect the parquet data written for the saved LinearRegression stage.
# The stage directory is named "<index>_<uid>", and the uid is regenerated
# every time the estimator is constructed, so the path is derived from the
# fitted pipeline instead of hard-coding the uid of one particular run.
# NOTE: Spark zero-pads the index to the digit count of the number of stages;
# with the two stages used here a plain "%d" matches.
lr_stage_index = len(model.stages) - 1  # the regression is the last stage
lr_stage_dir = "%d_%s" % (lr_stage_index, model.stages[lr_stage_index].uid)
model_data = spark.read.parquet("models/lr/stages/%s/data/" % lr_stage_dir)
model_data.show()

+-----------------+--------------------+-----+
|        intercept|        coefficients|scale|
+-----------------+--------------------+-----+
|24.05245322506605|[-0.0263957057626...|  1.0|
+-----------------+--------------------+-----+



In [10]:
from pyspark.ml import PipelineModel

# Restore the pipeline saved under "models/lr" and score the test split again.
modelLoaded = PipelineModel.read().load("models/lr")
new_predictions = modelLoaded.transform(test_df)
new_predictions.show(n=20, truncate=True)

+-----+------------+---+------+------+-------+------------+---------+------+-----+------+---------+--------------------+------------------+
|  mpg|displacement| hp|torque|CRatio|RARatio|CarbBarrells|NoOfSpeed|length|width|weight|automatic|            features|        prediction|
+-----+------------+---+------+------+-------+------------+---------+------+-----+------+---------+--------------------+------------------+
|13.27|       351.0|148|   243|   8.0|   3.26|           2|        3| 216.1| 78.5|  4715|        1|[351.0,148.0,243....|16.320539906777142|
|13.27|       460.0|223|   366|   8.0|    3.0|           4|        3| 228.0| 79.8|  5430|        1|[460.0,223.0,366....|12.082968096851397|
|13.77|       360.0|195|   295|  8.25|   3.15|           4|        3| 209.3| 77.4|  4215|        1|[360.0,195.0,295....|16.939044397471363|
|18.25|       351.0|143|   255|   8.0|    3.0|           2|        3| 199.9| 74.0|  3890|        1|[351.0,143.0,255....|17.362305794523685|
|19.73|       318.0|