<a href="https://colab.research.google.com/github/dansarmiento/python_analytics_solutions/blob/main/Spark_ML_Pipeline_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark==3.5 -q
!pip install findspark -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Suppress warnings generated by the code in this notebook.
# Only the warning filter is changed; warnings.warn itself is left intact so
# that code relying on warnings.catch_warnings(record=True) keeps working.
import warnings

warnings.filterwarnings('ignore')

# FindSpark simplifies the process of using Apache Spark with Python.
# init() is called without arguments, so it relies on its own defaults to
# locate the Spark installation — presumably the pip-installed pyspark from
# the first cell; confirm if the environment changes.

import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StandardScaler

In [4]:
# Obtain the Spark session that every cell below works through
spark = (
    SparkSession.builder
    .appName("Practice Project")
    .getOrCreate()
)

In [5]:
# get the data
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-BD0231EN-Coursera/datasets/mpg-raw.csv

--2025-05-25 02:27:47--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-BD0231EN-Coursera/datasets/mpg-raw.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14354 (14K) [text/csv]
Saving to: ‘mpg-raw.csv’


2025-05-25 02:27:47 (285 MB/s) - ‘mpg-raw.csv’ saved [14354/14354]



In [6]:
# Read the downloaded CSV into a Spark DataFrame: the first row provides the
# column names and the column types are inferred from the data
df = spark.read.csv(
    "mpg-raw.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# Count the cars for each Origin value and list the groups by ascending count
origin_counts = df.groupBy('Origin').count()
origin_counts.orderBy('count').show()

+--------+-----+
|  Origin|count|
+--------+-----+
|    NULL|    1|
|European|   70|
|Japanese|   88|
|American|  247|
+--------+-----+



In [8]:
# Remove duplicate rows first, then rows with missing values
# (such as the NULL Origin row visible in the counts above)
df = df.dropDuplicates().dropna()

In [9]:
# Rename columns to remove spaces. Only "Engine Disp" is renamed here; the
# other column names shown by printSchema() further down contain no spaces.
df = df.withColumnRenamed("Engine Disp","Engine_Disp")

In [10]:
# Persist the cleaned DataFrame as Parquet; "overwrite" mode replaces any
# copy left by an earlier run
parquet_writer = df.write.mode("overwrite")
parquet_writer.parquet("mpg-cleaned.parquet")

In [12]:

import os

# The Parquet output is checked as a directory, not as a single file
parquet_dir_present = os.path.isdir("mpg-cleaned.parquet")
print("mpg-cleaned.parquet exists :", parquet_dir_present)

mpg-cleaned.parquet exists : True


In [13]:
# Load data from "mpg-cleaned.parquet" into a dataframe.
# This rebinds `df` to the persisted copy written a few cells above, so the
# rest of the notebook works from the cleaned data on disk.
df = spark.read.parquet("mpg-cleaned.parquet")

In [14]:
# Print the schema of the dataframe (column names, types and nullability)
df.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine_Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [15]:
# Pipeline stage 1: map the string column "Origin" to the numeric column
# "OriginIndex"
indexer = StringIndexer(
    inputCol="Origin",
    outputCol="OriginIndex",
)

In [16]:
# Pipeline stage 2: the vector assembler, which combines the numeric columns
# into a single "features" vector.
# NOTE(review): "OriginIndex" produced by the StringIndexer stage is not listed
# here, so it never reaches the model — confirm this is intended.
feature_columns = [
    'Cylinders',
    'Engine_Disp',
    'Horsepower',
    'Weight',
    'Accelerate',
    'Year',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [17]:
# Pipeline stage 3: scale the assembled "features" vector into "scaledFeatures"
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [18]:
# Pipeline stage 4: linear regression that predicts "MPG" from the scaled
# feature vector
lr = LinearRegression(
    featuresCol="scaledFeatures",
    labelCol="MPG",
)

In [19]:
# Chain the four stages in execution order; later cells index into this order
pipeline = Pipeline(
    stages=[
        indexer,
        assembler,
        scaler,
        lr,
    ]
)

In [20]:
# 70/30 train/test split with a fixed seed
trainingData, testingData = df.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Fit the pipeline on the training split only; the fitted model returned here
# is what the prediction, evaluation and save cells below operate on.
pipelineModel = pipeline.fit(trainingData)

In [23]:
# Part 2 report: names of the first three pipeline stages and the label column
print("Part 2 - Evaluation")

# Each stage's string form is split on "_" and the leading part is kept as
# its display name
ps = []
for stage in pipeline.getStages():
    ps.append(str(stage).split("_")[0])

print("Pipeline Stage 1 = ", ps[0])
print("Pipeline Stage 2 = ", ps[1])
print("Pipeline Stage 3 = ", ps[2])

print("Label column = ", lr.getLabelCol())

Part 2 - Evaluation
Pipeline Stage 1 =  StringIndexer
Pipeline Stage 2 =  VectorAssembler
Pipeline Stage 3 =  StandardScaler
Label column =  MPG


In [24]:
# Predict using the fitted model on the held-out test split. The evaluators
# below read the "prediction" and "MPG" columns from this result.
predictions = pipelineModel.transform(testingData)

In [25]:
# Mean squared error of the test-set predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="mse",
)
mse = evaluator.evaluate(predictions)
print(mse)

12.226745835570984


In [26]:
# Mean absolute error of the test-set predictions
evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="mae",
)
mae = evaluator.evaluate(predictions)
print(mae)

2.84571511301356


In [27]:
# R squared of the test-set predictions
evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(predictions)
print(r2)

0.8018737394895767


In [28]:
# Part 3 report: the three test metrics and the fitted intercept, each
# rounded to two decimals
lrModel = pipelineModel.stages[-1]  # the regression is the last pipeline stage

print("Part 3 - Evaluation")

for label, value in (
    ("Mean Squared Error = ", mse),
    ("Mean Absolute Error = ", mae),
    ("R Squared = ", r2),
    ("Intercept = ", lrModel.intercept),
):
    print(label, round(value, 2))


Part 3 - Evaluation
Mean Squared Error =  12.23
Mean Absolute Error =  2.85
R Squared =  0.8
Intercept =  -17.37


In [29]:
# Save the fitted pipeline model to the "Practice_Project" directory.
# overwrite() keeps this cell re-runnable: without it, save() raises an error
# once the directory exists from a previous run.
pipelineModel.write().overwrite().save("Practice_Project")

In [30]:
# Load the saved model back from the "Practice_Project" directory written by
# the previous cell.
loadedPipelineModel = PipelineModel.load("Practice_Project")

In [31]:
# Predict with the loaded model on the same test split. This rebinds
# `predictions`, replacing the frame produced earlier by the in-memory model.
predictions = loadedPipelineModel.transform(testingData)

In [32]:
# Show the actual MPG next to the predicted value (show() prints the top 20 rows)
predictions.select("MPG","prediction").show()

+----+------------------+
| MPG|        prediction|
+----+------------------+
|10.0| 6.960764577508108|
|11.0| 8.545911819807788|
|12.0|10.226709705747702|
|12.0| 5.446415257213548|
|13.0|21.430212400590097|
|13.0|17.437792078059676|
|13.0|11.245494102903791|
|13.0| 14.18062643349771|
|13.0|  9.95908269168963|
|13.0|11.111417171060364|
|13.0|13.170917811818402|
|13.0|  10.8894398745756|
|13.0| 7.144536211554058|
|13.0| 4.279565485353853|
|13.0| 8.611192450277986|
|14.0|10.356052138542726|
|14.0|16.057308446272668|
|14.0| 12.32766854237568|
|14.0| 10.78736711252116|
|14.0|10.983935628157035|
+----+------------------+
only showing top 20 rows



In [33]:
# Part 4 report: inspect the reloaded pipeline
print("Part 4 - Evaluation")

# Stage positions follow the pipeline definition: index 1 is the
# VectorAssembler and the last stage is the fitted regression model.
inputcolumns = loadedPipelineModel.stages[1].getInputCols()
totalstages = len(loadedPipelineModel.stages)
loadedmodel = loadedPipelineModel.stages[-1]

print("Number of stages in the pipeline = ", totalstages)
# The model was trained on "scaledFeatures", so each coefficient applies to
# the scaled version of the corresponding assembler input column.
for column_name, coefficient in zip(inputcolumns, loadedmodel.coefficients):
    print(f"Coefficient for {column_name} is {round(coefficient,4)}")

Part 4 - Evaluation
Number of stages in the pipeline =  4
Coefficient for Cylinders is 0.119
Coefficient for Engine_Disp is 0.4971
Coefficient for Horsepower is -0.2517
Coefficient for Weight is -5.7923
Coefficient for Accelerate is 0.2369
Coefficient for Year is 2.9258


In [34]:
# Stop the Spark session now that the notebook has finished its work
spark.stop()