# Linear Regression in PySpark

# Dependencies
## Install pyspark, pandas

In [1]:
# !pip install pyspark

In [2]:
# !pip install pandas

## Import dependencies/packages such as pyspark

In [3]:
import pyspark

In [4]:
import pandas as pd

# Create Spark Session

In [5]:
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder.appName('spark_project').getOrCreate()

### Note: getOrCreate() is important.
Otherwise you have to manually reset kernel everytime, and manually run cells in proper sequence

## Change logging options, to suppress WARNings

In [7]:
spark.sparkContext.setLogLevel("ERROR")

# ALWAYS USE SPARK FUNCTIONS
TO TAKE ADVANTAGE OF SPARK'S EXECUTION SPEED. STAY AWAY FROM USER-DEFINED FUNCTIONS IF POSSIBLE.

In [8]:
spark

## SparkUI hyperlink available

# Data Wrangling, ETL process
Get Data, part of

In [9]:
# df_spark = spark.read.option('header','true').csv("file:///D:/2_R_repo/2_python repo/Spark project/auto-mpg.csv", inferSchema=True)
df_spark = spark.read.csv("file:///D:/2_general_repo/1_public_repo/Spark project/auto-mpg.csv", inferSchema=True, header=True)

Note: without inferSchema, everything is type-string.
df_spark.describe()
DataFrame[summary: string, _c0: string, V1: string, V2: string, V3: string, V4: string, V5: string, V6: string, V7: string, V8: string, V9: string]

## Rename ALL columns
toDF(*new_col_names)
Can be used for multiple (as in less than all) columns

In [10]:
new_col_names = ["sr_no", "mpg", "cyl", "dspl", "hp", "wt", "accl", "yr", "origin", "name"]

Without assignment OR without capturing return value of function, the result is only view, not modification to df.
Also Spark uses RDD, immutable datastructures, so everytime a brand new datastructure is created

In [11]:
df_spark = df_spark.toDF(*new_col_names)

## Show data, .show()
similar to pandas .head()

In [12]:
# Drop extra columns
col_list_to_drop = ("name", "origin")
df_spark = df_spark.drop(*col_list_to_drop)
df_spark.show(2)

+-----+----+---+-----+---+----+----+---+
|sr_no| mpg|cyl| dspl| hp|  wt|accl| yr|
+-----+----+---+-----+---+----+----+---+
|    1|18.0|  8|307.0|130|3504|12.0| 70|
|    2|15.0|  8|350.0|165|3693|11.5| 70|
+-----+----+---+-----+---+----+----+---+
only showing top 2 rows



## Dimensions of dataset, shape

In [13]:
(df_spark.count() , len(df_spark.columns))

(225, 8)

## Get data_types in DataFrame, .dtypes()

In [14]:
df_spark.dtypes

[('sr_no', 'int'),
 ('mpg', 'double'),
 ('cyl', 'int'),
 ('dspl', 'double'),
 ('hp', 'string'),
 ('wt', 'int'),
 ('accl', 'double'),
 ('yr', 'int')]

## Null value handling

In [15]:
from pyspark.sql.functions import isnan, when, count, col

In [16]:
df_spark.na.drop(how="all")

DataFrame[sr_no: int, mpg: double, cyl: int, dspl: double, hp: string, wt: int, accl: double, yr: int]

In [17]:
df_spark.select( [ count( when( col(c).isNull(), c)).alias(c) for c in df_spark.columns]).show()

+-----+---+---+----+---+---+----+---+
|sr_no|mpg|cyl|dspl| hp| wt|accl| yr|
+-----+---+---+----+---+---+----+---+
|    0|  0|  0|   0|  0|  0|   0|  0|
+-----+---+---+----+---+---+----+---+



# PIPELINE

### Input Column list
Define column lists that will be transformed, or converted to categorical variable from original data_type

In [18]:
input_col_list = ['cyl', 'yr']

In [19]:
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler

In [20]:
from pyspark.ml import Pipeline

## Initialize empty list of Stages
...in Pipeline

In [21]:
stages_list = []

## Append Stages, StringIndexer

## Append Stages, IndexToString

In [22]:
for col in input_col_list:
    # recast to string_index type from original type. ### Convert to String first.
    # NOTE: It really converts to string_type that is indexed by frequency, max_frequency is given index 0.
    stages_list.append(StringIndexer(inputCol=col, outputCol=col + '_str_ix', handleInvalid='skip'))
    # recast to categorical variable from string_index
    stages_list.append(IndexToString(inputCol=col + '_str_ix', outputCol=col + '_catg'))

## Append Stages, VectorAssembler

### Select features using column names list

In [23]:
predictor_cols = ["dspl", "wt", "accl" ]

Transform DataFrame with vector-assembled features

In [24]:
stages_list.append(VectorAssembler(inputCols=[col for col in predictor_cols], outputCol='features'))

Note: VectorAssembler_object.transform(DataFrame) returns a new DataFrame

Note: outputCol in VectorAssembler is the name of column,
containing "vector" of input or independent features or predictors.
It is added as a new column to original dataframe.

## Form Pipeline

In [25]:
from pyspark.ml.regression import LinearRegression

In [26]:
lm_model = LinearRegression(featuresCol='features', labelCol='mpg')
stages_list.append(lm_model)

In [27]:
print(stages_list)

[StringIndexer_3ddee2487869, IndexToString_7694d43ffeba, StringIndexer_eb26ee939760, IndexToString_42a97236ee99, VectorAssembler_3aeaa4579cfd, LinearRegression_84c30c2efb17]


### Train-Test Split

In [28]:
train_data, test_data = df_spark.randomSplit([0.75,0.25], seed=2022)

### Train pipeline on train_data

In [29]:
# Assemble pipeline
pipeline = Pipeline(stages=stages_list)
# Estimator fit , Train the model
pipeline_model = pipeline.fit(train_data)
# Transformer fit, Make Predictions
df_train_preds = pipeline_model.transform(train_data)
df_test_preds = pipeline_model.transform(test_data)
df_test_preds.show(2)

+-----+----+---+-----+---+----+----+---+----------+--------+---------+-------+-------------------+------------------+
|sr_no| mpg|cyl| dspl| hp|  wt|accl| yr|cyl_str_ix|cyl_catg|yr_str_ix|yr_catg|           features|        prediction|
+-----+----+---+-----+---+----+----+---+----------+--------+---------+-------+-------------------+------------------+
|    2|15.0|  8|350.0|165|3693|11.5| 70|       1.0|       8|      4.0|     70|[350.0,3693.0,11.5]|15.463128434480147|
|    3|18.0|  8|318.0|150|3436|11.0| 70|       1.0|       8|      4.0|     70|[318.0,3436.0,11.0]|17.026710273595334|
+-----+----+---+-----+---+----+----+---+----------+--------+---------+-------+-------------------+------------------+
only showing top 2 rows



In [30]:
(train_data.count() , len(train_data.columns))

(167, 8)

In [31]:
(test_data.count(), len(test_data.columns))

(58, 8)

### Drop extra columns

In [32]:
col_list_to_drop = ("cyl", "yr", "cyl_str_ix", "yr_str_ix", "name", "origin", "dspl", "hp", "wt", "accl","cyl_catg", "yr_catg")

In [33]:
df_train_preds = df_train_preds.drop(*col_list_to_drop)
df_train_preds.show(2)

+-----+----+-------------------+------------------+
|sr_no| mpg|           features|        prediction|
+-----+----+-------------------+------------------+
|    1|18.0|[307.0,3504.0,12.0]|16.945609026333628|
|    4|16.0|[304.0,3433.0,12.0]|17.272056648623412|
+-----+----+-------------------+------------------+
only showing top 2 rows



In [34]:
df_test_preds = df_test_preds.drop(*col_list_to_drop)
df_test_preds.show(2)

+-----+----+-------------------+------------------+
|sr_no| mpg|           features|        prediction|
+-----+----+-------------------+------------------+
|    2|15.0|[350.0,3693.0,11.5]|15.463128434480147|
|    3|18.0|[318.0,3436.0,11.0]|17.026710273595334|
+-----+----+-------------------+------------------+
only showing top 2 rows



In [35]:
df_test_preds.printSchema()

root
 |-- sr_no: integer (nullable = true)
 |-- mpg: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



Refer above code to know that "features" is vector of predictor_cols = ["dspl", "wt", "accl" ]

# Results, Evaluator

In [36]:
from pyspark.ml.evaluation import RegressionEvaluator

## Evaluator args

In [37]:
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="mpg")

## R2 for train_data

In [38]:
dataset = df_train_preds.select("mpg","features","prediction")
round(evaluator.evaluate(dataset, {evaluator.metricName: "r2"}),3)

0.822

## R2 for test_data

In [39]:
dataset = df_test_preds.select("mpg","features","prediction")
round(evaluator.evaluate(dataset, {evaluator.metricName: "r2"}),3)

0.821

## MAE for train_data
Mean Absolute Error

In [40]:
dataset = df_train_preds.select("mpg","features","prediction")
round(evaluator.evaluate(dataset, {evaluator.metricName: "mae"}),3)

1.897

## MAE for test_data
Mean Absolute Error

In [41]:
dataset = df_test_preds.select("mpg","features","prediction")
round(evaluator.evaluate(dataset, {evaluator.metricName: "mae"}),3)

2.227

## RMSE, train_data
Root Mean Squared Error

In [42]:
dataset = df_train_preds.select("mpg","features","prediction")
round(evaluator.evaluate(dataset, {evaluator.metricName: "mae"}),3)

1.897

## RMSE, test_data
Root Mean Squared Error

In [43]:
dataset = df_test_preds.select("mpg","features","prediction")
round(evaluator.evaluate(dataset, {evaluator.metricName: "mae"}),3)

2.227