# Dependencies
## Install pyspark, pandas

In [56]:
# !pip install pyspark

In [57]:
# !pip install pandas

## Import dependencies/packages such as pyspark

In [58]:
import pyspark

In [59]:
import pandas as pd

# Create Spark Session

In [60]:
from pyspark.sql import SparkSession

In [61]:
spark = SparkSession.builder.appName('spark_project ').getOrCreate()

## Change logging options, to suppress WARNings

In [62]:
spark.sparkContext.setLogLevel("ERROR")

### Note: getOrCreate() is important.
Otherwise you have to manually reset kernel everytime, and manually run cells in proper sequence

# ALWAYS USE SPARK FUNCTIONS
TO TAKE ADVANTAGE OF SPARK'S EXECUTION SPEED. STAY AWAY FROM USER-DEFINED FUNCTIONS IF POSSIBLE.

In [63]:
spark

## SparkUI hyperlink available

# Get Data, part of Data Wrangling, ETL process

In [64]:
# df_spark = spark.read.option('header','true').csv("file:///D:/2_R_repo/2_python repo/Spark project/auto-mpg.csv", inferSchema=True)
df_spark = spark.read.csv("file:///D:/2_general_repo/1_public_repo/Spark project/auto-mpg.csv", inferSchema=True, header=True)

Note: without inferSchema, everything is type-string.
df_spark.describe()
DataFrame[summary: string, _c0: string, V1: string, V2: string, V3: string, V4: string, V5: string, V6: string, V7: string, V8: string, V9: string]

## Rename ALL columns, toDF(*new_col_names)
Can be used for multiple (as in less than all) columns

In [65]:
new_col_names = ["sr_no", "mpg", "cyl", "dspl", "hp", "wt", "accl", "yr", "origin", "name"]

Without assignment OR without capturing return value of function, the result is only view, not modification to df.
Also Spark uses RDD, immutable datastructures, so everytime a brand new datastructure is created

In [66]:
df_spark = df_spark.toDF(*new_col_names)

## Dimensions of dataset, shape

In [67]:
(df_spark.count() , len(df_spark.columns))

(225, 10)

## Show data, .show()
similar to pandas .head()

In [68]:
df_spark.toDF(*new_col_names).show(5)

+-----+----+---+-----+---+----+----+---+------+--------------------+
|sr_no| mpg|cyl| dspl| hp|  wt|accl| yr|origin|                name|
+-----+----+---+-----+---+----+----+---+------+--------------------+
|    1|18.0|  8|307.0|130|3504|12.0| 70|     1|chevrolet chevell...|
|    2|15.0|  8|350.0|165|3693|11.5| 70|     1|   buick skylark 320|
|    3|18.0|  8|318.0|150|3436|11.0| 70|     1|  plymouth satellite|
|    4|16.0|  8|304.0|150|3433|12.0| 70|     1|       amc rebel sst|
|    5|17.0|  8|302.0|140|3449|10.5| 70|     1|         ford torino|
+-----+----+---+-----+---+----+----+---+------+--------------------+
only showing top 5 rows



## Get data_types in DataFrame, .dtypes()

In [69]:
df_spark.dtypes

[('sr_no', 'int'),
 ('mpg', 'double'),
 ('cyl', 'int'),
 ('dspl', 'double'),
 ('hp', 'string'),
 ('wt', 'int'),
 ('accl', 'double'),
 ('yr', 'int'),
 ('origin', 'int'),
 ('name', 'string')]

## Check for Null values

In [70]:
from pyspark.sql.functions import isnan, when, count, col

In [71]:
df_spark.select( [ count( when( col(c).isNull(), c)).alias(c) for c in df_spark.columns]).show()

+-----+---+---+----+---+---+----+---+------+----+
|sr_no|mpg|cyl|dspl| hp| wt|accl| yr|origin|name|
+-----+---+---+----+---+---+----+---+------+----+
|    0|  0|  0|   0|  0|  0|   0|  0|     0|   0|
+-----+---+---+----+---+---+----+---+------+----+



In [72]:
df_spark.na.drop(how="all")

DataFrame[sr_no: int, mpg: double, cyl: int, dspl: double, hp: string, wt: int, accl: double, yr: int, origin: int, name: string]

# Linear Regression in PySpark

## Pre_process data
to make suitable for analysis

In [73]:
from pyspark.ml.feature import VectorAssembler

Avengers, assemble!

Note: outputCol in VectorAssembler is the name of column,
containing "vector" of input or independent features or predictors.
It is added as a new column to original dataframe.

In [74]:
features_assemble = VectorAssembler( inputCols=['cyl', 'dspl','wt'], outputCol='input_features')

### Vector Assemble features
Transform DataFrame with vector-assembled features

In [75]:
df_features_assemble = features_assemble.transform(df_spark)

In [76]:
type(df_features_assemble)

pyspark.sql.dataframe.DataFrame

Note: VectorAssembler_object.transform(DataFrame) returns a new DataFrame

In [77]:
df_features_assemble.show(3)

+-----+----+---+-----+---+----+----+---+------+--------------------+------------------+
|sr_no| mpg|cyl| dspl| hp|  wt|accl| yr|origin|                name|    input_features|
+-----+----+---+-----+---+----+----+---+------+--------------------+------------------+
|    1|18.0|  8|307.0|130|3504|12.0| 70|     1|chevrolet chevell...|[8.0,307.0,3504.0]|
|    2|15.0|  8|350.0|165|3693|11.5| 70|     1|   buick skylark 320|[8.0,350.0,3693.0]|
|    3|18.0|  8|318.0|150|3436|11.0| 70|     1|  plymouth satellite|[8.0,318.0,3436.0]|
+-----+----+---+-----+---+----+----+---+------+--------------------+------------------+
only showing top 3 rows



Notice the "input_features" column

### Pre_process complete

## Ready for Analysis

In [78]:
df_ready_for_analysis = df_features_assemble.select('mpg','input_features')

In [79]:
df_ready_for_analysis.show(3)

+----+------------------+
| mpg|    input_features|
+----+------------------+
|18.0|[8.0,307.0,3504.0]|
|15.0|[8.0,350.0,3693.0]|
|18.0|[8.0,318.0,3436.0]|
+----+------------------+
only showing top 3 rows



Refer above code to know that "input_features" is vector of inputCols=['cyl', 'dspl','wt']

In [80]:
df_ready_for_analysis.describe().show()

+-------+------------------+
|summary|               mpg|
+-------+------------------+
|  count|               225|
|   mean|19.964444444444446|
| stddev| 6.043264977419209|
|    min|               9.0|
|    max|              36.0|
+-------+------------------+



### Train-Test Split

In [81]:
train_data, test_data = df_ready_for_analysis.randomSplit([0.75,0.25], seed=2022)

In [82]:
train_data.count()

167

In [83]:
test_data.count()

58

In [84]:
df_ready_for_analysis.count()

225

### Import LinearRegression

In [85]:
from pyspark.ml.regression import LinearRegression

### Instantiate object for LinearRegression

In [86]:
lm_object = LinearRegression(featuresCol='input_features', labelCol='mpg')

In [87]:
print( type(lm_object))

<class 'pyspark.ml.regression.LinearRegression'>


### Fit the model to training data

In [88]:
model_lm_fit_train_data = lm_object.fit(dataset=train_data)

In [89]:
print( type(model_lm_fit_train_data))

<class 'pyspark.ml.regression.LinearRegressionModel'>


In [90]:
print( model_lm_fit_train_data)

LinearRegressionModel: uid=LinearRegression_f7ac3460491c, numFeatures=3


### Get RMSE
Root Mean Squared Error

In [91]:
rmse_lm = round(model_lm_fit_train_data.summary.rootMeanSquaredError,3)
print("RMSE= ", rmse_lm)

RMSE=  2.538


### Get MAE
Mean Absolute Error

In [92]:
mae_lm = round(model_lm_fit_train_data.summary.meanAbsoluteError,3)
print("MAE= ", mae_lm)

MAE=  1.996


### Get R-squared

In [93]:
r2_lm = round(model_lm_fit_train_data.summary.r2, 3)
print("R2= ", r2_lm)

R2=  0.811


### Get coefficients for fitted model

In [94]:
model_coeff = model_lm_fit_train_data.coefficients
print("model_coeff= ", [round(i,3) for i in model_coeff])

model_coeff=  [-0.352, -0.009, -0.004]


Note: "input_features" is vector of inputCols=['cyl', 'dspl', 'wt'].
So coefficients above are 3-qty.total, 1 for each of the 3 predictors.

### Get Intercept for fitted model

In [95]:
model_intercept = round(model_lm_fit_train_data.intercept,3)
print("model_intercept= ", model_intercept)

model_intercept=  37.13


### Make Predictions

In [96]:
yhat_model_predicts = model_lm_fit_train_data.evaluate(dataset= test_data)

In [97]:
yhat_model_predicts

<pyspark.ml.regression.LinearRegressionSummary at 0x202e4ec9030>

In [98]:
yhat_model_predicts.predictions.show(3)

+----+------------------+------------------+
| mpg|    input_features|        prediction|
+----+------------------+------------------+
|10.0|[8.0,307.0,4376.0]|13.300805901781231|
|10.0|[8.0,360.0,4615.0]| 11.83110154839293|
|12.0|[8.0,350.0,4456.0]|12.584412779198807|
+----+------------------+------------------+
only showing top 3 rows



In [99]:
yhat_model_predicts_2 = model_lm_fit_train_data.transform(dataset= test_data)

In [100]:
yhat_model_predicts_2.show(3)

+----+------------------+------------------+
| mpg|    input_features|        prediction|
+----+------------------+------------------+
|10.0|[8.0,307.0,4376.0]|13.300805901781231|
|10.0|[8.0,360.0,4615.0]| 11.83110154839293|
|12.0|[8.0,350.0,4456.0]|12.584412779198807|
+----+------------------+------------------+
only showing top 3 rows



In [101]:
from pyspark.ml.evaluation import RegressionEvaluator

In [102]:
lm_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="mpg", metricName="r2")

In [103]:
type( lm_evaluator)

pyspark.ml.evaluation.RegressionEvaluator

In [104]:
r2_lm_test = round(lm_evaluator.evaluate(yhat_model_predicts_2),3)
print("R Squared (R2) on test data = %g" %r2_lm_test )

R Squared (R2) on test data = 0.853


In [105]:
r2_lm_train = round(model_lm_fit_train_data.summary.r2, 3)
print("R Squared (R2) on TRAIN data = ", r2_lm_train)

R Squared (R2) on TRAIN data =  0.811


In [106]:
rmse_lm_test = round(model_lm_fit_train_data.evaluate(test_data).rootMeanSquaredError,3)
print("RMSE on test data = ", rmse_lm_test)

RMSE on test data =  2.494


In [107]:
rmse_lm_train = round(model_lm_fit_train_data.summary.rootMeanSquaredError,3)
print("RMSE on TRAIN data = ", rmse_lm_train)

RMSE on TRAIN data =  2.538


# CREATE PIPELINE

In [108]:
from pyspark.sql.types import StringType

In [109]:
 # df_spark = df_spark.withColumn("cyl", col("cyl").cast(StringType()))

In [110]:
from pyspark.ml.feature import IndexToString

In [111]:
stage_1 = IndexToString(inputCol="cyl", outputCol="cyl_category")

In [112]:
from pyspark.ml import Pipeline

In [113]:
pipeline = Pipeline(stages=[stage_1])

In [114]:
pipeline_model = pipeline.fit(df_spark)
df_spark_updated = pipeline_model.transform(df_spark)
df_spark_updated.show(5)

IllegalArgumentException: requirement failed: Output column cyl already exists.

In [None]:
df_spark.printSchema()