In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("Chapter27-sample").getOrCreate()

In [2]:
train_raw_df = spark.read.format("csv").option("header", "true")\
                                   .option("inferSchema", "true")\
                                   .option("path", "../data/extra-data/kaggle-housing/train.csv")\
                                   .load().coalesce(5)

In [3]:
VERBOSE = False
LIMIT_FEATURES = None

# Feature engineering

The following steps are taken

* Replace the strings containing number characters (e.g. '1', '2'...) by the reading form (e.g. 'one')
* Select some features, in two steps:
    * Use only those features which are continuous (e.g. data type is integer in the dataset).
    * Hard limit the number of features by user's choice based on value diversity.
* Build pyspark's feature estimators and transformers for later pipelininig

In [4]:
# in order to use RFormula later on, substitute numbers by words (e.g. 1 by One)
translate_dict = {"1":"One", "2":"Two", "3":"Three", "4":"Four", "5":"Five"}
def fix_number (x):
    for k, v in translate_dict.items():
        if k in x:
            return x.replace(k, v) 
    return x
for col_name in train_raw_df.columns:
    train_raw_df = train_raw_df.withColumnRenamed(col_name, fix_number(col_name))

if VERBOSE:
    train_raw_df.printSchema()

In [5]:
# for display purposes, and in order not to enter tokenization and one-hot encoding, let's just take integer features
col_to_keep = [col_name for col_name, col_type in train_raw_df.dtypes if col_type =='int']
train_raw_df = train_raw_df.select(col_to_keep)

if VERBOSE:
    print(train_raw_df.columns)

In [6]:
from pyspark.sql.functions import countDistinct, col, isnull
import pandas as pd

print("Number of records = {}".format(train_raw_df.count()))

spec_df = pd.DataFrame(index=train_raw_df.columns, columns=["distinct_values", "missing_values"])
for c in train_raw_df.columns: 
    spec_df.loc[c, "distinct_values"] = train_raw_df.agg(countDistinct(c).alias("count_distinct")).collect()[0].count_distinct
    spec_df.loc[c, "missing_values"] = train_raw_df.select(isnull(c).alias("is_null")).where(col("is_null")).count()
spec_df = spec_df.sort_values(by="distinct_values", ascending=False)

if VERBOSE: 
    print(spec_df)

Number of records = 1460


In [7]:
# Limit the number of features by selecting the ones with a bigger spectre.
num_features = spec_df.shape[0] if LIMIT_FEATURES is None else LIMIT_FEATURES
features_to_use = spec_df[(spec_df.index!='Id') & (spec_df.index!='SalePrice')].iloc[:num_features].index.values

if VERBOSE:
    print("Using features: {}".format(features_to_use))

In [8]:
# Build pyspark's transformers and estimators
from pyspark.ml.feature import RFormula, StandardScaler

rForm = RFormula(formula="Saleprice ~ " + " + ".join(features_to_use))
std_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
feature_transformers = [rForm, std_scaler]

In [9]:
# Classic train-test split. Note that only train_df will be used for cross-val 
train_df, test_df = train_raw_df.randomSplit([0.8, 0.2])

# Linear Regression (straight)

In [10]:
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

# estimator pipeline
lr_model = LinearRegression(featuresCol="scaledFeatures", labelCol="SalePrice", standardization=False, maxIter=100)
pipe = Pipeline()
pipe.setStages([*feature_transformers, lr_model])

# evaluator
reg_eval = RegressionEvaluator(labelCol="SalePrice", metricName="r2")

# check that the pipe works
pipe_fitted = pipe.fit(train_df)
print("R2 for train data = {}".format(reg_eval.evaluate(pipe_fitted.transform(train_df))))
print("R2 for test data = {}".format(reg_eval.evaluate(pipe_fitted.transform(test_df))))

R2 for train data = 0.8089736349555801
R2 for test data = 0.7926925891268555


In [11]:
# param grid
grid = ParamGridBuilder().addGrid(lr_model.regParam, [0., 1e-3, 0.1, 1., 10., 100.])\
                         .addGrid(lr_model.elasticNetParam, [0., 1e-3, 0.5, 1.]).build()

In [12]:
# this cell is isolated so it can be called back again stand-alone to derive performance
import numpy as np

#tvs
tvs = TrainValidationSplit().setTrainRatio(0.8)\
                            .setEstimatorParamMaps(grid)\
                            .setEstimator(pipe)\
                            .setEvaluator(reg_eval)
tvs_fitted = tvs.fit(train_df)

# evaluation
for ds_name, ds in {"Train": train_df, "Test": test_df}.items():
    r2_val = reg_eval.evaluate(tvs_fitted.transform(ds))
    print("{} data: R2 = {}, log(R2) = {}".format(ds_name, r2_val, np.log(r2_val)))

Train data: R2 = 0.808916040160577, log(R2) = -0.21206014955837327
Test data: R2 = 0.7933432591458462, log(R2) = -0.23149928953461726


In [13]:
# this cell is specific to the regressor, hence can't be called back by other regressor types
# as per bug, getParam() does not work and java object below needs to be accessed.
best_reg = tvs_fitted.bestModel.stages[2]._java_obj.getRegParam()
print("Best regularization param = {}".format(best_reg))

best_en = tvs_fitted.bestModel.stages[2]._java_obj.getElasticNetParam()
print("Best elastic net param = {}".format(best_en))

Best regularization param = 100.0
Best elastic net param = 1.0


# CART (Decision Trees for regression)

In [14]:
from pyspark.ml.regression import DecisionTreeRegressor

dtr_model = DecisionTreeRegressor(featuresCol="scaledFeatures", labelCol="SalePrice")
pipe = Pipeline()
pipe.setStages([*feature_transformers, dtr_model])

# check that the pipe works
pipe_fitted = pipe.fit(train_df)
print("R2 for train data = {}".format(reg_eval.evaluate(pipe_fitted.transform(train_df))))
print("R2 for test data = {}".format(reg_eval.evaluate(pipe_fitted.transform(test_df))))

R2 for train data = 0.8597456165696578
R2 for test data = 0.6546036932394312


In [15]:
# param grid
grid = ParamGridBuilder().addGrid(dtr_model.maxDepth, [4, 5, 6, 7])\
                         .addGrid(dtr_model.maxBins, [24, 32, 48, 64]).build()
%rerun 12

=== Executing: ===
# this cell is isolated so it can be called back again stand-alone to derive performance
import numpy as np

#tvs
tvs = TrainValidationSplit().setTrainRatio(0.8)\
                            .setEstimatorParamMaps(grid)\
                            .setEstimator(pipe)\
                            .setEvaluator(reg_eval)
tvs_fitted = tvs.fit(train_df)

# evaluation
for ds_name, ds in {"Train": train_df, "Test": test_df}.items():
    r2_val = reg_eval.evaluate(tvs_fitted.transform(ds))
    print("{} data: R2 = {}, log(R2) = {}".format(ds_name, r2_val, np.log(r2_val)))
=== Output: ===
Train data: R2 = 0.9330720194788542, log(R2) = -0.06927288981822431
Test data: R2 = 0.5731148550822822, log(R2) = -0.5566691371840694


# Random Forests

In [16]:
from pyspark.ml.regression import RandomForestRegressor

rf_model = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="SalePrice")
pipe = Pipeline()
pipe.setStages([*feature_transformers, rf_model])

# check that the pipe works
pipe_fitted = pipe.fit(train_df)
print("R2 for train data = {}".format(reg_eval.evaluate(pipe_fitted.transform(train_df))))
print("R2 for test data = {}".format(reg_eval.evaluate(pipe_fitted.transform(test_df))))

R2 for train data = 0.8910637983284904
R2 for test data = 0.7932206390576011


In [17]:
# param grid
grid = ParamGridBuilder().addGrid(rf_model.maxDepth, [4, 5, 6, 7])\
                         .addGrid(rf_model.maxBins, [24, 32, 48, 64])\
                         .addGrid(rf_model.numTrees, [2, 3, 4, 5]).build()
%rerun 12

=== Executing: ===
# this cell is isolated so it can be called back again stand-alone to derive performance
import numpy as np

#tvs
tvs = TrainValidationSplit().setTrainRatio(0.8)\
                            .setEstimatorParamMaps(grid)\
                            .setEstimator(pipe)\
                            .setEvaluator(reg_eval)
tvs_fitted = tvs.fit(train_df)

# evaluation
for ds_name, ds in {"Train": train_df, "Test": test_df}.items():
    r2_val = reg_eval.evaluate(tvs_fitted.transform(ds))
    print("{} data: R2 = {}, log(R2) = {}".format(ds_name, r2_val, np.log(r2_val)))
=== Output: ===
Train data: R2 = 0.9033355676481256, log(R2) = -0.10166118036834987
Test data: R2 = 0.7613624849701452, log(R2) = -0.2726457073664021


# Generating a submission

We will use the best estimator to form a prediction of the price of the houses. It will demonstrate how to use the pipeline to predict the output

In [18]:
test_raw_df = spark.read.format("csv").option("header", "true")\
                                   .option("inferSchema", "true")\
                                   .option("path", "../data/extra-data/kaggle-housing/test.csv")\
                                   .load().coalesce(5)
for col_name in test_raw_df.columns:
    test_raw_df = test_raw_df.withColumnRenamed(col_name, fix_number(col_name))
test_raw_df = test_raw_df.select(list(set(train_df.columns) - set(['SalePrice'])))

# correct for schema inteference mistakes
for col_name, col_type in test_raw_df.dtypes:
    if col_type == 'string':
        # that's an error in the schema inference, cast to int
        test_raw_df = test_raw_df.select(*[x for x in test_raw_df.columns if x != col_name], col(col_name).cast('int'))
# ensure filling of nulls
test_raw_df = test_raw_df.fillna(0)
test_raw_df.printSchema()

root
 |-- WoodDeckSF: integer (nullable = true)
 |-- MoSold: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- GrLivArea: integer (nullable = true)
 |-- EnclosedPorch: integer (nullable = true)
 |-- Fireplaces: integer (nullable = true)
 |-- PoolArea: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- KitchenAbvGr: integer (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- ScreenPorch: integer (nullable = true)
 |-- BedroomAbvGr: integer (nullable = true)
 |-- ThreeSsnPorch: integer (nullable = true)
 |-- HalfBath: integer (nullable = true)
 |-- OpenPorchSF: integer (nullable = true)
 |-- FullBath: integer (nullable = true)
 |-- MiscVal: integer (nullable = true)
 |-- TwondFlrSF: integer (nullable = true)
 |-- OnestFlrSF: integer (nullable = true)
 |-- YrSold: integer (nullable = true)
 |-- Id: integer (nullable = true)
 |-- LowQualFinSF: in

In [19]:
%rerun 10
%rerun 11
%rerun 12

=== Executing: ===
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

# estimator pipeline
lr_model = LinearRegression(featuresCol="scaledFeatures", labelCol="SalePrice", standardization=False, maxIter=100)
pipe = Pipeline()
pipe.setStages([*feature_transformers, lr_model])

# evaluator
reg_eval = RegressionEvaluator(labelCol="SalePrice", metricName="r2")

# check that the pipe works
pipe_fitted = pipe.fit(train_df)
print("R2 for train data = {}".format(reg_eval.evaluate(pipe_fitted.transform(train_df))))
print("R2 for test data = {}".format(reg_eval.evaluate(pipe_fitted.transform(test_df))))
=== Output: ===
R2 for train data = 0.8089736349555801
R2 for test data = 0.7926925891268555
=== Executing: ===
# param grid
grid = ParamGridBuilder().addGrid(lr_model.regParam, [0., 1e-3, 0.1, 1., 10., 100.])\
                        

In [20]:
submission_df = tvs_fitted.transform(test_raw_df).select(col("Id"), col("prediction").alias("SalePrice"))
submission_df.write.format("csv")\
             .mode("overwrite").option("sep", ",")\
             .save("../data/extra-data/kaggle-housing/test_submission.csv")