In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = '5g'
spark = SparkSession.builder.appName("taxi-fare-perdiction")\
                .config("spark.executor.memory", MAX_MEMORY)\
                .config("spark.driver.memory", MAX_MEMORY)\
                .getOrCreate()

In [3]:
data_dir = "/Users/gimhyeonjeong/data-engineering/01-spark/data/"

In [4]:
train_df = spark.read.parquet(f"{data_dir}/train/")
test_df = spark.read.parquet(f"{data_dir}/test/")

In [5]:
toy_df = train_df.sample(False, 0.1, seed = 1)

In [6]:
toy_df.printSchema()

root
 |-- passenger_count: double (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

cat_feats = [
    "pickup_location_id",
    "dropoff_location_id",
    "day_of_week"
]

stages = []

for c in cat_feats:
    cat_indexer = StringIndexer(inputCol= c, outputCol= c + "_idx").setHandleInvalid("keep")
    onehot_encoder = OneHotEncoder(inputCols=[cat_indexer.getOutputCol()], outputCols=[c + "_onehot"])
    stages += [cat_indexer, onehot_encoder]

In [8]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

num_feats = [
    "passenger_count",
    "trip_distance",
    "pickup_time"
]

for n in num_feats:
    num_assembler = VectorAssembler(inputCols=[n], outputCol= n + "_vector")
    num_scaler = StandardScaler(inputCol = num_assembler.getOutputCol(), outputCol = n + "_scaled")
    stages += [num_assembler, num_scaler]

In [9]:
assembler_inputs = [c + "_onehot" for c in cat_feats] + [n + "_scaled" for n in num_feats]
assembler = VectorAssembler(inputCols = assembler_inputs, outputCol= "feature_vector")
stages += [assembler]

## Hyperparameter Tuning

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(
    maxIter= 30,
    solver = "normal",
    labelCol = "total_amount",
    featuresCol = "feature_verctor"
)

cv_stages = stages + [lr]

In [11]:
cv_pipeline = Pipeline(stages= cv_stages)

In [12]:
param_grid = ParamGridBuilder()\
                .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])\
                .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])\
                .build()

In [13]:
cross_val = CrossValidator(estimator = cv_pipeline, 
                           estimatorParamMaps = param_grid,
                           evaluator = RegressionEvaluator(labelCol="total_amount"),
                           numFolds = 5)

In [14]:
cv_model = cross_val.fit(toy_df)

IllegalArgumentException: feature_verctor does not exist. Available: passenger_count, pickup_location_id, dropoff_location_id, trip_distance, pickup_time, day_of_week, total_amount, CrossValidator_6671ab4dea59_rand, pickup_location_id_idx, pickup_location_id_onehot, dropoff_location_id_idx, dropoff_location_id_onehot, day_of_week_idx, day_of_week_onehot, passenger_count_vector, passenger_count_scaled, trip_distance_vector, trip_distance_scaled, pickup_time_vector, pickup_time_scaled, feature_vector

## Training

In [None]:
transform_stages = stages 
pipeline = Pipeline(stages = transform_stages)
fitted_transformer = pipeline.fit(train_df)

In [None]:
vtrain_df = fitted_transformer.transform(train_df)

In [None]:
lr = LinearRegression(
    maxIter = 5,
    solver = "normal",
    labelCol = "total_amount",
    featuresCol = "feature_vector"
    elasticNetParam= alpha,
    regParam=reg_param,
)

In [None]:
vtrain_df.printSchema()

In [None]:
model = lr.fit(vtrain_df)

In [None]:
vtest_df = fitted_transformer.transform(test_df)

In [None]:
predictions = model.transform(vtest_df)

In [None]:
predictions.cache()

In [None]:
predictions.show()

In [None]:
predictions.select(["trip_distance", "day_of_week", "total_amount", "prediction"]).show()

In [None]:
model.summary.rootMeanSquaredError

In [None]:
model.summary.r2

In [None]:
model_dir = "/Users/gimhyeonjeong/data-engineering/01-spark/data/model"
model.save(model_dir)

In [None]:
from pyspark.ml.regression import LinearRegressionModel

In [None]:
lr_model = LinearRegressionModel().loda(model.dir)

In [None]:
prediction = model.tranform(vtest_df)

In [None]:
prediction.show()