In [1]:
from pyspark.sql import SparkSession

In [2]:
# Spark session for the taxi-fare model. The executor and the driver are each
# given MAX_MEMORY.
# Parquet compression codec selection (Spark's default is "snappy"): the string
# "none" disables compression. It is passed as an explicit string rather than
# Python None, because how `config()` treats a None value is version-dependent.
MAX_MEMORY = "5g"
spark = (
    SparkSession.builder.appName("taxi-fare-prediction")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.sql.parquet.compression.codec", "none")
    .getOrCreate()
)

22/04/09 17:09:45 WARN Utils: Your hostname, devkhk-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
22/04/09 17:09:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/09 17:09:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Root directory under which the train/ and test/ parquet folders are stored.
# Kept without a trailing slash so that paths built as f"{data_dir}/train/"
# do not contain a doubled "//".
data_dir = "/Users/devkhk/Documents/data-engineering-study/data"

In [5]:
# Load the previously saved train and test splits from parquet.
train_path = f"{data_dir}/train/"
test_path = f"{data_dir}/test/"
train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

                                                                                

In [7]:
# toy_df: the training data is too large to work with directly, so take a
# seeded sample of it (fraction 0.1, without replacement).
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)

In [9]:
# Print the schema of the full training frame and of the sample, in that order.
for frame in (train_df, toy_df):
    frame.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [11]:
# One-hot encoding of the categorical columns. Each column is first mapped to
# an index by StringIndexer and then expanded by OneHotEncoder
# (author's illustration: Wednesday -> 4 -> [0,0,0,1,0,0,0]).

from pyspark.ml.feature import OneHotEncoder, StringIndexer

cat_feats = ["pickup_location_id", "dropoff_location_id", "day_of_week"]

# Ordered list of pipeline stages; later cells append to it.
stages = []

for col_name in cat_feats:
    idx_col = col_name + "_idx"
    # handleInvalid="keep" is set on the indexer for every categorical column.
    indexer = StringIndexer(inputCol=col_name, outputCol=idx_col).setHandleInvalid("keep")
    encoder = OneHotEncoder(inputCols=[idx_col], outputCols=[col_name + "_onehot"])
    stages.extend([indexer, encoder])

In [12]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

num_feats = ["passenger_count", "trip_distance", "pickup_time"]

# Each numeric column is wrapped into its own single-column vector
# ("<name>_vector") and then passed through a StandardScaler ("<name>_scaled").
for feat in num_feats:
    vector_col = feat + "_vector"
    to_vector = VectorAssembler(inputCols=[feat], outputCol=vector_col)
    scaler = StandardScaler(inputCol=vector_col, outputCol=feat + "_scaled")
    stages.extend([to_vector, scaler])

In [14]:
# Combine all one-hot encoded categorical columns and all scaled numeric
# columns into the single "feature_vector" column, as the last preprocessing stage.
onehot_cols = [c + "_onehot" for c in cat_feats]
scaled_cols = [n + "_scaled" for n in num_feats]
assembler_input = onehot_cols + scaled_cols
assembler = VectorAssembler(inputCols=assembler_input, outputCol="feature_vector")
stages.append(assembler)

## Hyperparameter Tuning

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Base estimator for the hyperparameter search: reads the assembled
# "feature_vector" column and predicts "total_amount".
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=30,
)

# Stage list for cross-validation: the preprocessing stages followed by the
# regressor. This is a new list, so `stages` itself is left unchanged.
cv_stages = [*stages, lr]

In [17]:
# Pipeline used for cross-validation: preprocessing stages plus the regressor (cv_stages).
cv_pipeline = Pipeline(stages=cv_stages)

In [21]:
# Search grid: 5 values of elasticNetParam x 5 values of regParam for `lr`.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
    .build()
)

In [22]:
# Create the CrossValidator instance that will run the search: it fits
# cv_pipeline for every parameter combination in param_grid using 5 folds and
# scores each with a RegressionEvaluator on "total_amount" (evaluator metric
# left at its default).
# A fixed seed makes the fold assignment, and therefore the selected
# hyperparameters, reproducible across runs.
cross_val = CrossValidator(
    estimator=cv_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(labelCol="total_amount"),
    numFolds=5,
    seed=1,
)

In [23]:
# Run the cross-validated grid search on the sampled data (toy_df) rather than the full training set.
cv_model = cross_val.fit(toy_df)

22/04/09 18:22:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/09 18:22:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/04/09 18:22:30 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/04/09 18:22:30 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [26]:
# Read the selected hyperparameters from the last stage of the best pipeline
# (the fitted regression model) through its public getters rather than the
# private `_java_obj` handle.
best_lr_model = cv_model.bestModel.stages[-1]
alpha = best_lr_model.getElasticNetParam()
reg_param = best_lr_model.getRegParam()

## Training

In [27]:
# Fit only the preprocessing stages (the regressor is not part of `stages`)
# on the full training set.
# Note: transform_stages refers to the same list object as `stages`, not a copy.
transform_stages = stages
pipeline = Pipeline(stages=transform_stages)
fitted_transformer = pipeline.fit(train_df)

                                                                                

In [28]:
# Apply the fitted preprocessing pipeline to the training data.
vtrain_df = fitted_transformer.transform(train_df)

In [29]:
from pyspark.ml.regression import LinearRegression

# Final regressor, configured with the hyperparameters selected by the
# cross-validation step (alpha -> elasticNetParam, reg_param -> regParam).
# Note: this rebinds the name `lr` used earlier for the search estimator.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
    regParam=reg_param,
    elasticNetParam=alpha,
)

In [30]:
# Train the final regression model on the preprocessed full training set.
model = lr.fit(vtrain_df)

                                                                                

In [31]:
# Preprocess the test data with the pipeline that was fitted on train_df.
vtest_df = fitted_transformer.transform(test_df)

In [32]:
# Score the preprocessed test data; the output includes a "prediction" column.
predictions = model.transform(vtest_df)

In [33]:
# Cache the predictions DataFrame for reuse by the cells below.
predictions.cache()

DataFrame[passenger_count: int, pickup_location_id: int, dropoff_location_id: int, trip_distance: double, pickup_time: int, day_of_week: string, total_amount: double, pickup_location_id_idx: double, pickup_location_id_onehot: vector, dropoff_location_id_idx: double, dropoff_location_id_onehot: vector, day_of_week_idx: double, day_of_week_onehot: vector, passenger_count_vector: vector, passenger_count_scaled: vector, trip_distance_vector: vector, trip_distance_scaled: vector, pickup_time_vector: vector, pickup_time_scaled: vector, feature_vector: vector, prediction: double]

In [34]:
# Show actual vs. predicted fare next to the day and trip distance.
shown_cols = ["day_of_week", "trip_distance", "total_amount", "prediction"]
predictions.select(*shown_cols).show()


[Stage 3046:>                                                       (0 + 1) / 1]

+-----------+-------------+------------+------------------+
|day_of_week|trip_distance|total_amount|        prediction|
+-----------+-------------+------------+------------------+
|   Thursday|          1.6|        12.3|14.905032148096899|
|   Saturday|          3.3|       23.15|20.335917682834957|
|  Wednesday|          4.1|        16.3| 16.11540300827887|
|   Thursday|          0.4|         5.8| 8.136376385141844|
|   Thursday|         15.4|        65.3| 47.26759901943578|
|     Friday|          3.8|        13.3|  46.6271718097099|
|     Friday|          4.6|        17.8|  48.1340994114685|
|  Wednesday|         15.2|        76.3| 66.94334757308079|
|     Sunday|          3.5|        17.3| 18.84901665167066|
|     Monday|          6.3|        24.3|26.543356292003317|
|   Saturday|          5.6|       27.35| 24.55224258636062|
|    Tuesday|          7.6|       32.75|30.046050392502785|
|  Wednesday|          0.1|         8.8| 12.37799176709074|
|  Wednesday|          2.0|        12.8|


                                                                                

In [35]:
# RMSE taken from the model's summary.
# NOTE(review): `model.summary` appears to be the summary produced when fitting
# on vtrain_df, i.e. a training-set metric rather than a score on test_df —
# confirm; `model.evaluate(vtest_df)` would give the held-out figure.
model.summary.rootMeanSquaredError

5.652536196100418

In [36]:
# R^2 taken from the model's summary.
# NOTE(review): `model.summary` appears to be the summary produced when fitting
# on vtrain_df, i.e. a training-set metric rather than a score on test_df — confirm.
model.summary.r2

0.8082177057003137

In [None]:
# Save the model that took a long time to train, so it can be loaded later
# instead of being retrained.
# write().overwrite() keeps this cell re-runnable: a plain save() raises when
# the target path already exists.
model_dir = "/Users/devkhk/Documents/data-engineering-study/data/model"
model.write().overwrite().save(model_dir)

In [None]:
from pyspark.ml.regression import LinearRegressionModel
# Reload the persisted regression model from model_dir.
lr_model = LinearRegressionModel.load(model_dir)