## 예측 모델 정확도 올리기 위해 다른 변수들을 추가해서 모델 학습

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Memory setting applied to both the executor and the driver.
MAX_MEMORY = "5g"

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediction-expands")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/04/21 18:13:18 WARN Utils: Your hostname, devkhk-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
22/04/21 18:13:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/21 18:13:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Base directory that holds the parquet datasets (absolute path, trailing slash).
data_dir = "/Users/devkhk/Documents/data-engineering-study/data/"


In [4]:
# Load the train / test parquet datasets.
# data_dir is absolute (it starts with "/"), so the scheme prefix must be
# "file://" to form the canonical file:///Users/... URI; prefixing
# "file:///" produced four consecutive slashes.
train_df = spark.read.parquet(f"file://{data_dir}train-review/")
test_df = spark.read.parquet(f"file://{data_dir}test-review/")

                                                                                

In [5]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [6]:
# Split the columns into categorical and numeric features and build the
# matching preprocessing stages for the pipeline.
stages = []

cat_features = [
    "pickup_location_id",
    "dropoff_location_id",
    "hour",
    "day_of_week",
]

num_features = ["passenger_count", "trip_distance"]

# Categorical column: index the values, then one-hot encode the index.
for cat_col in cat_features:
    indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_idx")
    indexer = indexer.setHandleInvalid("keep")
    encoder = OneHotEncoder(
        inputCol=indexer.getOutputCol(),
        outputCol=f"{cat_col}_onehot",
    )
    stages.extend([indexer, encoder])

# Numeric column: wrap it in a one-element vector column, then feed that
# vector column to the scaler.
for num_col in num_features:
    to_vector = VectorAssembler(
        inputCols=[num_col],
        outputCol=f"{num_col}_vector",
    )
    scaler = StandardScaler(
        inputCol=to_vector.getOutputCol(),
        outputCol=f"{num_col}_std",
    )
    stages.extend([to_vector, scaler])

In [7]:
# Final feature vector: every one-hot column first, then every scaled
# numeric column, assembled into a single "features" column.
assembler = [f"{c}_onehot" for c in cat_features]
assembler += [f"{n}_std" for n in num_features]
vassembler = VectorAssembler(inputCols=assembler, outputCol="features")
stages.append(vassembler)

## Hyperparameter Tuning

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

In [11]:
# Tuning on the full dataset takes too long, so cross-validate on a ~10%
# sample instead.
# Sample WITHOUT replacement: with replacement the same row can be drawn more
# than once, and the copies can land on both the training and the validation
# side of a cross-validation fold, which skews the fold metrics.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)

In [12]:
# Print the column names and types of the sampled DataFrame.
toy_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- total_amount: double (nullable = true)



In [13]:
# Base estimator used during the hyperparameter search; the label column is
# total_amount.
lr = LinearRegression(labelCol="total_amount", solver="normal", maxIter=30)

# Preprocessing stages followed by the regressor. This builds a new list, so
# `stages` itself still contains only the preprocessing steps.
cv_stages = [*stages, lr]

In [18]:
# Search grid: 5 elasticNetParam values x 5 regParam values = 25 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
    .build()
)

In [19]:
# Pipeline used for tuning: preprocessing stages plus the linear regression.
cv_pipeline = Pipeline(stages=cv_stages)

In [21]:
# 5-fold cross-validation of the tuning pipeline over the parameter grid,
# scored by a regression evaluator against the total_amount label.
cv_evaluator = RegressionEvaluator(labelCol="total_amount")
cv = CrossValidator(
    numFolds=5,
    evaluator=cv_evaluator,
    estimatorParamMaps=param_grid,
    estimator=cv_pipeline,
)

In [22]:
# Run the cross-validated grid search on the sampled data.
cv_model = cv.fit(toy_df)

22/04/21 18:55:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/21 18:55:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/04/21 18:55:43 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/04/21 18:55:43 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [23]:
# Read the selected hyperparameters from the last stage of the best pipeline
# (the fitted linear regression). The public param getters are used instead
# of reaching through the private `_java_obj` handle.
best_lr_model = cv_model.bestModel.stages[-1]
alpha = best_lr_model.getElasticNetParam()
reg_parm = best_lr_model.getRegParam()

In [29]:
# tuning

In [30]:
# Final estimator, configured with the hyperparameters chosen by the
# cross-validated search (the duplicated `lr = lr =` assignment is removed).
lr = LinearRegression(
    maxIter=30,
    elasticNetParam=alpha,
    regParam=reg_parm,
    solver='normal',
    labelCol="total_amount",
)

In [31]:
# Preprocessing-only pipeline: `stages` holds the indexers, encoders, scalers
# and the final assembler, but not the regressor.
pipeline = Pipeline(stages=stages)

In [34]:
# Fit the preprocessing pipeline on the full training data.
transformer = pipeline.fit(train_df)

                                                                                

In [36]:
# Apply the fitted preprocessing to the training data (adds the "features" column).
vtrain_df = transformer.transform(train_df)

In [38]:
# Train the tuned linear regression on the preprocessed training data.
model = lr.fit(vtrain_df)

                                                                                

In [39]:
# Apply the preprocessing fitted on the training data to the test data.
# (The original referenced the undefined name `transformerformer`.)
vtest_df = transformer.transform(test_df)

In [41]:
# Score the preprocessed test data with the trained model.
predictions = model.transform(vtest_df)

In [42]:
# Print the schema of the scored DataFrame, including the intermediate feature columns.
predictions.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- pickup_location_id_idx: double (nullable = false)
 |-- pickup_location_id_onehot: vector (nullable = true)
 |-- dropoff_location_id_idx: double (nullable = false)
 |-- dropoff_location_id_onehot: vector (nullable = true)
 |-- hour_idx: double (nullable = false)
 |-- hour_onehot: vector (nullable = true)
 |-- day_of_week_idx: double (nullable = false)
 |-- day_of_week_onehot: vector (nullable = true)
 |-- passenger_count_vector: vector (nullable = true)
 |-- passenger_count_std: vector (nullable = true)
 |-- trip_distance_vector: vector (nullable = true)
 |-- trip_distance_std: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)

In [45]:
# Show the actual fare next to the predicted fare for the test rows.
predictions.select(["total_amount", "prediction"]).show()

+------------+------------------+
|total_amount|        prediction|
+------------+------------------+
|        12.3|14.522882130200006|
|       23.15|19.761763709280743|
|        16.3|16.895246125594852|
|         5.8| 9.051188847590547|
|        65.3|   46.011215010444|
|        13.3| 47.20738990668044|
|        17.8| 49.01962796688548|
|        76.3| 68.24258234983881|
|        17.3|19.743503253461622|
|        24.3|27.466475017600615|
|       27.35| 25.72265257589921|
|       32.75|29.928878940094307|
|         8.8|12.294447521574881|
|        12.8|16.950455884490687|
|        15.8|17.564126318254218|
|       20.75|20.505773970449674|
|        20.3| 22.09136579049914|
|       19.56| 17.82913196009853|
|        24.3|23.004838152456742|
|         8.3|13.235918828146922|
+------------+------------------+
only showing top 20 rows



In [46]:
# RMSE from the model's summary.
# NOTE(review): `model` was fitted on vtrain_df, so this summary appears to
# describe the training data rather than `predictions` on the test set —
# confirm, and evaluate `predictions` separately if a test metric is wanted.
model.summary.rootMeanSquaredError

5.624055358189599

In [47]:
# R^2 from the model's summary.
# NOTE(review): like the RMSE above, this appears to be a training-set
# figure (the model was fitted on vtrain_df), not a test-set score — confirm.
model.summary.r2

0.8101430588034768