In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("taxi-fare-prediction")\
            .config("spark.executor.memory", MAX_MEMORY)\
            .config("spark.driver.memory", MAX_MEMORY)\
            .getOrCreate()

22/04/07 18:24:35 WARN Utils: Your hostname, devkhk-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
22/04/07 18:24:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/07 18:24:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
trip_files = "/Users/devkhk/Documents/data-engineering-study/data/trips/*"

In [6]:
trips_df = spark.read.csv(f"file:///{trip_files}", inferSchema=True, header=True)

                                                                                

In [7]:
trips_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [8]:
trips_df.createOrReplaceTempView("trips")

In [9]:
query = """
SELECT
    trip_distance,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""

In [11]:
data_df = spark.sql(query)

In [12]:
data_df.createOrReplaceTempView("data")

In [13]:
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         16.5|       70.07|
|         1.13|       11.16|
|         2.68|       18.59|
|         12.4|        43.8|
|          9.7|        32.3|
|          9.3|       43.67|
|         9.58|        46.1|
|         16.2|        45.3|
|         3.58|        19.3|
|         0.91|        14.8|
|         2.57|        12.8|
|          0.4|         5.3|
|         3.26|        17.3|
|        13.41|       47.25|
|         18.3|       61.42|
|         1.53|       14.16|
|          2.0|        11.8|
|         16.6|       54.96|
|         15.5|       56.25|
|          1.3|        16.8|
+-------------+------------+
only showing top 20 rows



In [14]:
data_df.describe().show()



+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          13126040|          13126040|
|   mean|2.8820930920520915|17.973158757890285|
| stddev| 3.820306480671185|12.975904680786682|
|    min|              0.01|              0.01|
|    max|             475.5|            4973.3|
+-------+------------------+------------------+



                                                                                

In [17]:
# 학습 데이터와 테스트 데이터를 나눈다.

train_df, test_df = data_df.randomSplit([.8, .2], seed=1)

In [18]:
print(train_df.count())
print(test_df.count())

                                                                                

10500253




2625787


                                                                                

In [19]:
# 트레이닝이 가능한 상태로 만들기 위한 모듈
# 특정 칼럼을 트레이닝이 가능할 수 있도록 features 벡터 열로 만들어 주는 모듈!
from pyspark.ml.feature import VectorAssembler

In [20]:
vassembler = VectorAssembler(inputCols=["trip_distance"], outputCol="features")

In [21]:
vtrain_df = vassembler.transform(train_df)

In [22]:
# 스칼라였던 trip_distance를 features 벡터열로 새롭게 만들어 준다.
vtrain_df.show()

[Stage 12:>                                                         (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|        3.05|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



                                                                                

In [23]:
from pyspark.ml.regression import LinearRegression

In [24]:
lr = LinearRegression(
        maxIter=50,
        labelCol="total_amount",
        featuresCol='features'
)

In [25]:
# 모델 학습
model = lr.fit(vtrain_df)

22/04/07 22:03:38 WARN Instrumentation: [d53e1496] regParam is zero, which might cause numerical instability and overfitting.
22/04/07 22:03:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/07 22:03:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/04/07 22:03:49 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [26]:
# 테스트용 df

vtest_df = vassembler.transform(test_df)

In [27]:
# 예측
prediction =  model.transform(vtest_df)

In [28]:
prediction.show()

[Stage 17:>                                                         (0 + 1) / 1]

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.8|  [0.01]|9.430440745

                                                                                

In [30]:
# 예측 모델 평가
model.summary.rootMeanSquaredError

6.30781413196623

In [31]:
model.summary.r2

0.7648633777017714

In [33]:
from pyspark.sql.types import DoubleType
distance_list = [1.1, 5.5, 10.5, 30.0]
distance_df = spark.createDataFrame(distance_list, DoubleType()).toDF('trip_distance')

In [34]:
distance_df.show()

                                                                                

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



In [35]:
vditance_df = vassembler.transform(distance_df)

In [36]:
vditance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [37]:
model.transform(vditance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.672809485363317|
|          5.5|   [5.5]|25.761270454374163|
|         10.5|  [10.5]| 40.63452155552285|
|         30.0|  [30.0]| 98.64020085000274|
+-------------+--------+------------------+



In [38]:
spark.stop()