In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Memory size handed to both the executor and the driver settings below.
MAX_MEMORY = "5g"

# Create the session for this notebook, or reuse one that is already running.
# NOTE(review): the recorded config reports spark.master=local[*]; in local mode
# the driver setting is presumably the one that matters - confirm.
spark = (
    SparkSession.builder
    .appName('taxi-fare-prediction-review')
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/04/13 15:27:57 WARN Utils: Your hostname, devkhk-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
22/04/13 15:27:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/13 15:27:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# List every (key, value) pair of the active Spark configuration to confirm
# that the memory settings and app name were applied.
spark.sparkContext.getConf().getAll()

[('spark.app.name', 'taxi-fare-prediction-review'),
 ('spark.app.id', 'local-1649831278848'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.port', '49283'),
 ('spark.executor.memory', '5g'),
 ('spark.driver.memory', '5g'),
 ('spark.sql.warehouse.dir',
  'file:/Users/devkhk/Documents/data-engineering-study/review/spark-warehouse'),
 ('spark.driver.host', '172.30.1.27'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.startTime', '1649831277582')]

In [4]:
# Location of the trip CSV files (glob). Defaults to the original local path;
# set the TRIPS_DIR environment variable to run the notebook against another
# copy of the data without editing this cell.
trips_dir = os.environ.get(
    "TRIPS_DIR",
    "/Users/devkhk/Documents/data-engineering-study/data/trips/*",
)

# Column names and types copied from the schema that inferSchema=True reported
# for these files. Declaring them up front avoids the extra pass over all CSVs
# that schema inference requires and keeps the types stable between runs.
trips_schema = ", ".join([
    "VendorID INT",
    "tpep_pickup_datetime STRING",
    "tpep_dropoff_datetime STRING",
    "passenger_count INT",
    "trip_distance DOUBLE",
    "RatecodeID INT",
    "store_and_fwd_flag STRING",
    "PULocationID INT",
    "DOLocationID INT",
    "payment_type INT",
    "fare_amount DOUBLE",
    "extra DOUBLE",
    "mta_tax DOUBLE",
    "tip_amount DOUBLE",
    "tolls_amount DOUBLE",
    "improvement_surcharge DOUBLE",
    "total_amount DOUBLE",
    "congestion_surcharge DOUBLE",
])

# header=True skips the header row of each file; the schema is applied by position.
taxi_fare_df = spark.read.csv(path=trips_dir, header=True, schema=trips_schema)

                                                                                

In [27]:
# Register the raw trips as the `taxi_fare` view so the SQL cells can query it,
# then print the column names and types for reference.
taxi_fare_df.createOrReplaceTempView('taxi_fare')
taxi_fare_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [30]:
# SQL over the `taxi_fare` view: keep the pickup timestamp (aliased to
# pickup_date), that timestamp truncated to the start of its month (month),
# and the passenger_count, trip_distance and total_amount columns.
query = """
SELECT
    tpep_pickup_datetime as pickup_date,
    DATE_TRUNC('MM',tpep_pickup_datetime) AS month,
    passenger_count,
    trip_distance,
    total_amount
FROM
    taxi_fare
"""

In [31]:
# Run the projection query and expose the result as the `data` view for the
# follow-up SQL cells. Uses whatever `query` currently holds, so this cell must
# run right after the cell that defines the projection query.
data_df = spark.sql(query)
data_df.createOrReplaceTempView('data')

In [32]:
# SQL over the `data` view: number of rows (trips) per month, ordered by month.
# Note that this overwrites the `query` variable used by the earlier cells.
query = """
SELECT
    month,
    COUNT(*) as trips
FROM
    data
GROUP BY
    month
ORDER BY
    month
"""

In [33]:
# Execute the current `query` (trips per month) and print the result table.
spark.sql(query).show()



+-------------------+-------+
|              month|  trips|
+-------------------+-------+
|2002-12-01 00:00:00|      1|
|2003-01-01 00:00:00|      1|
|2004-04-01 00:00:00|      1|
|2008-12-01 00:00:00|     26|
|2009-01-01 00:00:00|    111|
|2020-12-01 00:00:00|     16|
|2021-01-01 00:00:00|1369749|
|2021-02-01 00:00:00|1371688|
|2021-03-01 00:00:00|1925130|
|2021-04-01 00:00:00|2171215|
|2021-05-01 00:00:00|2507075|
|2021-06-01 00:00:00|2834204|
|2021-07-01 00:00:00|2821430|
|2021-08-01 00:00:00|     36|
|2021-09-01 00:00:00|      3|
|2021-10-01 00:00:00|      3|
|2021-11-01 00:00:00|      5|
|2021-12-01 00:00:00|      5|
|2029-05-01 00:00:00|      1|
+-------------------+-------+



                                                                                

In [34]:
# Print the column names and types of the projected DataFrame.
data_df.printSchema()

root
 |-- pickup_date: string (nullable = true)
 |-- month: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- total_amount: double (nullable = true)



In [35]:
# Print describe() summary statistics for data_df to spot outliers before filtering.
data_df.describe().show()



+-------+-------------------+------------------+-----------------+-----------------+
|summary|        pickup_date|   passenger_count|    trip_distance|     total_amount|
+-------+-------------------+------------------+-----------------+-----------------+
|  count|           15000700|          14166672|         15000700|         15000700|
|   mean|               null|1.4253783104458126|6.628629402627818|18.75545205708744|
| stddev|               null|  1.04432704905968|671.7293482115828|145.7442452805979|
|    min|2002-12-31 23:07:20|                 0|              0.0|           -647.8|
|    max|2029-05-05 08:37:39|                 9|        332541.19|         398469.2|
+-------+-------------------+------------------+-----------------+-----------------+



                                                                                

In [40]:
# SQL over the `data` view: keep trip_distance and total_amount for rows where
#   - pickup_date is on/after '2021-01-01' and before '2021-08-01',
#   - passenger_count is strictly between 0 and 5,
#   - trip_distance is strictly between 0 and 500,
#   - total_amount is strictly between 0 and 3000.
# NOTE(review): pickup_date is a string column in the printed schema, so the
# date bounds are presumably compared as text - confirm this is intended.
query = """
SELECT
    trip_distance,
    total_amount
FROM
    data
WHERE
    pickup_date >= '2021-01-01'
    AND 
    pickup_date < '2021-08-01'
    AND
    passenger_count < 5
    AND
    passenger_count > 0
    AND
    trip_distance < 500
    AND
    trip_distance > 0
    AND
    total_amount > 0
    AND
    total_amount < 3000

"""

In [41]:
# Run the filter query and expose the cleaned rows as the `comb` view.
# Uses whatever `query` currently holds, so the filter-query cell must run first.
comb_df = spark.sql(query)
comb_df.createOrReplaceTempView('comb')

In [42]:
# Print describe() summary statistics for the filtered data to check the bounds took effect.
comb_df.describe().show()



+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          13019756|          13019756|
|   mean|2.8926618732332208| 18.00666366960982|
| stddev| 3.841778343922863|12.959646686649386|
|    min|              0.01|              0.01|
|    max|             474.1|            2292.4|
+-------+------------------+------------------+



                                                                                

In [43]:
# Print the first rows of the filtered data.
comb_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         1.13|       11.16|
|         2.68|       18.59|
|         12.4|        43.8|
|          9.7|        32.3|
|          9.3|       43.67|
|         9.58|        46.1|
|         16.2|        45.3|
|         3.58|        19.3|
|         0.91|        14.8|
|         2.57|        12.8|
|          0.4|         5.3|
|         3.26|        17.3|
|        13.41|       47.25|
|         18.3|       61.42|
|         1.53|       14.16|
|          2.0|        11.8|
|         16.6|       54.96|
|         15.5|       56.25|
|          1.3|        16.8|
|         4.19|        17.8|
+-------------+------------+
only showing top 20 rows



In [44]:
from pyspark.ml.feature import VectorAssembler

In [49]:
# Assembler that packs trip_distance into a single-element `features` vector
# column. Per the PySpark docs, handleInvalid="skip" filters out rows with
# invalid values instead of raising.
vtrips = VectorAssembler(
    inputCols=['trip_distance'],
    outputCol='features',
    handleInvalid="skip",
)

In [52]:
# Add the `features` vector column to the filtered trips.
fare_df = vtrips.transform(comb_df)

In [87]:
# Echo the DataFrame repr to confirm the new `features` vector column is present.
fare_df

DataFrame[trip_distance: double, total_amount: double, features: vector]

In [53]:
from pyspark.ml.regression import LinearRegression

In [54]:
# Linear regression estimator: predicts total_amount from the `features` vector
# column ("features" is the estimator's default featuresCol, spelled out here).
lr = LinearRegression(
    featuresCol="features",
    labelCol="total_amount",
    maxIter=30,
    regParam=0.01,
)

In [56]:
# Split the data: weights 0.8 / 0.2 for train / test, with the seed fixed at 1.
train_df, test_df = fare_df.randomSplit(weights=[0.8, 0.2], seed=1)

In [57]:
# Fit the linear regression on the training split.
model = lr.fit(train_df)

                                                                                

In [58]:
# Apply the fitted model to the held-out split; adds a `prediction` column.
prediction = model.transform(test_df)

In [59]:
# Print the first test rows next to their predicted total_amount.
prediction.show()

[Stage 36:>                                                         (0 + 1) / 1]

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982186078|
|         0.01|         3.3|  [0.01]|9.419388982

                                                                                

In [77]:
# Root mean squared error taken from the model's fit summary.
# NOTE(review): per the PySpark docs, `model.summary` describes the fit on the
# training set, so this is not a held-out score; `model.evaluate(test_df)`
# would give the test-set figure - confirm which one is intended.
model.summary.rootMeanSquaredError

6.1065335697141

In [78]:
# R-squared taken from the model's fit summary.
# NOTE(review): per the PySpark docs, `model.summary` describes the fit on the
# training set, so this is not a held-out score; `model.evaluate(test_df)`
# would give the test-set figure - confirm which one is intended.
model.summary.r2

0.778036339867748

In [83]:
from pyspark.sql.types import DoubleType

# Hand-picked trip_distance values to run through the fitted model.
test_list = [31.0, 59.6, 15.0, 1.9]

# One double column built from the list, then renamed to match the
# assembler's input column.
test_data_df = (
    spark.createDataFrame(test_list, schema=DoubleType())
    .toDF('trip_distance')
)

In [84]:
# Print the ad-hoc input rows.
test_data_df.show()

+-------------+
|trip_distance|
+-------------+
|         31.0|
|         59.6|
|         15.0|
|          1.9|
+-------------+



In [85]:
# Add the `features` vector column the model expects.
# The frame is reassigned to its own name, so select only trip_distance first:
# on a re-run this drops the `features` column added by the previous run, which
# would otherwise make the assembler fail because its output column already exists.
test_data_df = vtrips.transform(test_data_df.select('trip_distance'))

In [88]:
# Print the ad-hoc rows together with their assembled `features` vectors.
test_data_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|         31.0|  [31.0]|
|         59.6|  [59.6]|
|         15.0|  [15.0]|
|          1.9|   [1.9]|
+-------------+--------+



In [89]:
# Predict total_amount for the hand-picked distances.
# Despite the name, the result is a DataFrame (it is the output of transform), not a list.
prediction_select_list = model.transform(test_data_df)

In [90]:
# Print each hand-picked distance with its predicted total_amount.
prediction_select_list.show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|         31.0|  [31.0]|  101.746154522537|
|         59.6|  [59.6]|186.95252736713323|
|         15.0|  [15.0]| 54.07825363045521|
|          1.9|   [1.9]| 15.05015977506324|
+-------------+--------+------------------+



In [91]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()