In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = '5g'
spark = SparkSession.builder.appName('taxi-fare-prediction')\
                    .config('spark.executor.memory', MAX_MEMORY)\
                    .config('spark.driver.memory', MAX_MEMORY)\
                    .getOrCreate()

22/07/05 21:09:02 WARN Utils: Your hostname, dongwoo.local resolves to a loopback address: 127.0.0.1; using 192.168.55.122 instead (on interface en0)
22/07/05 21:09:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/05 21:09:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/05 21:09:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# 데이터프레임 생성

In [3]:
trip_files = '/Users/dongwoo/new_york/data/trips/*' # 모든 파일을 가져온다.
zone_file = '/Users/dongwoo/new_york/data/taxi+_zone_lookup.csv' 

In [4]:
trips_df = spark.read.parquet(f"file:///{trip_files}", inferSchema=True, header=True)
zone_df = spark.read.csv(f"file:///{zone_file}", inferSchema=True, header=True)

                                                                                

### 스키마 생성

In [5]:
trips_df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [7]:
trips_df.createOrReplaceTempView('trips')

In [8]:
query = """
SELECT
    trip_distance,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance < 500
    AND passenger_count < 5
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2022-01-01'

"""
data_df = spark.sql(query)
data_df.createOrReplaceTempView('df')

In [9]:
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          7.4|        40.8|
|         14.1|        42.8|
|         5.64|       28.56|
|          5.5|        34.3|
|          3.7|        21.3|
|          4.2|       21.96|
|         6.97|       24.05|
|          4.0|       21.35|
|          1.9|       14.16|
|         66.6|      196.35|
|         0.75|       11.16|
|          3.0|       18.96|
|         2.52|       18.48|
|         1.16|        14.3|
|         2.79|        15.3|
|         2.08|       15.99|
|         4.19|       20.16|
|          1.5|       12.95|
|         11.1|       47.15|
|         9.65|       39.96|
+-------------+------------+
only showing top 20 rows



In [10]:
data_df.describe().show()



+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          28075891|          28075891|
|   mean|3.0586407341441615|19.206020264792702|
| stddev| 4.117759204328298|14.642710918710762|
|    min|               0.0|              0.01|
|    max|             482.1|            4973.3|
+-------+------------------+------------------+



                                                                                

- 평균 3.0 마일 정도 거리를 가고 19불 정도 금액을 평균적으로 내는 것으로 보인다.

# Data Split

In [13]:
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=1)

In [16]:
print("train : ",train_df.count())
print("test : ",test_df.count())

                                                                                

train :  22461938




test :  5613953


                                                                                

In [17]:
from pyspark.ml.feature import VectorAssembler

In [20]:
vassembler = VectorAssembler(inputCols=["trip_distance"], outputCol="features")
# trip_distance -> features로 들어가게됨.

In [21]:
vtrain_df = vassembler.transform(train_df)

In [22]:
vtrain_df.show()

[Stage 19:>                                                         (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|          0.0|        0.01|   [0.0]|
|          0.0|        0.01|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
|          0.0|         0.3|   [0.0]|
+-------------+------------+--------+
only showing top 20 rows



                                                                                

# Regression Modeling

In [23]:
from pyspark.ml.regression import LinearRegression

In [25]:
lr = LinearRegression(
    maxIter=50, # 반복수
    labelCol="total_amount", # Label(Traget)
    featuresCol='features' # features
    )

In [26]:
model = lr.fit(vtrain_df)

22/07/05 21:26:32 WARN Instrumentation: [78c4eff9] regParam is zero, which might cause numerical instability and overfitting.
22/07/05 21:26:36 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/07/05 21:26:36 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/07/05 21:26:49 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [27]:
vtest_df = vassembler.transform(test_df)

In [28]:
prediction = model.transform(vtest_df)

In [29]:
prediction.show()

[Stage 24:>                                                         (0 + 1) / 1]

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|         0.3|   [0.0]|10.02494342783143|
|          0.0|        0.31|   [0.0]|10.02494342783143|
|          0.0|        0.31|   [0.0]|10.02494342783143|
|          0.0|        0.31|   [0.0]|10.02494342783143|
|          0.0|        0.31|   [0.0]|10.02494342783143|
|          0.0|        0.31|   [0.0]|10.02494342

                                                                                

## Evaluation

In [30]:
model.summary.rootMeanSquaredError 

7.917755939489453

In [31]:
model.summary.r2

0.708351522988141

- R2 : 70%가 나온것으로 봤을때, 좋은 성능은 아니지만 적당하다는 것을 알 수 있다.

In [32]:
from pyspark.sql.types import DoubleType
distance_list = [1.1, 5.5, 10.5, 30.0]
distance_df = spark.createDataFrame(distance_list, DoubleType()).toDF('trip_distance')

In [33]:
distance_df.show()

[Stage 25:>                                                         (0 + 1) / 1]                                                                                

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



In [34]:
vdistance_df = vassembler.transform(distance_df)

In [35]:
vdistance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [36]:
model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]| 13.32749535190366|
|          5.5|   [5.5]|26.537703048192583|
|         10.5|  [10.5]| 41.54930270306636|
|         30.0|  [30.0]|100.09454135707409|
+-------------+--------+------------------+



- 10마일 정도 가는데 41불으로 예측된다.
- 거리가 커질수록 예측률도 커지는 것으로 알 수 있었다.