In [5]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook under the application name
# "trip_count_sql"; getOrCreate() reuses an already-active session if there is one.
spark = (
    SparkSession.builder
    .appName("trip_count_sql")
    .getOrCreate()
)

In [11]:
# Folder that holds the trip data CSV.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs as-is on the original machine; adjust it when running elsewhere.
directory = "C:\\Users\\daesi\\Downloads\\빅데이터 소스코드\\소스코드\\study_spark\\data"

# Name of the CSV file inside `directory` that the next cell loads.
trip_file = "fhvhv_tripdata_2020-03.csv"

In [12]:
# Load the trip CSV from the local filesystem into a DataFrame.
#   header=True      -> take the column names from the first row of the file
#   inferSchema=True -> let Spark infer the column types automatically
trip_path = f"file:///{directory}\\{trip_file}"
data = spark.read.csv(trip_path, header=True, inferSchema=True)

불러온 데이터 확인하기

In [13]:
# Print the first five rows to confirm the file loaded with the expected columns.
data.show(n=5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+

SQL 사용을 위해서 임시 view를 생성

In [14]:
# Register the DataFrame as the temporary view "mobility_data" so that it can be
# queried with SQL through spark.sql().
data.createOrReplaceTempView(name="mobility_data")

In [19]:
# Sanity-check the temp view: select every column and keep only five rows.
query = """
select  *
from mobility_data
limit 5
"""

preview = spark.sql(query)
preview.show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+

# 스파크 SQL을 사용하는 이유
- 집계
- 승차 연-월-일별 카운트 구하기

In [31]:
# Count trips per pickup date. The date is the part of pickup_datetime before the
# first space (split on ' ' and take element 0); rows are grouped by that alias.
# The `query` name is kept because the explain() cell further down reuses it.
query = """
select split(pickup_datetime, ' ')[0] as pickup_date, count(*) as trips
from mobility_data

group by pickup_date
"""

daily_trips = spark.sql(query)
daily_trips.show()

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-06|872012|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-16|391518|
| 2020-03-20|261900|
| 2020-03-19|252773|
| 2020-03-17|312298|
| 2020-03-18|269232|
+-----------+------+
only showing top 20 rows



In [34]:
# Inspect the execution plan of the per-date count held in `query`;
# extended=True prints the logical plans in addition to the physical plan.
spark.sql(query).explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['split('pickup_datetime,  )[0] AS pickup_date#230, 'count(1) AS trips#231]
+- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [split(pickup_datetime#18,  , -1)[0]], [split(pickup_datetime#18,  , -1)[0] AS pickup_date#230, count(1) AS trips#231L]
+- SubqueryAlias mobility_data
   +- View (`mobility_data`, [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22])
      +- Relation [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22] csv

== Optimized Logical Plan ==
Aggregate [_groupingexpression#235], [_groupingexpression#235 AS pickup_date#230, count(1) AS trips#231L]
+- Project [split(pickup_datetime#18,  , -1)[0] AS _groupingexpression#235]
   +- Relation [hvfhs_license_num#16,dispatching_base_num#17,pickup_dateti

In [35]:
# The same per-date count, written with an explicit subquery that first projects
# pickup_date and then aggregates over it — presumably to compare its plan with
# the single-level query explained in the previous cell.
nested_query = """select pickup_date,count(*) as trips
             from ( select split(pickup_datetime, ' ')[0] as pickup_date
                    from mobility_data )
             group by pickup_date"""

spark.sql(nested_query).explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['pickup_date, 'count(1) AS trips#239]
+- 'SubqueryAlias __auto_generated_subquery_name
   +- 'Project ['split('pickup_datetime,  )[0] AS pickup_date#238]
      +- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [pickup_date#238], [pickup_date#238, count(1) AS trips#239L]
+- SubqueryAlias __auto_generated_subquery_name
   +- Project [split(pickup_datetime#18,  , -1)[0] AS pickup_date#238]
      +- SubqueryAlias mobility_data
         +- View (`mobility_data`, [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22])
            +- Relation [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22] csv

== Optimized Logical Plan ==
Aggregate [pickup_date#238], [pickup_date#238, count(1) AS trips#239L]
+- Project [split(pick

In [36]:
# Stop the Spark session created at the top of the notebook now that the work is done.
spark.stop()