In [1]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("spark_sql_basic2")
sc   = SparkContext(conf=conf)

# RDD만을 이용한 데이터 추출

In [2]:
movies_rdd = sc.parallelize([
    (1, ("어벤져스", "마블")),
    (2, ("슈퍼맨", "DC")),
    (3, ("배트맨", "DC")),
    (4, ("겨울왕국", "디즈니")),
    (5, ("아이언맨", "마블"))
])


attendances_rdd = sc.parallelize([
    (1, (13934592, "KR")),
    (2, (2182227,"KR")),
    (3, (4226242, "KR")),
    (4, (10303058, "KR")),
    (5, (4300365, "KR"))
])

In [3]:
# 마블 영화 중 관객 수가 500만 이상인 영화를 가져오기

# CASE1. join 먼저, filter 나중에
movie_att = movies_rdd.join(attendances_rdd)
movie_att.take(5)

[(2, (('슈퍼맨', 'DC'), (2182227, 'KR'))),
 (4, (('겨울왕국', '디즈니'), (10303058, 'KR'))),
 (1, (('어벤져스', '마블'), (13934592, 'KR'))),
 (3, (('배트맨', 'DC'), (4226242, 'KR'))),
 (5, (('아이언맨', '마블'), (4300365, 'KR')))]

In [4]:
movie_att.filter(
    lambda x : x[1][0][1] == "마블" and x[1][1][0] > 5000000
).collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [5]:
# CASE 2. filter 먼저, join 나중에
filtered_movies = movies_rdd.filter(lambda x : x[1][1] == '마블')
filtered_att = attendances_rdd.filter(lambda x : x[1][0] > 5000000)

filtered_movies.join(filtered_att).collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

choice CASE2 > 초반에 필터를 하고 join을 하는게 더 효율적임.

In [6]:
sc.stop()

# Spark SQL 사용해 보기

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()

In [8]:
# 컬럼 추가
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]

In [9]:
#스키마를 알아야 한다.
movie_schema = ["id", "name", "company", "year", "month", "day"]

## 데이터 프레임 만들기

In [10]:
df = spark.createDataFrame(data=movies, schema=movie_schema)

In [11]:
df.dtypes

[('id', 'bigint'),
 ('name', 'string'),
 ('company', 'string'),
 ('year', 'bigint'),
 ('month', 'bigint'),
 ('day', 'bigint')]

In [12]:
df.select("name").show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [13]:
df.show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [14]:
df.filter(df.year >= 2010).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



In [15]:
df.select('year','month','day').show()

+----+-----+---+
|year|month|day|
+----+-----+---+
|2012|    4| 26|
|2013|    6| 13|
|2008|    8|  6|
|2014|    1| 16|
|2008|    4| 30|
+----+-----+---+



In [16]:
df.filter(df.year > 2008).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  4|겨울왕국| 디즈니|2014|    1| 16|
+---+--------+-------+----+-----+---+



In [17]:
df.filter(df.company == '마블').show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



## 뷰 만들기

In [18]:
df.createOrReplaceTempView("movies")
# PySpark에서 DataFrame을 SQL 테이블처럼 쓰기 위해

In [22]:
# 영화 이름만 가져오기

query = """

SELECT name
  FROM movies

"""
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [24]:
# 2010 년 이후에 개봉한 영화 조회

query = '''
SELECT NAME,YEAR FROM MOVIES WHERE YEAR > 2010
'''

spark.sql(query).show()

+--------+----+
|    NAME|YEAR|
+--------+----+
|어벤져스|2012|
|  슈퍼맨|2013|
|겨울왕국|2014|
+--------+----+



In [27]:
# like 문자열 데이터에서 특정 단어나 문장을 포함한 데이터를 찾을 때
# % 기호를 사용해서 문장이 매칭되는지 확인 가능!
# 제목이 ~~맨으로 끝나는 데이터의 모든 정보를 조회

query = '''
SELECT * FROM MOVIES WHERE NAME LIKE"%맨"
'''

spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [28]:
# BETWEEN 특정 데이터와 데이터 사이를 조회

# 개봉 월이 4 ~ 8월 사이. 4 <= 개봉월 <= 8

query = '''
SELECT * FROM MOVIES WHERE MONTH BETWEEN 4 AND 8
'''

spark.sql(query).show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [29]:
# Join 구현하기

attendances = [
    (1, 13934592., "KR"),
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

In [30]:
# 직접 스키마 지정해 보기
from pyspark.sql.types import StringType, FloatType\
    , IntegerType\
    , StructType, StructField

In [31]:
att_schema = StructType([ # 모든 컬럼의 타입을 통칭 - 컬럼 데이터의 집합
    StructField("id", IntegerType(), True), # StructField : 컬럼
    StructField("att", FloatType(), True),
    StructField("theater_country", StringType(), True)
])

In [32]:
att_df = spark.createDataFrame(
    data=attendances,
    schema=att_schema
)

att_df.dtypes

[('id', 'int'), ('att', 'float'), ('theater_country', 'string')]

In [33]:
att_df.createOrReplaceTempView("att")

In [35]:
att_df.show()

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [38]:
# join

query = '''
SELECT MOVIES.ID, MOVIES.NAME, MOVIES.COMPANY, ATT.ATT
FROM MOVIES
JOIN ATT ON MOVIES.ID = ATT.ID

'''

spark.sql(query).show()

+---+--------+-------+-----------+
| ID|    NAME|COMPANY|        ATT|
+---+--------+-------+-----------+
|  1|어벤져스|   마블|1.3934592E7|
|  2|  슈퍼맨|     DC|  2182227.0|
|  3|  배트맨|     DC|  4226242.0|
|  4|겨울왕국| 디즈니|1.0303058E7|
|  5|아이언맨|   마블|  4300365.0|
+---+--------+-------+-----------+



In [39]:
spark.stop()

# SQL최적화

In [41]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("trip_count_sql").getOrCreate()

In [42]:
trip_file = "learning_spark_data/fhvhv_tripdata_2020-03.csv"

In [43]:
# inferSchema : 자동으로 스키마 예측하게 하기
data = spark.read.csv(trip_file, inferSchema=True, header=True)

In [44]:
data.createOrReplaceTempView("mobility_data")

In [45]:
query = """
select *
from mobility_data
limit 5
"""
spark.sql(query).show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   NULL|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   NULL|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   NULL|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   NULL|
+-----------------+--------------------+-------------------+-------------------+

## 스파크 SQL을 사용하는 이유

In [46]:
query = """

select split(pickup_datetime, ' ')[0] as pickup_date, count(*) as trips
from mobility_data

group by pickup_date
"""

spark.sql(query).show()

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



In [47]:
# 실행 계획 살펴보기
spark.sql(query).explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['split('pickup_datetime,  )[0] AS pickup_date#382, 'count(1) AS trips#383]
+- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [split(cast(pickup_datetime#309 as string),  , -1)[0]], [split(cast(pickup_datetime#309 as string),  , -1)[0] AS pickup_date#382, count(1) AS trips#383L]
+- SubqueryAlias mobility_data
   +- View (`mobility_data`, [hvfhs_license_num#307,dispatching_base_num#308,pickup_datetime#309,dropoff_datetime#310,PULocationID#311,DOLocationID#312,SR_Flag#313])
      +- Relation [hvfhs_license_num#307,dispatching_base_num#308,pickup_datetime#309,dropoff_datetime#310,PULocationID#311,DOLocationID#312,SR_Flag#313] csv

== Optimized Logical Plan ==
Aggregate [_groupingexpression#387], [_groupingexpression#387 AS pickup_date#382, count(1) AS trips#383L]
+- Project [split(cast(pickup_datetime#309 as string),  , -1)[0] AS _groupingexpression#387]
   +- Rel

**Optimized Logical Plan**

Spark Catalyst가 최적화한 로직.

중간 컬럼명을 _groupingexpression#387 이런 내부 표현으로 치환해서 중복 제거

, 필요 없는 컬럼 제거 등의 최적화를 수행.

In [48]:
# 두번째 쿼리
spark.sql("""select 
                pickup_date, 
                count(*) as trips
             from ( select
                          split(pickup_datetime, ' ')[0] as pickup_date
                          from mobility_data )
             group by pickup_date""").explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['pickup_date, 'count(1) AS trips#391]
+- 'SubqueryAlias __auto_generated_subquery_name
   +- 'Project ['split('pickup_datetime,  )[0] AS pickup_date#390]
      +- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [pickup_date#390], [pickup_date#390, count(1) AS trips#391L]
+- SubqueryAlias __auto_generated_subquery_name
   +- Project [split(cast(pickup_datetime#309 as string),  , -1)[0] AS pickup_date#390]
      +- SubqueryAlias mobility_data
         +- View (`mobility_data`, [hvfhs_license_num#307,dispatching_base_num#308,pickup_datetime#309,dropoff_datetime#310,PULocationID#311,DOLocationID#312,SR_Flag#313])
            +- Relation [hvfhs_license_num#307,dispatching_base_num#308,pickup_datetime#309,dropoff_datetime#310,PULocationID#311,DOLocationID#312,SR_Flag#313] csv

== Optimized Logical Plan ==
Aggregate [pickup_date#390], [pickup_date#390, count(1) AS tri

| 단계                     | 설명                              |
| ---------------------- | ------------------------------- |
| Parsed Logical Plan    | 구문 해석만 한 단계                     |
| Analyzed Logical Plan  | 컬럼 존재 확인, 데이터 타입 분석             |
| Optimized Logical Plan | 필요 없는 컬럼 제거, 내부 연산 단순화          |
| Physical Plan          | 실제 실행 방식 결정 (파일 읽기, 파티셔닝, 집계 등) |


**🔍 두 쿼리의 공통 목적**
    
둘 다 목적은 pickup_datetime에서 날짜(pickup_date)를 추출한 뒤, 
    
해당 날짜별로  **트립 수(trips)** 를 집계하는 것입니다.

| 항목                                     | 첫 번째 쿼리                                                  | 두 번째 쿼리                                               |
| -------------------------------------- | -------------------------------------------------------- | ----------------------------------------------------- |
| 날짜 추출 위치                               | `'split(pickup_datetime, " ")[0]`를 `Aggregate` 안에서 직접 사용 | 먼저 `Project`로 `pickup_date` 컬럼 생성한 후 `Aggregate`에서 사용 |
| **Alias 이름**                           | `pickup_date#382` (split 바로 사용)                          | `pickup_date#390` (프로젝션에서 만든 컬럼)                      |
| **Parsed Logical Plan**                | 날짜 파싱 연산이 `Aggregate` 안에 있음                              | 날짜 파싱은 `Project`, 집계는 그 위에서                           |
| **SubqueryAlias 이름**                   | `mobility_data` 직접 사용                                    | `__auto_generated_subquery_name` (서브쿼리 감쌈)            |
| **쿼리 재사용성**                            | 직접 표현 → 다소 비효율                                           | 컬럼 분리 → 재사용성과 최적화 용이                                  |
| **실제 처리 구조 (Optimized/Physical Plan)** | 날짜 추출 → 그룹화 → 집계                                         | 날짜 추출 → 그룹화 → 집계 (거의 동일)                              |

**최적화 측면 비교**
| 측면        | 첫 번째 쿼리     | 두 번째 쿼리                   |
| --------- | ----------- | ------------------------- |
| 중복 연산 제거  | ❌ split 반복됨 | ✅ 한번만 계산                  |
| 최적화 플랜 구조 | 다소 중첩됨      | Project → Aggregate 명확 분리 |
| 재사용성      | 떨어짐         | 높음                        |
| 가독성       | 낮음          | 높음                        |

In [51]:
spark.stop()

# EDA

In [52]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("data_eda").getOrCreate()

In [54]:
trip_file = "learning_spark_data/fhvhv_tripdata_2020-03.csv"
trip_data = spark.read.csv(trip_file, inferSchema=True, header=True)

In [57]:
zone_file = "learning_spark_data/taxi+_zone_lookup.csv"
zone_data = spark.read.csv(zone_file, inferSchema=True, header=True)

In [58]:
trip_data.createOrReplaceTempView("trip_data")
zone_data.createOrReplaceTempView("zone_data")

In [59]:
trip_data.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: integer (nullable = true)



In [60]:
zone_data.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



## 승차 Location(PULocationID)별 개수 세기

In [61]:
query = '''
SELECT * FROM TRIP_DATA
'''

spark.sql(query).show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   NULL|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   NULL|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   NULL|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   NULL|
|           HV0003|              B02682|2020-03-01 00:17:23|2020-03-01 00:39:35|

In [63]:
query = '''
SELECT PULocationID, COUNT(*) AS count
FROM trip_data
GROUP BY PULocationID
'''
spark.sql(query).show()

+------------+------+
|PULocationID| count|
+------------+------+
|         148|116205|
|         243| 87431|
|          31|  5285|
|         137| 85552|
|          85| 46120|
|         251|  9080|
|          65| 66622|
|         255|113947|
|          53| 17571|
|         133| 27200|
|          78| 76155|
|         108| 20378|
|         155| 39527|
|         211| 61075|
|         193| 20111|
|          34| 11823|
|         115| 10806|
|         126| 52833|
|         101|  8983|
|          81| 41425|
+------------+------+
only showing top 20 rows



## 하차 Location(DOLocationID)별 개수 세기

In [64]:
query = '''
SELECT DOLocationID, COUNT(*) AS count
FROM trip_data
GROUP BY DOLocationID
'''
spark.sql(query).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         148| 91601|
|         243| 86795|
|          31|  5526|
|          85| 44509|
|         137| 80098|
|         251|  8525|
|          65| 58888|
|         255|105051|
|          53| 19013|
|         133| 27760|
|          78| 74447|
|         155| 42239|
|         108| 21354|
|         211| 54176|
|         193| 19104|
|          34| 12392|
|         115|  9809|
|         101|  7218|
|         126| 59027|
|          81| 38445|
+------------+------+
only showing top 20 rows



## HV0003 운송사업자의 승차 지역별 트립 건수를 집계하고, 

## 가장 많은 운송사업자순으로 정렬하는 분석 쿼리  hvfhs_license_num

In [66]:
query = '''
SELECT hvfhs_license_num, PULocationID, COUNT(*) AS count
FROM trip_data
WHERE hvfhs_license_num = 'HV0003'
GROUP BY hvfhs_license_num, PULocationID
ORDER BY count desc
'''
spark.sql(query).show()

+-----------------+------------+------+
|hvfhs_license_num|PULocationID| count|
+-----------------+------------+------+
|           HV0003|          61|163091|
|           HV0003|          76|134198|
|           HV0003|         132|114179|
|           HV0003|          79|112017|
|           HV0003|          37|110150|
|           HV0003|          42|108070|
|           HV0003|         138|104119|
|           HV0003|         244| 97324|
|           HV0003|          89| 95724|
|           HV0003|          39| 94484|
|           HV0003|         231| 94155|
|           HV0003|           7| 92676|
|           HV0003|          17| 90352|
|           HV0003|         161| 90261|
|           HV0003|         225| 88749|
|           HV0003|         234| 88372|
|           HV0003|         230| 86870|
|           HV0003|         188| 84347|
|           HV0003|          35| 82764|
|           HV0003|         168| 82396|
+-----------------+------------+------+
only showing top 20 rows



## 운송사별 운행 건수 비교

In [70]:
query = '''
SELECT hvfhs_license_num, COUNT(*) AS COUNT
FROM TRIP_DATA
GROUP BY hvfhs_license_num
'''
spark.sql(query).show()

+-----------------+-------+
|hvfhs_license_num|  COUNT|
+-----------------+-------+
|           HV0004| 336606|
|           HV0005|3219535|
|           HV0003|9836763|
+-----------------+-------+



## 승차 위치 Borough별 운행 건수

In [69]:
zone_data.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [72]:
query = '''
SELECT Borough, COUNT(*) AS COUNT
FROM ZONE_DATA
GROUP BY Borough
'''
spark.sql(query).show()

+-------------+-----+
|      Borough|COUNT|
+-------------+-----+
|       Queens|   69|
|          EWR|    1|
|      Unknown|    2|
|     Brooklyn|   61|
|Staten Island|   20|
|    Manhattan|   69|
|        Bronx|   43|
+-------------+-----+



## 서비스 존별 승차/하차 건수

In [79]:
# JOIN. 승차
joined_df = trip_data.join(zone_data, trip_data['PULocationID'] == zone_data['LocationID'])
joined_df.select('PULocationID','service_zone').groupby('service_zone').count().show()

+------------+-------+
|service_zone|  count|
+------------+-------+
|         EWR|    362|
|         N/A|    845|
| Yellow Zone|4025190|
|    Airports| 319610|
|   Boro Zone|9046897|
+------------+-------+



In [80]:
# JOIN. 하차
joined_df = trip_data.join(zone_data, trip_data['DOLocationID'] == zone_data['LocationID'])
joined_df.select('DOLocationID','service_zone').groupby('service_zone').count().show()

+------------+-------+
|service_zone|  count|
+------------+-------+
|         EWR|  65066|
|         N/A| 387759|
| Yellow Zone|3643787|
|    Airports| 411156|
|   Boro Zone|8885136|
+------------+-------+

