## 호선을 기준으로 월별 지하철 이용객 수

### 1. 데이터 전처리 과정

In [1]:
import pyspark
import numpy as np
import pandas as pd

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

# Attach to the active Spark session, or start a new one if none exists.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the "metro_per_date" Parquet dataset and preview its first rows.
source_path = 'metro_per_date'
pq = spark.read.parquet(source_path)
pq.show()

+----------+------+------------+------------+------------+--------+
|  사용일자|노선명|        역명|승차총승객수|하차총승객수|이용객수|
+----------+------+------------+------------+------------+--------+
|2018-01-10|분당선|압구정로데오|       16346|       18420|   34766|
|2018-01-10|분당선|      서울숲|        7914|        7922|   15836|
|2018-01-10|분당선|        수원|        9461|        6280|   15741|
|2018-01-10|분당선|        수서|       15403|       15627|   31030|
|2018-01-10|분당선|  대모산입구|        3431|        3182|    6613|
|2018-01-10|분당선|      개포동|        3582|        3723|    7305|
|2018-01-10|분당선|        구룡|        2105|        2005|    4110|
|2018-01-10|분당선|        도곡|        7855|        8147|   16002|
|2018-01-10|분당선|        한티|       16276|       17214|   33490|
|2018-01-10|분당선|        선릉|       10792|       22141|   32933|
|2018-01-10|과천선|        범계|       32318|       30961|   63279|
|2018-01-10|과천선|        평촌|       19691|       19258|   38949|
|2018-01-10|과천선|      인덕원|       28817|       28564|   57381|
|2018-01-10|

In [4]:
# List every distinct line name present in the raw data.
(
    pq.select(pq["노선명"])
    .distinct()
    .collect()
)

[Row(노선명='일산선'),
 Row(노선명='장항선'),
 Row(노선명='경부선'),
 Row(노선명='우이신설선'),
 Row(노선명='분당선'),
 Row(노선명='7호선'),
 Row(노선명='수인선'),
 Row(노선명='안산선'),
 Row(노선명='4호선'),
 Row(노선명='1호선'),
 Row(노선명='경의선'),
 Row(노선명='3호선'),
 Row(노선명='경강선'),
 Row(노선명='경춘선'),
 Row(노선명='9호선'),
 Row(노선명='6호선'),
 Row(노선명='5호선'),
 Row(노선명='8호선'),
 Row(노선명='공항철도'),
 Row(노선명='과천선'),
 Row(노선명='경인선'),
 Row(노선명='중앙선'),
 Row(노선명='2호선'),
 Row(노선명='경원선')]

In [5]:
# Normalise line names: each name on the left is rewritten to the name on
# the right; names that are not listed are left unchanged.
line_name_map = {
    "일산선": "3호선",
    "장항선": "1호선",
    "경부선": "1호선",
    "안산선": "4호선",
    "경인선": "1호선",
    "경원선": "1호선",
    "과천선": "4호선",
    "수인선": "수인분당선",
    "분당선": "수인분당선",
    "경의선": "경의중앙선",
    "중앙선": "경의중앙선",
}
# A single replace() pass instead of eleven chained withColumn/when steps.
# No replacement value is itself a key, so one pass gives the same result as
# applying the rules one after another.
pq = pq.replace(line_name_map, subset=["노선명"])

In [6]:
# Check the result of the line-name normalisation.
(
    pq.select(pq["노선명"])
    .distinct()
    .collect()
)

[Row(노선명='우이신설선'),
 Row(노선명='7호선'),
 Row(노선명='4호선'),
 Row(노선명='1호선'),
 Row(노선명='3호선'),
 Row(노선명='경강선'),
 Row(노선명='경춘선'),
 Row(노선명='9호선'),
 Row(노선명='6호선'),
 Row(노선명='경의중앙선'),
 Row(노선명='5호선'),
 Row(노선명='8호선'),
 Row(노선명='공항철도'),
 Row(노선명='2호선'),
 Row(노선명='수인분당선')]

In [10]:
# Save the data for all lines as Parquet.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# the write raises an error once the "all_line" path already exists.
pq.write.mode("overwrite").parquet("all_line")

In [11]:
# Drop the columns that the monthly boarding analysis does not use.
unused_columns = ["하차총승객수", "역명", "이용객수"]
pq = pq.drop(*unused_columns)
pq.show()

+----------+----------+------------+
|  사용일자|    노선명|승차총승객수|
+----------+----------+------------+
|2018-01-10|수인분당선|       16346|
|2018-01-10|수인분당선|        7914|
|2018-01-10|수인분당선|        9461|
|2018-01-10|수인분당선|       15403|
|2018-01-10|수인분당선|        3431|
|2018-01-10|수인분당선|        3582|
|2018-01-10|수인분당선|        2105|
|2018-01-10|수인분당선|        7855|
|2018-01-10|수인분당선|       16276|
|2018-01-10|수인분당선|       10792|
|2018-01-10|     4호선|       32318|
|2018-01-10|     4호선|       19691|
|2018-01-10|     4호선|       28817|
|2018-01-10|     4호선|       16093|
|2018-01-10|     4호선|        4888|
|2018-01-10|     4호선|        2297|
|2018-01-10|     4호선|        2089|
|2018-01-10|     4호선|        9809|
|2018-01-10|     4호선|        4848|
|2018-01-10|     4호선|        9162|
+----------+----------+------------+
only showing top 20 rows



In [12]:
# Convert the Spark DataFrame (all columns) into a pandas DataFrame.
df = pq.toPandas()
df

Unnamed: 0,사용일자,노선명,승차총승객수
0,2018-01-10,수인분당선,16346
1,2018-01-10,수인분당선,7914
2,2018-01-10,수인분당선,9461
3,2018-01-10,수인분당선,15403
4,2018-01-10,수인분당선,3431
...,...,...,...
809438,2019-12-31,9호선,3525
809439,2019-12-31,9호선,38032
809440,2019-12-31,공항철도,15568
809441,2019-12-31,공항철도,2488


In [13]:
# Rename the usage-date column; the frame is modified in place.
column_renames = {"사용일자": "월간사용일자"}
df.rename(columns=column_renames, inplace=True)
df

Unnamed: 0,월간사용일자,노선명,승차총승객수
0,2018-01-10,수인분당선,16346
1,2018-01-10,수인분당선,7914
2,2018-01-10,수인분당선,9461
3,2018-01-10,수인분당선,15403
4,2018-01-10,수인분당선,3431
...,...,...,...
809438,2019-12-31,9호선,3525
809439,2019-12-31,9호선,38032
809440,2019-12-31,공항철도,15568
809441,2019-12-31,공항철도,2488


In [14]:
# The usage-date column holds strings, so convert it to a datetime dtype.
usage_dates = pd.to_datetime(df['월간사용일자'])
df['월간사용일자'] = usage_dates
df.dtypes

월간사용일자    datetime64[ns]
노선명               object
승차총승객수             int32
dtype: object

### 2. 데이터 분석과정

In [15]:
# Keep only the rows whose line name is Line 1 ("1호선").
first = df[df["노선명"].eq("1호선")]
first

Unnamed: 0,월간사용일자,노선명,승차총승객수
31,2018-01-10,1호선,2715
32,2018-01-10,1호선,2069
33,2018-01-10,1호선,1653
34,2018-01-10,1호선,3763
35,2018-01-10,1호선,6811
...,...,...,...
809183,2019-12-31,1호선,2039
809184,2019-12-31,1호선,3035
809185,2019-12-31,1호선,1151
809186,2019-12-31,1호선,4975


In [16]:
# Line 1: total boardings per month, keyed by a "YY-MM" string.
# Only the boardings column is aggregated. A frame-wide .sum() would also try
# to sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
first_df = (
    first.groupby(first["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
first_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,37718857
1,18-02,34000484
2,18-03,42911210
3,18-04,41567796
4,18-05,43021069
5,18-06,39837698
6,18-07,38947613
7,18-08,37764767
8,18-09,38615609
9,18-10,42855259


In [17]:
# Save the Line 1 monthly totals as a Parquet file.
first_df.to_parquet(path="first_line.parquet")

In [18]:
# Read the saved Line 1 file back with Spark to check its contents.
# NOTE(review): this rebinds `pq`, which until now referred to the full
# dataset loaded from "metro_per_date".
parquet_path = "first_line.parquet"
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    37718857|
|       18-02|    34000484|
|       18-03|    42911210|
|       18-04|    41567796|
|       18-05|    43021069|
|       18-06|    39837698|
|       18-07|    38947613|
|       18-08|    37764767|
|       18-09|    38615609|
|       18-10|    42855259|
|       18-11|    41913321|
|       18-12|    40684701|
|       19-01|    38968076|
|       19-02|    34157858|
|       19-03|    42272671|
|       19-04|    42134644|
|       19-05|    43814344|
|       19-06|    39846569|
|       19-07|    39854041|
|       19-08|    38708227|
+------------+------------+
only showing top 20 rows



In [19]:
# Line 2: filter its rows, then total boardings per month ("YY-MM").
second = df[df["노선명"] == "2호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
second_df = (
    second.groupby(second["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
second_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,45887286
1,18-02,40241722
2,18-03,49033108
3,18-04,47273444
4,18-05,48249395
5,18-06,45485564
6,18-07,47356784
7,18-08,45610478
8,18-09,42607082
9,18-10,48343354


In [20]:
# Save the Line 2 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "second_line.parquet"
second_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    45887286|
|       18-02|    40241722|
|       18-03|    49033108|
|       18-04|    47273444|
|       18-05|    48249395|
|       18-06|    45485564|
|       18-07|    47356784|
|       18-08|    45610478|
|       18-09|    42607082|
|       18-10|    48343354|
|       18-11|    48332245|
|       18-12|    47703685|
|       19-01|    46899825|
|       19-02|    39681520|
|       19-03|    48049519|
|       19-04|    48093445|
|       19-05|    49356480|
|       19-06|    45250485|
|       19-07|    48199757|
|       19-08|    45869019|
+------------+------------+
only showing top 20 rows



In [21]:
# Line 3: filter its rows, then total boardings per month ("YY-MM").
third = df[df["노선명"] == "3호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
third_df = (
    third.groupby(third["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
third_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,19761386
1,18-02,17542544
2,18-03,21525999
3,18-04,20899990
4,18-05,21604538
5,18-06,20131725
6,18-07,20571184
7,18-08,19909354
8,18-09,19287414
9,18-10,21657022


In [22]:
# Save the Line 3 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "third_line.parquet"
third_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    19761386|
|       18-02|    17542544|
|       18-03|    21525999|
|       18-04|    20899990|
|       18-05|    21604538|
|       18-06|    20131725|
|       18-07|    20571184|
|       18-08|    19909354|
|       18-09|    19287414|
|       18-10|    21657022|
|       18-11|    21525019|
|       18-12|    20964224|
|       19-01|    20490054|
|       19-02|    17604272|
|       19-03|    21305186|
|       19-04|    21447508|
|       19-05|    22192756|
|       19-06|    20171647|
|       19-07|    21075547|
|       19-08|    20351217|
+------------+------------+
only showing top 20 rows



In [23]:
# Line 4: filter its rows, then total boardings per month ("YY-MM").
fourth = df[df["노선명"] == "4호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
fourth_df = (
    fourth.groupby(fourth["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
fourth_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,23688734
1,18-02,20940626
2,18-03,26626818
3,18-04,25962968
4,18-05,26748223
5,18-06,24767398
6,18-07,24333066
7,18-08,23350000
8,18-09,23251720
9,18-10,26409690


In [24]:
# Save the Line 4 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "fourth_line.parquet"
fourth_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    23688734|
|       18-02|    20940626|
|       18-03|    26626818|
|       18-04|    25962968|
|       18-05|    26748223|
|       18-06|    24767398|
|       18-07|    24333066|
|       18-08|    23350000|
|       18-09|    23251720|
|       18-10|    26409690|
|       18-11|    25943657|
|       18-12|    25389776|
|       19-01|    24077811|
|       19-02|    20764721|
|       19-03|    25938466|
|       19-04|    26145804|
|       19-05|    27046323|
|       19-06|    24545564|
|       19-07|    24776634|
|       19-08|    23757272|
+------------+------------+
only showing top 20 rows



In [25]:
# Line 5: filter its rows, then total boardings per month ("YY-MM").
fifth = df[df["노선명"] == "5호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
fifth_df = (
    fifth.groupby(fifth["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
fifth_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,17821981
1,18-02,15512971
2,18-03,19241969
3,18-04,18973480
4,18-05,19264987
5,18-06,18014832
6,18-07,18276688
7,18-08,17600570
8,18-09,16833874
9,18-10,19277023


In [26]:
# Save the Line 5 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "fifth_line.parquet"
fifth_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    17821981|
|       18-02|    15512971|
|       18-03|    19241969|
|       18-04|    18973480|
|       18-05|    19264987|
|       18-06|    18014832|
|       18-07|    18276688|
|       18-08|    17600570|
|       18-09|    16833874|
|       18-10|    19277023|
|       18-11|    19113547|
|       18-12|    18484813|
|       19-01|    18286102|
|       19-02|    15446519|
|       19-03|    18834177|
|       19-04|    19439027|
|       19-05|    19767409|
|       19-06|    17915051|
|       19-07|    18849460|
|       19-08|    17953795|
+------------+------------+
only showing top 20 rows



In [27]:
# Line 6: filter its rows, then total boardings per month ("YY-MM").
sixth = df[df["노선명"] == "6호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
sixth_df = (
    sixth.groupby(sixth["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
sixth_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,10132174
1,18-02,9006419
2,18-03,11496843
3,18-04,11108151
4,18-05,11567943
5,18-06,10602634
6,18-07,10632494
7,18-08,10229063
8,18-09,10185000
9,18-10,11811628


In [28]:
# Save the Line 6 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "sixth_line.parquet"
sixth_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    10132174|
|       18-02|     9006419|
|       18-03|    11496843|
|       18-04|    11108151|
|       18-05|    11567943|
|       18-06|    10602634|
|       18-07|    10632494|
|       18-08|    10229063|
|       18-09|    10185000|
|       18-10|    11811628|
|       18-11|    11286820|
|       18-12|    10938420|
|       19-01|    10439337|
|       19-02|     8930481|
|       19-03|    11313592|
|       19-04|    11276357|
|       19-05|    11707916|
|       19-06|    10596634|
|       19-07|    10779493|
|       19-08|    10275673|
+------------+------------+
only showing top 20 rows



In [29]:
# Line 7: filter its rows, then total boardings per month ("YY-MM").
seventh = df[df["노선명"] == "7호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
seventh_df = (
    seventh.groupby(seventh["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
seventh_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,20808674
1,18-02,18321937
2,18-03,22747342
3,18-04,22251238
4,18-05,22847198
5,18-06,21242126
6,18-07,21467154
7,18-08,20765977
8,18-09,20102942
9,18-10,22709436


In [30]:
# Save the Line 7 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "seventh_line.parquet"
seventh_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    20808674|
|       18-02|    18321937|
|       18-03|    22747342|
|       18-04|    22251238|
|       18-05|    22847198|
|       18-06|    21242126|
|       18-07|    21467154|
|       18-08|    20765977|
|       18-09|    20102942|
|       18-10|    22709436|
|       18-11|    22594694|
|       18-12|    21729774|
|       19-01|    21451865|
|       19-02|    18303458|
|       19-03|    22454520|
|       19-04|    22872658|
|       19-05|    23561336|
|       19-06|    21218818|
|       19-07|    22005565|
|       19-08|    20847518|
+------------+------------+
only showing top 20 rows



In [31]:
# Line 8: filter its rows, then total boardings per month ("YY-MM").
eighth = df[df["노선명"] == "8호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
eighth_df = (
    eighth.groupby(eighth["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
eighth_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,5451090
1,18-02,4772563
2,18-03,5905182
3,18-04,5793719
4,18-05,5916680
5,18-06,5523115
6,18-07,5630083
7,18-08,5445725
8,18-09,5176534
9,18-10,5973011


In [32]:
# Save the Line 8 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "eighth_line.parquet"
eighth_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|     5451090|
|       18-02|     4772563|
|       18-03|     5905182|
|       18-04|     5793719|
|       18-05|     5916680|
|       18-06|     5523115|
|       18-07|     5630083|
|       18-08|     5445725|
|       18-09|     5176534|
|       18-10|     5973011|
|       18-11|     5944278|
|       18-12|     5786609|
|       19-01|     5738365|
|       19-02|     4882939|
|       19-03|     5999691|
|       19-04|     6132745|
|       19-05|     6255062|
|       19-06|     5679380|
|       19-07|     5918725|
|       19-08|     5574809|
+------------+------------+
only showing top 20 rows



In [33]:
# Line 9: filter its rows, then total boardings per month ("YY-MM").
ninth = df[df["노선명"] == "9호선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
ninth_df = (
    ninth.groupby(ninth["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
ninth_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,9018708
1,18-02,7876626
2,18-03,9528474
3,18-04,9509410
4,18-05,9518366
5,18-06,9031818
6,18-07,9488403
7,18-08,9180575
8,18-09,8502836
9,18-10,9696979


In [34]:
# Save the Line 9 monthly totals as Parquet, then read the file back with
# Spark to check what was written.
parquet_path = "ninth_line.parquet"
ninth_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|     9018708|
|       18-02|     7876626|
|       18-03|     9528474|
|       18-04|     9509410|
|       18-05|     9518366|
|       18-06|     9031818|
|       18-07|     9488403|
|       18-08|     9180575|
|       18-09|     8502836|
|       18-10|     9696979|
|       18-11|     9716119|
|       18-12|    10618857|
|       19-01|    10588419|
|       19-02|     9004458|
|       19-03|    10914816|
|       19-04|    11359517|
|       19-05|    11410780|
|       19-06|    10560647|
|       19-07|    11344500|
|       19-08|    10820214|
+------------+------------+
only showing top 20 rows



In [35]:
# Ui-Sinseol Line: filter its rows, then total boardings per month ("YY-MM").
wooi = df[df["노선명"] == "우이신설선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
wooi_df = (
    wooi.groupby(wooi["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
wooi_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,1115984
1,18-02,1020053
2,18-03,1315328
3,18-04,1285479
4,18-05,1362035
5,18-06,1262314
6,18-07,1259765
7,18-08,1222945
8,18-09,1249766
9,18-10,1400225


In [37]:
# Save the Ui-Sinseol Line monthly totals as Parquet, then read the file
# back with Spark to check what was written.
parquet_path = "wooi_line.parquet"
wooi_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|     1115984|
|       18-02|     1020053|
|       18-03|     1315328|
|       18-04|     1285479|
|       18-05|     1362035|
|       18-06|     1262314|
|       18-07|     1259765|
|       18-08|     1222945|
|       18-09|     1249766|
|       18-10|     1400225|
|       18-11|     1366119|
|       18-12|     1301315|
|       19-01|     1263643|
|       19-02|     1102109|
|       19-03|     1402393|
|       19-04|     1403115|
|       19-05|     1469681|
|       19-06|     1333376|
|       19-07|     1347035|
|       19-08|     1282720|
+------------+------------+
only showing top 20 rows



In [39]:
# Gyeonggang Line: filter its rows, then total boardings per month ("YY-MM").
kyeongkang = df[df["노선명"] == "경강선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
kyeongkang_df = (
    kyeongkang.groupby(kyeongkang["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
kyeongkang_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,699511
1,18-02,640096
2,18-03,812367
3,18-04,801579
4,18-05,853184
5,18-06,789123
6,18-07,777704
7,18-08,775414
8,18-09,797383
9,18-10,901418


In [42]:
# Save the Gyeonggang Line monthly totals as Parquet, then read the file
# back with Spark to check what was written.
# NOTE(review): the file name has a space before "_line"; it looks like a
# typo, but confirm nothing reads this exact path before renaming it.
parquet_path = "kyeongkang _line.parquet"
kyeongkang_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|      699511|
|       18-02|      640096|
|       18-03|      812367|
|       18-04|      801579|
|       18-05|      853184|
|       18-06|      789123|
|       18-07|      777704|
|       18-08|      775414|
|       18-09|      797383|
|       18-10|      901418|
|       18-11|      886851|
|       18-12|      825054|
|       19-01|      785050|
|       19-02|      698935|
|       19-03|      859295|
|       19-04|      878212|
|       19-05|      933785|
|       19-06|      850202|
|       19-07|      860529|
|       19-08|      837487|
+------------+------------+
only showing top 20 rows



In [44]:
# Gyeongchun Line: filter its rows, then total boardings per month ("YY-MM").
kyeongchun = df[df["노선명"] == "경춘선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
kyeongchun_df = (
    kyeongchun.groupby(kyeongchun["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
kyeongchun_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,1006923
1,18-02,903428
2,18-03,1211193
3,18-04,1242069
4,18-05,1344895
5,18-06,1221239
6,18-07,1198189
7,18-08,1193829
8,18-09,1200294
9,18-10,1322133


In [45]:
# Save the Gyeongchun Line monthly totals as Parquet, then read the file
# back with Spark to check what was written.
# NOTE(review): the file name has a space before "_line"; it looks like a
# typo, but confirm nothing reads this exact path before renaming it.
parquet_path = "kyeongchun _line.parquet"
kyeongchun_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|     1006923|
|       18-02|      903428|
|       18-03|     1211193|
|       18-04|     1242069|
|       18-05|     1344895|
|       18-06|     1221239|
|       18-07|     1198189|
|       18-08|     1193829|
|       18-09|     1200294|
|       18-10|     1322133|
|       18-11|     1229829|
|       18-12|     1123454|
|       19-01|     1070089|
|       19-02|      964241|
|       19-03|     1236373|
|       19-04|     1272843|
|       19-05|     1375585|
|       19-06|     1235094|
|       19-07|     1220670|
|       19-08|     1240651|
+------------+------------+
only showing top 20 rows



In [46]:
# Gyeongui-Jungang Line: filter its rows, then total boardings per month
# ("YY-MM").
kyeongui = df[df["노선명"] == "경의중앙선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
kyeongui_df = (
    kyeongui.groupby(kyeongui["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
kyeongui_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,5514174
1,18-02,4948412
2,18-03,6426656
3,18-04,6337781
4,18-05,6625444
5,18-06,6133822
6,18-07,6036182
7,18-08,5841090
8,18-09,5999981
9,18-10,6715749


In [47]:
# Save the Gyeongui-Jungang Line monthly totals as Parquet, then read the
# file back with Spark to check what was written.
# NOTE(review): the file name has a space before "_line"; it looks like a
# typo, but confirm nothing reads this exact path before renaming it.
parquet_path = "kyeongui _line.parquet"
kyeongui_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|     5514174|
|       18-02|     4948412|
|       18-03|     6426656|
|       18-04|     6337781|
|       18-05|     6625444|
|       18-06|     6133822|
|       18-07|     6036182|
|       18-08|     5841090|
|       18-09|     5999981|
|       18-10|     6715749|
|       18-11|     6544118|
|       18-12|     6106411|
|       19-01|     5856979|
|       19-02|     5117671|
|       19-03|     6512327|
|       19-04|     6644988|
|       19-05|     6955620|
|       19-06|     6279728|
|       19-07|     6330932|
|       19-08|     6079058|
+------------+------------+
only showing top 20 rows



In [48]:
# Airport Railroad: filter its rows, then total boardings per month ("YY-MM").
airline = df[df["노선명"] == "공항철도"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
airline_df = (
    airline.groupby(airline["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
airline_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,3061504
1,18-02,2795040
2,18-03,3225826
3,18-04,3137155
4,18-05,3237594
5,18-06,3133360
6,18-07,3265937
7,18-08,3388878
8,18-09,3110974
9,18-10,3524138


In [49]:
# Save the Airport Railroad monthly totals as Parquet, then read the file
# back with Spark to check what was written.
parquet_path = "airline.parquet"
airline_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|     3061504|
|       18-02|     2795040|
|       18-03|     3225826|
|       18-04|     3137155|
|       18-05|     3237594|
|       18-06|     3133360|
|       18-07|     3265937|
|       18-08|     3388878|
|       18-09|     3110974|
|       18-10|     3524138|
|       18-11|     3445346|
|       18-12|     3422177|
|       19-01|     3429836|
|       19-02|     3055208|
|       19-03|     3584527|
|       19-04|     3600140|
|       19-05|     3736138|
|       19-06|     3550850|
|       19-07|     3715001|
|       19-08|     3761882|
+------------+------------+
only showing top 20 rows



In [50]:
# Suin-Bundang Line: filter its rows, then total boardings per month ("YY-MM").
sooin = df[df["노선명"] == "수인분당선"]
# Aggregate only the boardings column. A frame-wide .sum() would also try to
# sum the datetime and line-name columns, which raises TypeError on
# pandas >= 2.0 (older releases silently dropped those columns).
sooin_df = (
    sooin.groupby(sooin["월간사용일자"].dt.strftime("%y-%m"))["승차총승객수"]
    .sum()
    .reset_index()
)
sooin_df

Unnamed: 0,월간사용일자,승차총승객수
0,18-01,12023736
1,18-02,10587901
2,18-03,13288031
3,18-04,12827443
4,18-05,13091576
5,18-06,12249951
6,18-07,12391528
7,18-08,11937535
8,18-09,11640007
9,18-10,13141959


In [52]:
# Save the Suin-Bundang Line monthly totals as Parquet, then read the file
# back with Spark to check what was written.
parquet_path = "sooin_line.parquet"
sooin_df.to_parquet(parquet_path)
pq = spark.read.parquet(parquet_path)
pq.show()

+------------+------------+
|월간사용일자|승차총승객수|
+------------+------------+
|       18-01|    12023736|
|       18-02|    10587901|
|       18-03|    13288031|
|       18-04|    12827443|
|       18-05|    13091576|
|       18-06|    12249951|
|       18-07|    12391528|
|       18-08|    11937535|
|       18-09|    11640007|
|       18-10|    13141959|
|       18-11|    13118463|
|       18-12|    12754731|
|       19-01|    12502563|
|       19-02|    10621422|
|       19-03|    13135724|
|       19-04|    13227568|
|       19-05|    13754838|
|       19-06|    12512994|
|       19-07|    12976959|
|       19-08|    12313111|
+------------+------------+
only showing top 20 rows

