### 1. PySpark 시작 및 데이터셋 로딩
+ App name: shuffling-and-partitioning

In [4]:
# [+] PySpark 시작
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
# Run Spark in local mode; the app name matches the one declared in this
# section's heading.
conf = SparkConf().setMaster("local").setAppName("shuffling-and-partitioning")
sc = SparkContext(conf=conf)

In [6]:
# Dataset directory and file name; the two are concatenated as-is when the
# file is loaded, so `path` must end with a path separator.
path = "./data/"
filename = "fhvhv_tripdata_2020-03_short.csv"

In [8]:
# [+] Load the dataset and extract the header
# textFile() gives an RDD with one element per line of the CSV file;
# first() returns the first line, which is the column header here.
lines = sc.textFile(f"{path}{filename}")
header = lines.first()

In [9]:
header

'hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag'

In [10]:
# [+] Drop the header: filter()
# `header` is bound as a default argument so the predicate keeps the value it
# has right now. filter() is evaluated lazily and this notebook reassigns the
# global `header` in section 3; a plain closure would then compare rows
# against the wrong header if an action on this RDD ran afterwards.
filtered_lines = lines.filter(lambda row, header=header: row != header)

*filtered_lines 데이터*
```
HV0005,B02510,2020-03-01 00:03:40,2020-03-01 00:23:39,81,159, 
HV0005,B02510,2020-03-01 00:28:05,2020-03-01 00:38:57,168,119,
HV0003,B02764,2020-03-01 00:03:07,2020-03-01 00:15:04,137,209,1
HV0003,B02764,2020-03-01 00:18:42,2020-03-01 00:38:42,209,80,
HV0003,B02764,2020-03-01 00:44:24,2020-03-01 00:58:44,256,226,
...
```

In [None]:
# [+] Extract the pickup date (stored as a K-V RDD): map()
# pickup_datetime is the third CSV column, formatted "YYYY-MM-DD HH:MM:SS".
# Keep only the date part as the key and pair it with 1 so that every trip
# contributes one count to its day.
dates = filtered_lines.map(lambda row: (row.split(",")[2].split(" ")[0], 1))

In [None]:
# Print the first element (first() is an action, so this triggers a job)
dates.first()

---

### 2. 일별 승차횟수 계산

#### 2.1 reduceByKey()

In [None]:
# [+] Compute the number of pickups per day with reduceByKey()
# Assumes every value in `dates` is a per-trip count (1), so summing the
# values of each key yields that day's number of trips.
reduced = dates.reduceByKey(lambda left, right: left + right)

```%%time```: 셀 단위 실행 시간 측정(셀 최상단에 작성)



In [None]:
%%time

# Measure processing time: collect() is the action that actually runs the job
reduced.collect()

---

#### 2.2 groupByKey()

In [None]:
# [+] Group the records by day with groupByKey()
# Each key (date) is paired with an iterable of all values recorded for it.
groups = dates.groupByKey()

In [None]:
# Print the first element (a key paired with its grouped values)
groups.first()

In [None]:
# [+] Compute the number of pickups per day with mapValues()
# The size of each key's group is the number of records for that day.
counts = groups.mapValues(lambda trips: len(trips))

```%time```: 문장 단위 실행 시간 측정(문장 앞에 작성)

In [None]:
# [+] Measure processing time
%time counts.collect()

---

### 3. RDD 파티셔닝(Partitioning)

In [11]:
# Load the dataset (this rebinds `path`, `filename` and `lines` from section 1)
path = "./data/"
filename = "id_and_name.csv"
lines = sc.textFile(f"{path}{filename}")

In [12]:
# [+] Print 10 elements of lines
lines.take(10)

['id,name',
 '201958114,최창성',
 '202058062,유예지',
 '202122035,이원모',
 '202158050,맹서희',
 '202158083,신동민',
 '201755009,권진용',
 '201755068,윤여준',
 '201855056,김성식',
 '201855095,강진엽']

In [13]:
# [+] Drop the header: first(), filter()
# The first line is the column header; keep every line that differs from it.
header = lines.first()
filtered_lines = lines.filter(lambda line: line != header)

In [14]:
# Number of partitions (used below as both the partition count and the modulus)
n = 5

#### 3.1 K-V RDD로 변환
*filtered_lines*
```
['201958114,최창성',
 '202058062,유예지',
 '202122035,이원모',
 '202158050,맹서희',
 '202158083,신동민',
 ...
```

*kv_lines*
```
[['201958114', '최창성'],
 ['202058062', '유예지'],
 ['202122035', '이원모'],
 ['202158050', '맹서희'],
 ['202158083', '신동민'],
 ...
```


In [16]:
# Convert to a K-V RDD: each "id,name" line becomes [id, name]
kv_lines = filtered_lines.map(lambda line: line.split(","))
kv_lines.take(5)

[['201958114', '최창성'],
 ['202058062', '유예지'],
 ['202122035', '이원모'],
 ['202158050', '맹서희'],
 ['202158083', '신동민']]

#### 3.2 Hash Partitioning
key(x)에 대한 해시 함수를 정의하여 파티셔닝을 수행

In [18]:
# Partition with a modulo hash function: partitionBy()
# Keys are numeric strings, so each record is routed by int(key) % n.
parted = kv_lines.partitionBy(n, partitionFunc=lambda key: int(key) % n)

In [19]:
# [+] Print the contents of each partition of parted
# glom() gathers the elements of every partition into a list, so the
# collected result holds one list per partition.
parted.glom().collect()

[[('202122035', '이원모'),
  ('202158050', '맹서희'),
  ('201855095', '강진엽'),
  ('201955005', '김보석'),
  ('201855035', '편해수'),
  ('202055040', '신현종'),
  ('202155100', '김도훈'),
  ('201752030', '박창준'),
  ('201855040', '오승준'),
  ('201955010', '황민태'),
  ('202055005', '조흥진')],
 [('201855056', '김성식'),
  ('202055046', '정진욱'),
  ('201955096', '정원석'),
  ('202055066', '윤혜진'),
  ('202255061', '권오형'),
  ('201955006', '김지혜'),
  ('201955026', '이신우'),
  ('201955036', '김승재'),
  ('202155111', '김찬용')],
 [('202058062', '유예지'),
  ('202055032', '김미현'),
  ('201855017', '차정현'),
  ('202155002', '이예은'),
  ('201955017', '장유찬'),
  ('201955037', '최호진'),
  ('201955042', '이우용'),
  ('201955082', '신민섭'),
  ('202055012', '이진용'),
  ('201655017', '김태균'),
  ('201755062', '김민종'),
  ('202155057', '강유진'),
  ('202155107', '임영진'),
  ('202167047', '응우옌 쩐 까오 탄 ')],
 [('202158083', '신동민'),
  ('201755068', '윤여준'),
  ('201955063', '이재형'),
  ('201955068', '최한결'),
  ('201855003', '김기준'),
  ('202155043', '이주헌'),
  ('202155098', '김태훈'),
  ('2

In [24]:
# Change the number of partitions: repartition()
# Redistributes the records of `parted` across 10 partitions.
reparted = parted.repartition(numPartitions=10)


In [25]:
# [+] Print the contents of each partition of reparted
reparted.glom().collect()

[[],
 [('202122035', '이원모'),
  ('202158050', '맹서희'),
  ('201855095', '강진엽'),
  ('201955005', '김보석'),
  ('201855035', '편해수'),
  ('202055040', '신현종'),
  ('202155100', '김도훈'),
  ('201752030', '박창준'),
  ('201855040', '오승준'),
  ('201955010', '황민태')],
 [('202055005', '조흥진'),
  ('202058062', '유예지'),
  ('202055032', '김미현'),
  ('201855017', '차정현'),
  ('202155002', '이예은'),
  ('201955017', '장유찬'),
  ('201955037', '최호진'),
  ('201955042', '이우용'),
  ('201955082', '신민섭'),
  ('202055012', '이진용'),
  ('201655017', '김태균'),
  ('202158083', '신동민'),
  ('201755068', '윤여준'),
  ('201955063', '이재형'),
  ('201955068', '최한결'),
  ('201855003', '김기준'),
  ('202155043', '이주헌'),
  ('202155098', '김태훈'),
  ('201752013', '전수복'),
  ('201752038', '이준석'),
  ('201852033', '함민규')],
 [('201855056', '김성식'),
  ('202055046', '정진욱'),
  ('201955096', '정원석'),
  ('202055066', '윤혜진'),
  ('202255061', '권오형'),
  ('201955006', '김지혜'),
  ('201955026', '이신우'),
  ('201955036', '김승재'),
  ('202155111', '김찬용'),
  ('201755062', '김민종'),
  ('20215

In [26]:
# Change the number of partitions: coalesce()
# Merges the partitions of `reparted` down to 3.
combined = reparted.coalesce(numPartitions=3)

In [27]:
# [+] Print the contents of each partition of combined
combined.glom().collect()

[[('202122035', '이원모'),
  ('202158050', '맹서희'),
  ('201855095', '강진엽'),
  ('201955005', '김보석'),
  ('201855035', '편해수'),
  ('202055040', '신현종'),
  ('202155100', '김도훈'),
  ('201752030', '박창준'),
  ('201855040', '오승준'),
  ('201955010', '황민태'),
  ('201955014', '이현우'),
  ('201955024', '정우기'),
  ('201955049', '한우진'),
  ('201955054', '이승재'),
  ('201755054', '조영준'),
  ('202055079', '하수진'),
  ('202155004', '강혜경'),
  ('202155054', '이규도'),
  ('202155084', '이은수'),
  ('202155099', '이종호')],
 [('202155109', '염건호'),
  ('202055005', '조흥진'),
  ('202058062', '유예지'),
  ('202055032', '김미현'),
  ('201855017', '차정현'),
  ('202155002', '이예은'),
  ('201955017', '장유찬'),
  ('201955037', '최호진'),
  ('201955042', '이우용'),
  ('201955082', '신민섭'),
  ('202055012', '이진용'),
  ('201655017', '김태균'),
  ('202158083', '신동민'),
  ('201755068', '윤여준'),
  ('201955063', '이재형'),
  ('201955068', '최한결'),
  ('201855003', '김기준'),
  ('202155043', '이주헌'),
  ('202155098', '김태훈'),
  ('201752013', '전수복'),
  ('201752038', '이준석'),
  ('201852033',

#### 3.3 Range Partitioning
+ Range partitioning 수행을 위해 RDD 객체를 DataFrame으로 변환
+ DataFrame 생성을 위해 SparkSession 사용

In [28]:
# Import SparkSession and create the session object
from pyspark.sql import SparkSession

# getOrCreate() returns an already-running session if there is one;
# otherwise it builds a new one with the settings given here.
spark = (
    SparkSession.builder
    .appName("RepartitionByRangeExample")
    .getOrCreate()
)

In [29]:
# Create a DataFrame from the hash-partitioned K-V RDD, naming the two
# fields of each record "id" and "name".
df = spark.createDataFrame(parted, schema=["id", "name"])

In [30]:
# Print the DataFrame (show() displays only the first 20 rows by default)
df.show()

+---------+------+
|       id|  name|
+---------+------+
|202122035|이원모|
|202158050|맹서희|
|201855095|강진엽|
|201955005|김보석|
|201855035|편해수|
|202055040|신현종|
|202155100|김도훈|
|201752030|박창준|
|201855040|오승준|
|201955010|황민태|
|202055005|조흥진|
|201855056|김성식|
|202055046|정진욱|
|201955096|정원석|
|202055066|윤혜진|
|202255061|권오형|
|201955006|김지혜|
|201955026|이신우|
|201955036|김승재|
|202155111|김찬용|
+---------+------+
only showing top 20 rows



In [31]:
# Range partitioning: repartitionByRange()
# Splits the rows into 3 partitions by ranges of the "id" column.
# NOTE: this rebinds `reparted`, which previously held an RDD, to a DataFrame.
reparted = df.repartitionByRange(3, "id")

파티션별 RDD 값 출력
+ ```.rdd```: DataFrame으로부터 RDD 객체 획득

In [32]:
# Print the contents of each partition of reparted's underlying RDD
# (.rdd exposes the DataFrame as an RDD of Row objects)
reparted.rdd.glom().collect()

[[Row(id='201855095', name='강진엽'),
  Row(id='201955005', name='김보석'),
  Row(id='201855035', name='편해수'),
  Row(id='201752030', name='박창준'),
  Row(id='201855040', name='오승준'),
  Row(id='201955010', name='황민태'),
  Row(id='201855056', name='김성식'),
  Row(id='201955006', name='김지혜'),
  Row(id='201855017', name='차정현'),
  Row(id='201655017', name='김태균'),
  Row(id='201755062', name='김민종'),
  Row(id='201755068', name='윤여준'),
  Row(id='201855003', name='김기준'),
  Row(id='201752013', name='전수복'),
  Row(id='201752038', name='이준석'),
  Row(id='201852033', name='함민규'),
  Row(id='201755018', name='김지훈'),
  Row(id='201755033', name='백준열'),
  Row(id='201855008', name='박제민'),
  Row(id='201955008', name='김진우'),
  Row(id='201755009', name='권진용'),
  Row(id='201855039', name='조익범'),
  Row(id='201955009', name='윤명규'),
  Row(id='201955014', name='이현우'),
  Row(id='201755054', name='조영준')],
 [Row(id='202055040', name='신현종'),
  Row(id='202055005', name='조흥진'),
  Row(id='202055046', name='정진욱'),
  Row(id='201955096