In [4]:
# [+] PySpark setup
# findspark.init() is called before the pyspark import; keep this order.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("shffuling_and_partition")
# getOrCreate() returns the already-running context if there is one, so
# re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once". The conf is only applied when
# a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
# Dataset file name; it is appended to the "./data/" directory when loaded.
filename = "fhvhv_tripdata_2020-03_short.csv"

In [11]:
# [+] Load the dataset as an RDD of text lines and pull out the header row
data_path = f"./data/{filename}"
lines = sc.textFile(data_path)
header = lines.first()  # first element of the RDD, i.e. the CSV header line

In [12]:
# Display the extracted header row (the CSV column names)
header

'hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag'

In [13]:
# [+] Remove the header with filter(): keep every line that differs from it
def _is_data_row(row):
    """Return True for any line that is not identical to the header line."""
    return row != header


filtered_lines = lines.filter(_is_data_row)

In [14]:
# [+] Extract the pickup date with map() and store it as a key-value RDD
def _to_date_count(row):
    """Map one CSV line to ``(pickup_date, 1)``.

    The third comma-separated field is the pickup datetime; the part before
    its first space is kept as the date key.
    """
    pickup_datetime = row.split(',')[2]
    pickup_date = pickup_datetime.split(" ")[0]
    return (pickup_date, 1)


dates = filtered_lines.map(_to_date_count)

In [16]:
# Show only the first element; using collect() is inefficient when the data is large
dates.first()

('2020-03-01', 1)

In [19]:
"""
  일별 승차횟수 계산: reduceByKey()
"""

# [+] Daily ride count: add up the per-ride 1s of each date with reduceByKey()
reduced = dates.reduceByKey(lambda total, count: total + count)

In [30]:
%%time  # 걸리는 시간
reduced.collect()

Wall time: 7.01 s


[('2020-03-04', 707879),
 ('2020-03-01', 784246),
 ('2020-03-03', 697880),
 ('2020-03-02', 648986),
 ('2020-03-06', 872012),
 ('2020-03-07', 418828),
 ('2020-03-05', 731165)]

In [25]:
"""
  일별 승차횟수 계산: groupByKey()
"""

# [+] Group the per-ride (date, 1) pairs by date with groupByKey()
groups = dates.groupByKey()

In [26]:
# Peek at one (date, values) pair; the values come back as an iterable object
groups.first()

('2020-03-04', <pyspark.resultiterable.ResultIterable at 0x21126b09040>)

In [27]:
# [+] Daily ride count: mapValues(len) replaces each date's group with its size
counts = groups.mapValues(len)

In [33]:
%%time
# Time the collect() of the groupByKey()-based counts
counts.collect()

Wall time: 5.69 s


[('2020-03-04', 707879),
 ('2020-03-01', 784246),
 ('2020-03-03', 697880),
 ('2020-03-02', 648986),
 ('2020-03-06', 872012),
 ('2020-03-07', 418828),
 ('2020-03-05', 731165)]

In [None]:
# partitionBy: re-partition a key-value RDD into 3 partitions, using an
# identity function on the key as the partition function
pairs = [(0, 1), (1, 2), (2, 3)]
x = sc.parallelize(pairs, 2)
y = x.partitionBy(numPartitions=3, partitionFunc=lambda key: key)
for rdd in (x, y):
    # glom() turns each partition into a list so the layout can be printed
    print(rdd.glom().collect())

In [None]:
# repartition: place the same elements into a different number of partitions

x = sc.parallelize([1, 2, 3, 4, 5], 2)
y = x.repartition(numPartitions=3)
# Collect the per-partition layouts first, then print before/after
before, after = x.glom().collect(), y.glom().collect()
print(before)
print(after)

In [None]:
# coalesce: ask for the elements of x to be held in a single partition

x = sc.parallelize([1, 2, 3, 4, 5], 2)
y = x.coalesce(numPartitions=1)

# Print the per-partition layout of the original and the coalesced RDD,
# one per line
print(x.glom().collect(), y.glom().collect(), sep="\n")