In [1]:
from pyspark import SparkConf, SparkContext
# Build the Spark configuration: local master and an application name.
conf = SparkConf().setMaster("local").setAppName("241204_01_RDD_API")
# getOrCreate is a classmethod, so call it on the class. The previous form
# SparkContext(conf=conf).getOrCreate() always constructed a new context first,
# which raises "Cannot run multiple SparkContexts at once" when this cell is
# re-run in the same kernel. This form reuses the active context if one exists.
spark = SparkContext.getOrCreate(conf=conf)
spark

24/12/04 11:39:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Distribute an in-memory list of menu names as an RDD (the list contains repeated entries).
foods = spark.parallelize(
    [
        "짜장면", "마라탕", "짬뽕", "떡볶이", "쌀국수", "짬뽕",
        "짜장면", "짜장면", "짜장면", "라면", "우동", "라면",
    ]
)
foods

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [3]:
# Load every element into memory as a Python list (collect() is an action).
foods.collect()

                                                                                

['짜장면', '마라탕', '짬뽕', '떡볶이', '쌀국수', '짬뽕', '짜장면', '짜장면', '짜장면', '라면', '우동', '라면']

In [4]:
# Count how many times each distinct value occurs; the result is a dict-like mapping value -> count.
foods.countByValue()

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

defaultdict(int,
            {'짜장면': 4,
             '마라탕': 1,
             '짬뽕': 2,
             '떡볶이': 1,
             '쌀국수': 1,
             '라면': 2,
             '우동': 1})

In [5]:
# First 3 elements in RDD order (take() does not sort or rank, so this is not a "top 3").
foods.take(3)

['짜장면', '마라탕', '짬뽕']

In [6]:
# First element only.
foods.first()

'짜장면'

In [7]:
# Number of elements in the RDD.
foods.count()

12

In [8]:
# Remove duplicates. distinct() on its own only yields a new RDD object
# (the cell displays its repr), not the de-duplicated data.
foods.distinct()

PythonRDD[9] at RDD at PythonRDD.scala:53

In [9]:
# Chain an action (collect) to actually retrieve the de-duplicated values.
foods.distinct().collect()

[Stage 5:>                                                          (0 + 1) / 1]                                                                                

['짜장면', '마라탕', '짬뽕', '떡볶이', '쌀국수', '라면', '우동']

In [10]:
# foreach executes the given function on the worker side, once per element;
# the builtin is passed directly rather than wrapped in a lambda.
foods.foreach(print)

짜장면
마라탕
짬뽕
떡볶이
쌀국수
짬뽕
짜장면
짜장면
짜장면
라면
우동
라면


In [11]:
# Narrow operations: filter(), map(), flatMap(), sample(), union()
sample_rdd = spark.parallelize([1, 2, 3, 4, 5])
# map produces exactly one output element per input element (here: value + 2).
sample_rdd2 = sample_rdd.map(lambda value: value + 2)
sample_rdd2.collect()

[3, 4, 5, 6, 7]

In [13]:
# Sample movie titles used as input for the map / flatMap / filter examples.
movies = [
    "Iron Man",
    "The Incredible Hulk",
    "Thor",
    "Captain America: The First Avenger",
    "The Avengers",
    "Guardians of the Galaxy",
    "Ant Man",
    "Doctor Strange",
    "Black Panther",
    "Spider Man: Homecoming"
]

In [14]:
# Distribute the titles, then split each one on single spaces.
# map keeps one output per input, so every title becomes its own list of words.
moviesRDD = spark.parallelize(movies)
mapMovies = moviesRDD.map(lambda title: title.split(' '))
mapMovies.collect()

[['Iron', 'Man'],
 ['The', 'Incredible', 'Hulk'],
 ['Thor'],
 ['Captain', 'America:', 'The', 'First', 'Avenger'],
 ['The', 'Avengers'],
 ['Guardians', 'of', 'the', 'Galaxy'],
 ['Ant', 'Man'],
 ['Doctor', 'Strange'],
 ['Black', 'Panther'],
 ['Spider', 'Man:', 'Homecoming']]

In [16]:
# flatMap splits each title like map does, but flattens the per-title lists
# into a single RDD of individual words.
flatMapMovies = moviesRDD.flatMap(lambda title: title.split(" "))
flatMapMovies.collect()

['Iron',
 'Man',
 'The',
 'Incredible',
 'Hulk',
 'Thor',
 'Captain',
 'America:',
 'The',
 'First',
 'Avenger',
 'The',
 'Avengers',
 'Guardians',
 'of',
 'the',
 'Galaxy',
 'Ant',
 'Man',
 'Doctor',
 'Strange',
 'Black',
 'Panther',
 'Spider',
 'Man:',
 'Homecoming']

In [17]:
# Keep every word except the exact token 'Man'.
# The comparison is an exact match, so 'Man:' (with the colon) is retained.
filteredMovies = flatMapMovies.filter(lambda word: word != 'Man')
filteredMovies.collect()

['Iron',
 'The',
 'Incredible',
 'Hulk',
 'Thor',
 'Captain',
 'America:',
 'The',
 'First',
 'Avenger',
 'The',
 'Avengers',
 'Guardians',
 'of',
 'the',
 'Galaxy',
 'Ant',
 'Doctor',
 'Strange',
 'Black',
 'Panther',
 'Spider',
 'Man:',
 'Homecoming']

In [19]:
# Set operations on two RDDs
num1 = spark.parallelize([1, 2, 3, 4, 5])
num2 = spark.parallelize([4, 5, 6, 7, 8, 9, 10])

# intersection: values present in both RDDs
num1.intersection(num2).collect()

[4, 5]

In [20]:
# union: all elements of both RDDs; duplicates are kept (4 and 5 appear twice).
num1.union(num2).collect()

[1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10]

In [24]:
# subtract: elements of num1 that do not appear in num2.
# The result order may differ from the input order.
num1.subtract(num2).collect()

[2, 1, 3]

In [25]:
# sample(withReplacement, fraction, seed)
# Build the combined RDD that the sampling examples draw from, and show its contents.
numlist = num1.union(num2)
numlist.collect()

[1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10]

In [26]:
# Sample with replacement, fraction 0.3. No seed is given, so the result can differ between runs.
numlist.sample(withReplacement=True, fraction=0.3).collect()

[3, 6]

In [27]:
# Sample with replacement, fraction 0.5. No seed is given, so the result can differ between runs.
numlist.sample(withReplacement=True, fraction=0.5).collect()

[1, 1, 2, 7, 9, 9]

In [28]:
# Same sampling as the fraction-0.3 example, but with the seed fixed at 42 for a repeatable draw.
numlist.sample(withReplacement=True, fraction=0.3, seed=42).collect()

[6, 6, 6]

In [30]:
# Wide transformations: groupBy(), reduceByKey()
# (note: plain reduce() is an action, not a transformation)
# Re-display the contents of foods before grouping.
foods.collect()

['짜장면', '마라탕', '짬뽕', '떡볶이', '쌀국수', '짬뽕', '짜장면', '짜장면', '짜장면', '라면', '우동', '라면']

In [33]:
# groupBy: key every menu name by its first character.
foodsGroup = foods.groupBy(lambda name: name[0])
# Each collected entry is a (key, ResultIterable) pair; the grouped values are not expanded yet.
result = foodsGroup.collect()
result

[('짜', <pyspark.resultiterable.ResultIterable at 0x7f7cff0a71f0>),
 ('마', <pyspark.resultiterable.ResultIterable at 0x7f7cff10cb20>),
 ('짬', <pyspark.resultiterable.ResultIterable at 0x7f7cff10c0d0>),
 ('떡', <pyspark.resultiterable.ResultIterable at 0x7f7cff10c760>),
 ('쌀', <pyspark.resultiterable.ResultIterable at 0x7f7cff10ccd0>),
 ('라', <pyspark.resultiterable.ResultIterable at 0x7f7cff10cca0>),
 ('우', <pyspark.resultiterable.ResultIterable at 0x7f7cff10c040>)]

In [35]:
# Expand each group's iterable into a list so its members are visible.
for initial, members in result:
    print(initial, list(members))

짜 ['짜장면', '짜장면', '짜장면', '짜장면']
마 ['마라탕']
짬 ['짬뽕', '짬뽕']
떡 ['떡볶이']
쌀 ['쌀국수']
라 ['라면', '라면']
우 ['우동']


In [37]:
# Stop the SparkContext now that the examples are finished.
spark.stop()