In [1]:
from pyspark import SparkConf, SparkContext
# 스파크 환경 설정 객체 생성
conf = SparkConf().setMaster('local').setAppName('restaurant-review-average')
spark = SparkContext(conf=conf).getOrCreate()

24/12/04 11:02:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
import os
directory = os.path.join(os.getcwd(), 'data')
filename = 'restaurant_reviews.csv'
filepath = os.path.join(directory, filename)

In [3]:
# RDD 생성
lines = spark.textFile('file:///'+filepath.replace('\\', '/'))
lines.take(5)

                                                                                

['id,item,cateogry,reviews,',
 '0,짜장면,중식,125,',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32,',
 '3,떡볶이,분식,534,']

In [4]:
header = lines.first()
filtered_lines = lines.filter(lambda row : row != header)

In [5]:
filtered_lines.take(5)

['0,짜장면,중식,125,',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32,',
 '3,떡볶이,분식,534,',
 '4,라멘,일식,223,']

In [6]:
def parse(row):
    fields = row.split(',')
    category = fields[2]
    review = int(fields[3])
    
    return category, review

In [7]:
parse('0, 짜장면, 중식, 125,')

(' 중식', 125)

### RDD내 모든 row에 대해 parse()를 적용한 다음, map() 추출

In [8]:
category_review = filtered_lines.map(parse)
category_review.take(5)

[('중식', 125), ('중식', 235), ('분식', 32), ('분식', 534), ('일식', 223)]

In [9]:
category_review_count = category_review.mapValues(lambda x: (x, 1))
category_review_count.take(6)

[('중식', (125, 1)),
 ('중식', (235, 1)),
 ('분식', (32, 1)),
 ('분식', (534, 1)),
 ('일식', (223, 1)),
 ('일식', (52, 1))]

'중식', (125, 1)<br>
첫번째 중식 > (125, 1) -> x<br>
두번째 중식 > (235, 1) -> y<br>
리뷰의 개수 합 = x[0] + y[0]<br>
건수의 합 = x[1] +y[1]

In [10]:
# 카테고리별 합계

reduce_rdd = category_review_count.reduceByKey(lambda x, y : (x[0] + y[0], x[1] + y[1]))
reduce_rdd.take(5)

[('중식', (360, 2)),
 ('분식', (566, 2)),
 ('일식', (287, 3)),
 ('아시안', (312, 1)),
 ('패스트푸드', (35, 2))]

In [11]:
reduce_rdd.collect()

[('중식', (360, 2)),
 ('분식', (566, 2)),
 ('일식', (287, 3)),
 ('아시안', (312, 1)),
 ('패스트푸드', (35, 2))]

In [12]:
# 평균
averge = reduce_rdd.mapValues(lambda x : x[0]/x[1])

In [13]:
averge.collect()

[('중식', 180.0),
 ('분식', 283.0),
 ('일식', 95.66666666666667),
 ('아시안', 312.0),
 ('패스트푸드', 17.5)]

# 연습

In [29]:
# 데이터 추가 > 메뉴별 합계와 평균
def parse_menu(row):
    fields = row.split(',')
    menu = fields[1]
    review = int(fields[3])
    
    return menu, review

In [30]:
menu_review = filtered_lines.map(menu)
menu_review.collect()

[('짜장면', 125),
 ('짬뽕', 235),
 ('김밥', 32),
 ('떡볶이', 534),
 ('라멘', 223),
 ('돈가스', 52),
 ('우동', 12),
 ('쌀국수', 312),
 ('햄버거', 12),
 ('치킨', 23)]

In [31]:
additional_data = [
    ('탕수육', 400),
    ('짜장면', 200),
    ('김치찌개', 350),
    ('비빔밥', 300)
]
add_rdd = spark.parallelize(additional_data)

In [32]:
cat_rdd = menu_review.union(add_rdd)

In [33]:
menu_review_count = cat_rdd.mapValues(lambda x: (x, 1))
menu_review_count.collect()

[('짜장면', (125, 1)),
 ('짬뽕', (235, 1)),
 ('김밥', (32, 1)),
 ('떡볶이', (534, 1)),
 ('라멘', (223, 1)),
 ('돈가스', (52, 1)),
 ('우동', (12, 1)),
 ('쌀국수', (312, 1)),
 ('햄버거', (12, 1)),
 ('치킨', (23, 1)),
 ('탕수육', (400, 1)),
 ('짜장면', (200, 1)),
 ('김치찌개', (350, 1)),
 ('비빔밥', (300, 1))]

In [34]:
# 메뉴별 합계, 리뷰 개수 계산
menu_reduce_rdd = menu_review_count.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
menu_reduce_rdd.collect()

[('짜장면', (325, 2)),
 ('짬뽕', (235, 1)),
 ('떡볶이', (534, 1)),
 ('쌀국수', (312, 1)),
 ('치킨', (23, 1)),
 ('탕수육', (400, 1)),
 ('김치찌개', (350, 1)),
 ('김밥', (32, 1)),
 ('라멘', (223, 1)),
 ('돈가스', (52, 1)),
 ('우동', (12, 1)),
 ('햄버거', (12, 1)),
 ('비빔밥', (300, 1))]

In [35]:
# 메뉴별 평균 계산
menu_average = menu_reduce_rdd.mapValues(lambda x: (x[0], x[0] / x[1]))
menu_average.collect()

[('짜장면', (325, 162.5)),
 ('짬뽕', (235, 235.0)),
 ('떡볶이', (534, 534.0)),
 ('쌀국수', (312, 312.0)),
 ('치킨', (23, 23.0)),
 ('탕수육', (400, 400.0)),
 ('김치찌개', (350, 350.0)),
 ('김밥', (32, 32.0)),
 ('라멘', (223, 223.0)),
 ('돈가스', (52, 52.0)),
 ('우동', (12, 12.0)),
 ('햄버거', (12, 12.0)),
 ('비빔밥', (300, 300.0))]

In [36]:
print(f"메뉴별 합계와 평균: {menu_average.collect()}")

메뉴별 합계와 평균: [('짜장면', (325, 162.5)), ('짬뽕', (235, 235.0)), ('떡볶이', (534, 534.0)), ('쌀국수', (312, 312.0)), ('치킨', (23, 23.0)), ('탕수육', (400, 400.0)), ('김치찌개', (350, 350.0)), ('김밥', (32, 32.0)), ('라멘', (223, 223.0)), ('돈가스', (52, 52.0)), ('우동', (12, 12.0)), ('햄버거', (12, 12.0)), ('비빔밥', (300, 300.0))]


In [6]:
spark.stop()

In [None]:
spark.stop()