• 혼자 연습하는 빅데이터/스파크 예제

"shoes_purchases" 에서  각 브랜드 별 구매횟수와 구매횟수 평균 그리고 각 제품별 A/S 확률 출력 

In [1]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
import pandas as pd

In [2]:
# Configure a local-mode Spark application named "key-value-shoes".
conf = SparkConf().setMaster("local").setAppName("key-value-shoes")
# getOrCreate() returns the already-running context when one exists, so
# re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Directory and file name of the input CSV.
path = "./data/"
filename = "shoes_purchases.csv"

# Read the file into an RDD holding one string per line.
csv_location = path + filename
lines = sc.textFile(csv_location)

In [4]:
# Bring every raw line (header row included) back to the driver for inspection.
lines.collect()

['ID,Shoes,Brand,Purchases,A/S',
 '1,Mercurial,Nike,1256,89',
 '2,Xspeed,Adidas,850,47',
 '3,Future,Puma,820,65',
 '4,Tiempo,Nike,759,56',
 '5,Ultra,Puma,723,45',
 '6,PhantomGX,Nike,642,45',
 '7,Copa,Adidas,533,35',
 '8,One,Puma,211,18',
 '9,PhantomVenom,Nike,125,11',
 '10,Nemessiz,Adidas,119,8',
 '11,Predator,Adidas,98,12',
 '12,King,Puma,97,4',
 '13,Magista,Nike,92,4',
 '14,Premier,Nike,78,8',
 '15,Ace,Adidas,68,6',
 '16,Morelia,Mizuno,57,2',
 '17,Furon,NewBalance,47,4',
 '18,Ignitus,Mizuno,38,4']

In [5]:
# Remove the header: read the first line, then keep only the lines that
# differ from it.
header = lines.first()
filtered_lines = lines.filter(lambda line: line != header)

In [6]:
# Show the data lines that remain after the header has been filtered out.
filtered_lines.collect()

['1,Mercurial,Nike,1256,89',
 '2,Xspeed,Adidas,850,47',
 '3,Future,Puma,820,65',
 '4,Tiempo,Nike,759,56',
 '5,Ultra,Puma,723,45',
 '6,PhantomGX,Nike,642,45',
 '7,Copa,Adidas,533,35',
 '8,One,Puma,211,18',
 '9,PhantomVenom,Nike,125,11',
 '10,Nemessiz,Adidas,119,8',
 '11,Predator,Adidas,98,12',
 '12,King,Puma,97,4',
 '13,Magista,Nike,92,4',
 '14,Premier,Nike,78,8',
 '15,Ace,Adidas,68,6',
 '16,Morelia,Mizuno,57,2',
 '17,Furon,NewBalance,47,4',
 '18,Ignitus,Mizuno,38,4']

In [25]:
# Parse one CSV record into a (brand, purchases) pair.
def parse(row):
    """Return ``(brand, purchases)`` for one comma-separated data line.

    Args:
        row: A line such as ``"1,Mercurial,Nike,1256,89"``; the brand is the
            third field and the purchase count is the fourth.

    Returns:
        A tuple of the brand string and the purchase count as an ``int``.

    Raises:
        IndexError: If the line has fewer than four fields.
        ValueError: If the purchases field cannot be converted to ``int``.
    """
    columns = row.split(",")
    return columns[2], int(columns[3])

# Apply parse() to every data line, producing an RDD of (brand, purchases) pairs.
Brand_Purchases = filtered_lines.map(parse)

In [8]:
# Inspect the (brand, purchases) pairs, one per product.
Brand_Purchases.collect()

[('Nike', 1256),
 ('Adidas', 850),
 ('Puma', 820),
 ('Nike', 759),
 ('Puma', 723),
 ('Nike', 642),
 ('Adidas', 533),
 ('Puma', 211),
 ('Nike', 125),
 ('Adidas', 119),
 ('Adidas', 98),
 ('Puma', 97),
 ('Nike', 92),
 ('Nike', 78),
 ('Adidas', 68),
 ('Mizuno', 57),
 ('NewBalance', 47),
 ('Mizuno', 38)]

In [9]:
# Turn each value into (purchases, 1) so purchase totals and product counts
# can be summed together later; mapValues leaves the Brand key as it is.
Brand_Purchases_Count = Brand_Purchases.mapValues(
    lambda purchases: (purchases, 1)
)

In [10]:
# Inspect the (brand, (purchases, 1)) records.
Brand_Purchases_Count.collect()

[('Nike', (1256, 1)),
 ('Adidas', (850, 1)),
 ('Puma', (820, 1)),
 ('Nike', (759, 1)),
 ('Puma', (723, 1)),
 ('Nike', (642, 1)),
 ('Adidas', (533, 1)),
 ('Puma', (211, 1)),
 ('Nike', (125, 1)),
 ('Adidas', (119, 1)),
 ('Adidas', (98, 1)),
 ('Puma', (97, 1)),
 ('Nike', (92, 1)),
 ('Nike', (78, 1)),
 ('Adidas', (68, 1)),
 ('Mizuno', (57, 1)),
 ('NewBalance', (47, 1)),
 ('Mizuno', (38, 1))]

In [96]:
# Per brand, add the pairs component-wise to get
# (total purchases, number of products).
reduced = Brand_Purchases_Count.reduceByKey(
    lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1])
)

In [97]:
# Inspect the per-brand (total purchases, product count) pairs.
reduced.collect()

[('Nike', (2952, 6)),
 ('Adidas', (1668, 5)),
 ('Puma', (1851, 4)),
 ('Mizuno', (95, 2)),
 ('NewBalance', (47, 1))]

In [98]:
# Total purchases per brand: keep only the first element of each pair.
reduced_show = reduced.mapValues(lambda totals: totals[0])

In [99]:
# Show the total purchases for each brand.
reduced_show.collect()

[('Nike', 2952),
 ('Adidas', 1668),
 ('Puma', 1851),
 ('Mizuno', 95),
 ('NewBalance', 47)]

In [100]:
# Average purchases per product for each brand:
# total purchases divided by the number of products.
Pur_average = reduced.mapValues(lambda totals: totals[0] / totals[1])

In [101]:
# Show the average purchases per product for each brand.
Pur_average.collect()

[('Nike', 492.0),
 ('Adidas', 333.6),
 ('Puma', 462.75),
 ('Mizuno', 47.5),
 ('NewBalance', 47.0)]

In [110]:
# Compute the A/S rate of each product.
# First extract (shoes, brand, purchases, A/S count) from every data line.
def _parse_as_record(row):
    """Return ``(shoes, brand, purchases, as_count)`` for one CSV data line.

    The line is split only once; the purchases and A/S fields are converted
    to ``int``.
    """
    fields = row.split(",")
    return (fields[1], fields[2], int(fields[3]), int(fields[4]))


Brand_AS = filtered_lines.map(_parse_as_record)

In [111]:
# Inspect the (shoes, brand, purchases, A/S count) records.
Brand_AS.collect()

[('Mercurial', 'Nike', 1256, 89),
 ('Xspeed', 'Adidas', 850, 47),
 ('Future', 'Puma', 820, 65),
 ('Tiempo', 'Nike', 759, 56),
 ('Ultra', 'Puma', 723, 45),
 ('PhantomGX', 'Nike', 642, 45),
 ('Copa', 'Adidas', 533, 35),
 ('One', 'Puma', 211, 18),
 ('PhantomVenom', 'Nike', 125, 11),
 ('Nemessiz', 'Adidas', 119, 8),
 ('Predator', 'Adidas', 98, 12),
 ('King', 'Puma', 97, 4),
 ('Magista', 'Nike', 92, 4),
 ('Premier', 'Nike', 78, 8),
 ('Ace', 'Adidas', 68, 6),
 ('Morelia', 'Mizuno', 57, 2),
 ('Furon', 'NewBalance', 47, 4),
 ('Ignitus', 'Mizuno', 38, 4)]

In [102]:
# Per-product A/S ratio in percent, kept numeric as (shoes, brand, percent).
AS_Percent = Brand_AS.map(
    lambda rec: (rec[0], rec[1], rec[3] / rec[2] * 100)
)
# Display variant of the same ratio: one decimal place with a "%" suffix.
AS_Percent_show = Brand_AS.map(
    lambda rec: (rec[0], rec[1], format(rec[3] / rec[2] * 100, ".1f") + "%")
)

In [103]:
# Show the formatted A/S percentage of each product.
AS_Percent_show.collect()

[('Mercurial', 'Nike', '7.1%'),
 ('Xspeed', 'Adidas', '5.5%'),
 ('Future', 'Puma', '7.9%'),
 ('Tiempo', 'Nike', '7.4%'),
 ('Ultra', 'Puma', '6.2%'),
 ('PhantomGX', 'Nike', '7.0%'),
 ('Copa', 'Adidas', '6.6%'),
 ('One', 'Puma', '8.5%'),
 ('PhantomVenom', 'Nike', '8.8%'),
 ('Nemessiz', 'Adidas', '6.7%'),
 ('Predator', 'Adidas', '12.2%'),
 ('King', 'Puma', '4.1%'),
 ('Magista', 'Nike', '4.3%'),
 ('Premier', 'Nike', '10.3%'),
 ('Ace', 'Adidas', '8.8%'),
 ('Morelia', 'Mizuno', '3.5%'),
 ('Furon', 'NewBalance', '8.5%'),
 ('Ignitus', 'Mizuno', '10.5%')]

In [104]:
# Prepare the per-brand average A/S ratio: drop the shoe name and key each
# numeric percent by its brand.
AS_Percent_By_Brand = AS_Percent.map(lambda rec: (rec[1], rec[2]))

In [105]:
# Inspect the (brand, A/S percent) pairs, one per product.
AS_Percent_By_Brand.collect()

[('Nike', 7.085987261146497),
 ('Adidas', 5.529411764705882),
 ('Puma', 7.926829268292683),
 ('Nike', 7.378129117259552),
 ('Puma', 6.224066390041494),
 ('Nike', 7.009345794392523),
 ('Adidas', 6.566604127579738),
 ('Puma', 8.530805687203792),
 ('Nike', 8.799999999999999),
 ('Adidas', 6.722689075630252),
 ('Adidas', 12.244897959183673),
 ('Puma', 4.123711340206185),
 ('Nike', 4.3478260869565215),
 ('Nike', 10.256410256410255),
 ('Adidas', 8.823529411764707),
 ('Mizuno', 3.508771929824561),
 ('NewBalance', 8.51063829787234),
 ('Mizuno', 10.526315789473683)]

In [106]:
# Pair each product's A/S percent (rounded to one decimal) with a count of 1.
# NOTE(review): the values are rounded before they are summed, so the
# average is built from rounded inputs — confirm this is intended rather
# than rounding only the final result.
AS_average_count = AS_Percent_By_Brand.mapValues(
    lambda pct: (round(pct, 1), 1)
)
# Per brand, sum the rounded percents and the counts.
reduced_AS = AS_average_count.reduceByKey(
    lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1])
)

In [107]:
# Inspect the per-brand (summed A/S percent, product count) pairs.
reduced_AS.collect() 

[('Nike', (44.900000000000006, 6)),
 ('Adidas', (39.8, 5)),
 ('Puma', (26.700000000000003, 4)),
 ('Mizuno', (14.0, 2)),
 ('NewBalance', (8.5, 1))]

In [108]:
# Average A/S percent per brand: summed percents divided by the product
# count, rounded to one decimal place.
AS_average_result = reduced_AS.mapValues(
    lambda sums: round(sums[0] / sums[1], 1)
)

In [109]:
# Show the average A/S percent for each brand.
AS_average_result.collect()

[('Nike', 7.5),
 ('Adidas', 8.0),
 ('Puma', 6.7),
 ('Mizuno', 7.0),
 ('NewBalance', 8.5)]