# RDD 编程

## Spark 初始化

In [1]:
from pyspark import SparkContext

In [2]:
# Spark context. getOrCreate() reuses the context that is already running when
# this cell is executed a second time; calling the SparkContext constructor
# again in the same process raises
# "ValueError: Cannot run multiple SparkContexts at once".
from pyspark import SparkConf

sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local[*]').setAppName('rdd_dev')
)

In [3]:
# Common SparkContext attributes.
# These are bare expressions, so a notebook echoes only the last one
# (appName); wrap the others in print() to see their values as well.
sc.version    # Spark version
sc.pythonVer  # Python version
sc.master     # master URL the context is connected to
sc.appName    # application name

'rdd_dev'

## 创建 RDD

### 2.1 并行化集合

In [4]:
# Build RDDs from in-memory Python collections.

# (key, value) pairs with a repeated key
rdd = sc.parallelize([('a', 7), ('a', 2), ('b', 2)])
# plain integers
rdd1 = sc.parallelize([2, 5, 1, 8])
# (key, value) pairs with distinct keys
rdd2 = sc.parallelize([('a', 2), ('d', 1), ('b', 1)])
# the integers 0..99
rdd3 = sc.parallelize(range(100))
# (key, list-of-strings) pairs
rdd4 = sc.parallelize([('a', ['x', 'y', 'z']), ('b', ['p', 'r'])])
# same contents as rdd
rdd5 = sc.parallelize([('a', 7), ('a', 2), ('b', 2)])

### 2.2 读取文件

In [5]:
# Template for reading a text file into an RDD (replace "File_Path" with a
# real path before uncommenting):
# text_file_rdd = sc.textFile("File_Path")

## 获取 RDD 信息

### 3.1 基本信息

In [6]:
# Number of partitions rdd is split into
rdd.getNumPartitions()

8

In [7]:
# Collect the keys (first element of every pair) of rdd
rdd.keys().collect()

['a', 'a', 'b']

In [8]:
# Collect the values (second element of every pair) of rdd.
# values() on its own only returns another lazy RDD, whose repr says nothing
# about the data, so collect() is needed to actually see the values.
rdd.values().collect()

[7, 2, 2]

In [13]:
# Show the full contents of rdd as a Python list
rdd.collect()

[('a', 7), ('a', 2), ('b', 2)]

In [10]:
# Check whether rdd contains no elements
rdd.isEmpty()

False

### foldByKey

In [14]:
# Fold the values of each key together, starting from the zero value 0
rdd.foldByKey(0, lambda acc, value: acc + value).collect()

[('a', 9), ('b', 2)]

### reduceByKey

In [15]:
# Sum the values that share the same key
rdd.reduceByKey(lambda left, right: left + right).collect()

[('a', 9), ('b', 2)]

In [16]:
# Load the score file; every element of the RDD is one line of text
text_file_rdd = sc.textFile('./data/score_data.txt')

In [18]:
# Show every line of the file (each line has the form "name,course,score")
text_file_rdd.collect()

['Aaron,OperatingSystem,100',
 'Aaron,Python,50',
 'Aaron,ComputerNetwork,30',
 'Aaron,Software,94',
 'Abbott,DataBase,18',
 'Abbott,Python,82',
 'Abbott,ComputerNetwork,76',
 'Abel,Algorithm,30',
 'Abel,DataStructure,38',
 'Abel,OperatingSystem,38',
 'Abel,ComputerNetwork,92',
 'Abraham,DataStructure,12',
 'Abraham,ComputerNetwork,78',
 'Abraham,Software,98',
 'Adair,DataBase,20',
 'Adair,Python,98',
 'Adair,Software,88',
 'Adam,Algorithm,18',
 'Adam,ComputerNetwork,70',
 'Adam,Software,80',
 'Adolph,DataStructure,82',
 'Adolph,CLanguage,100',
 'Adolph,ComputerNetwork,70',
 'Adolph,Software,18',
 'Adonis,DataBase,86',
 'Adonis,Algorithm,34',
 'Adonis,DataStructure,52',
 'Adonis,CLanguage,30',
 'Adonis,Python,86',
 'Alan,Algorithm,48',
 'Alan,OperatingSystem,86',
 'Alan,CLanguage,72',
 'Alan,Python,94',
 'Alan,ComputerNetwork,88',
 'Albert,DataStructure,60',
 'Albert,CLanguage,76',
 'Albert,ComputerNetwork,62',
 'Aldrich,DataBase,42',
 'Aldrich,Python,98',
 'Aldrich,ComputerNetwork,80'

In [20]:
# Number of lines in the score file. Each line is one "name,course,score"
# record and a student appears once per course taken, so this is the number
# of records, not the number of distinct students.
# NOTE(review): the original comment read "total number of students"; if that
# is what is wanted, count the distinct names instead — confirm intent.
text_file_rdd.count()

1073

In [23]:
# Split every "name,course,score" line into its three fields; the course
# count and the other statistics below are computed from this RDD.
test_file_rdd_all = text_file_rdd.map(lambda line: line.split(','))

In [27]:
# Total number of courses: take the course field (index 1) of every record
# and count the distinct names.
(
    test_file_rdd_all
    .map(lambda fields: fields[1])
    .distinct()
    .count()
)

8

In [28]:
# Average of all of Tom's scores
(
    test_file_rdd_all
    .filter(lambda fields: fields[0] == 'Tom')
    .map(lambda fields: int(fields[2]))
    .mean()
)

30.8

In [29]:
# Number of courses taken by each student: emit (name, 1) for every record
# and add the ones up per name.
(
    test_file_rdd_all
    .map(lambda fields: (fields[0], 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('Aaron', 4),
 ('Adair', 3),
 ('Adonis', 5),
 ('Alan', 5),
 ('Aldrich', 3),
 ('Alexander', 4),
 ('Alvin', 5),
 ('Amos', 5),
 ('Andrew', 4),
 ('Antonio', 3),
 ('Archer', 5),
 ('Armstrong', 2),
 ('Baron', 6),
 ('Barry', 5),
 ('Bartholomew', 5),
 ('Bart', 5),
 ('Beck', 4),
 ('Ben', 4),
 ('Benedict', 6),
 ('Benjamin', 4),
 ('Bennett', 6),
 ('Benson', 4),
 ('Bernard', 2),
 ('Bernie', 3),
 ('Bertram', 3),
 ('Bill', 2),
 ('Bishop', 2),
 ('Blithe', 3),
 ('Booth', 6),
 ('Borg', 4),
 ('Boris', 6),
 ('Boyd', 3),
 ('Brian', 6),
 ('Brook', 4),
 ('Bruno', 5),
 ('Channing', 4),
 ('Christian', 2),
 ('Clarence', 7),
 ('Clark', 6),
 ('Claude', 2),
 ('Clement', 5),
 ('Cleveland', 4),
 ('Cliff', 5),
 ('Colbert', 4),
 ('Colby', 4),
 ('Colin', 5),
 ('Conrad', 2),
 ('Corey', 4),
 ('Dean', 7),
 ('Devin', 4),
 ('Dick', 3),
 ('Don', 2),
 ('Donahue', 5),
 ('Duke', 4),
 ('Duncann', 5),
 ('Edward', 4),
 ('Eli', 5),
 ('Elijah', 4),
 ('Ellis', 4),
 ('Elmer', 4),
 ('Elroy', 5),
 ('Elton', 5),
 ('Enoch', 3),
 ('Ford'

In [31]:
# How many students take the DataBase course: keep the DataBase records,
# project the student name and count the distinct names.
(
    test_file_rdd_all
    .filter(lambda fields: fields[1] == 'DataBase')
    .map(lambda fields: fields[0])
    .distinct()
    .count()
)

125

In [34]:
# Average score of every course
import numpy as np

# (course, score) pairs grouped into (course, [score, ...]).
# NOTE(review): groupByKey moves every single score through the shuffle; an
# aggregateByKey/reduceByKey on (sum, count) would scale better — consider it
# if the data grows.
rdd_score = (
    test_file_rdd_all
    .map(lambda fields: (fields[1], int(fields[2])))
    .groupByKey()
    .mapValues(list)
)
rdd_score.map(lambda pair: (pair[0], np.mean(pair[1]))).collect()

[('OperatingSystem', 54.940298507462686),
 ('Python', 57.8235294117647),
 ('ComputerNetwork', 51.901408450704224),
 ('DataBase', 50.53968253968254),
 ('Algorithm', 48.833333333333336),
 ('DataStructure', 47.57251908396947),
 ('CLanguage', 50.609375),
 ('Software', 50.90909090909091)]