-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_letters.py
50 lines (43 loc) · 1.01 KB
/
count_letters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Stanford CS246 Winter 2020, Colab 1
from pyspark.sql import *
def first_letters(line):
    """Return the lowercased first letter of each purely alphabetic word.

    The line is split on whitespace. A token containing any non-letter
    character (digits, punctuation, apostrophes, ...) is skipped entirely,
    so "Hello," contributes nothing while "Hello" contributes "h".
    """
    return [word[0].lower() for word in line.split() if word.isalpha()]


if __name__ == '__main__':
    # Count how many alphabetic words start with each letter and print the
    # result as a table sorted by letter.
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    try:
        documents = sc.textFile('../data/pg100.txt')
        counts = (
            documents.flatMap(first_letters)
            .map(lambda letter: (letter, 1))
            .reduceByKey(lambda n1, n2: n1 + n2)
            # The first positional parameter of sortByKey is `ascending`,
            # not a key function. The keys are already the letters, so the
            # default ascending sort needs no arguments.
            .sortByKey()
        )
        counts.toDF(['letter', 'count']).show(26)
    finally:
        # Stop the context even if the job above fails.
        sc.stop()
"""
|letter| count|
+------+------+
| a| 72155|
| b| 34065|
| c| 19618|
| d| 19492|
| e| 12666|
| f| 25624|
| g| 13682|
| h| 44569|
| i| 52986|
| j| 1474|
| k| 5861|
| l| 17256|
| m| 39937|
| n| 19183|
| o| 36408|
| p| 15838|
| q| 1299|
| r| 7973|
| s| 44741|
| t|105860|
| u| 7008|
| v| 3361|
| w| 46934|
| x| 1|
| y| 20646|
| z| 32|
+------+------+
"""