## RDD: text reading and processing with map(), flatMap() and filter()

### Read and process a text file from a local directory

In [4]:
pwd

'/Users/binggangliu/gitprojects/myPysparkSnippets'

In [5]:
# Load the CSV as an RDD of raw text lines (one element per line; no CSV parsing is done).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
rdd = sc.textFile("/Users/binggangliu/Downloads/history.csv")

In [154]:
rdd.take(2)

['LAT;;;;;44.9800;44.9800;44.9800;44.9800',
 'LON;;;;;-93.2638;-93.2638;-93.2638;-93.2638']

`sc.textFile()` works for both CSV and TXT files; in either case each line of the file becomes one string element of the RDD.

In [72]:
# Load a plain-text document; each line of the file becomes one RDD element.
rdd2 = sc.textFile("/Users/binggangliu/Downloads/BusinessPlan-v4.txt")

rdd2.count()  # number of elements (i.e. lines of the file) in the RDD

21001

In [98]:
rdd2.take(4)

['',
 '      ',
 '      Business Plan v-4',
 'This business plan consists of a narrative and several financial spreadsheets. The narrative template is the body of the business plan. It contains more than 150 questions divided into several sections. Work through the sections in any order you like, except']

In [75]:
# Same file as rdd2, but request a minimum of 4 partitions for the resulting RDD.
rdd3 = sc.textFile("/Users/binggangliu/Downloads/BusinessPlan-v4.txt", minPartitions=4)
# The line count is unaffected by how the data is partitioned.
rdd3.count()

21001

In [76]:
rdd3.getNumPartitions()

4

### map() and flatMap()

Define a function that converts a line to lower case and splits it into words, then apply it with both `map()` and `flatMap()` to compare the results.

In [110]:
def to_lower_split(lines):
    """Lower-case one line of text and break it into whitespace-separated tokens.

    Args:
        lines: A single string (one RDD element), despite the plural name.

    Returns:
        A list of lower-cased tokens. Empty or whitespace-only input yields
        an empty list, because ``str.split()`` with no separator drops
        leading, trailing and repeated whitespace.
    """
    return lines.lower().split()

# map() calls the function once per line, so each element becomes a list of tokens.
rdd3_split = rdd3.map(to_lower_split)

In [111]:
rdd3_split.take(4)

[[],
 [],
 ['business', 'plan', 'v-4'],
 ['this',
  'business',
  'plan',
  'consists',
  'of',
  'a',
  'narrative',
  'and',
  'several',
  'financial',
  'spreadsheets.',
  'the',
  'narrative',
  'template',
  'is',
  'the',
  'body',
  'of',
  'the',
  'business',
  'plan.',
  'it',
  'contains',
  'more',
  'than',
  '150',
  'questions',
  'divided',
  'into',
  'several',
  'sections.',
  'work',
  'through',
  'the',
  'sections',
  'in',
  'any',
  'order',
  'you',
  'like,',
  'except']]

In [121]:
rdd3_split.count()

21001

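Note that `rdd3_split` has the same number of rows as `rdd3`, because `map()` produces exactly one output element per input line. `flatMap()` flattens the per-line word lists into individual words, so the resulting RDD has many more rows (see the row count after applying `flatMap()` below).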

In [112]:
# flatMap() flattens the per-line token lists, giving one RDD element per token.
rdd3_split_flat = rdd3.flatMap(to_lower_split)

In [113]:
rdd3_split_flat.take(10)

['business',
 'plan',
 'v-4',
 'this',
 'business',
 'plan',
 'consists',
 'of',
 'a',
 'narrative']

In [119]:
rdd3_split_flat.count()

92760

### filter()

In [125]:
import re
# Keep only the tokens that begin with 'm' (tokens were lower-cased by to_lower_split).
filterRDD = rdd3_split_flat.filter(lambda x: x.startswith('m'))
# Drop duplicate tokens before previewing ten of the matches.
filterRDD.distinct().take(10)

['money',
 'matters',
 'market.)',
 'must',
 'may',
 'matrix,',
 'methods,',
 'matter',
 'materials,',
 'monthly']

In [106]:
# Pair every word with a count of 1 so that occurrences can be summed per word.
rdd_mapped = rdd3_split_flat.map(lambda x: (x,1))
# Kept for the demonstration cell that inspects it: groupByKey() yields one
# (word, iterable-of-ones) pair per distinct word.
rdd_grouped = rdd_mapped.groupByKey()
# Sum the counts with reduceByKey() rather than groupByKey().mapValues(sum):
# reduceByKey combines values within each partition before the shuffle, so far
# less data crosses the network. The (word, count) pairs are then flipped to
# (count, word) so sortByKey(False) orders them by descending frequency.
rdd_frequency = (
    rdd_mapped
    .reduceByKey(lambda a, b: a + b)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(False)
)
rdd_mapped.take(10)


[('business', 1),
 ('plan', 1),
 ('v-4', 1),
 ('this', 1),
 ('business', 1),
 ('plan', 1),
 ('consists', 1),
 ('of', 1),
 ('a', 1),
 ('narrative', 1)]

In [107]:
rdd_grouped.take(5)

[('business', <pyspark.resultiterable.ResultIterable at 0x110c94668>),
 ('v-4', <pyspark.resultiterable.ResultIterable at 0x110c94908>),
 ('of', <pyspark.resultiterable.ResultIterable at 0x110c99588>),
 ('narrative', <pyspark.resultiterable.ResultIterable at 0x110c77940>),
 ('spreadsheets.', <pyspark.resultiterable.ResultIterable at 0x110c99710>)]

In [109]:
rdd_frequency.take(15)

[(3300, 'the'),
 (2940, 'you'),
 (2880, '�'),
 (2820, 'and'),
 (2700, 'your'),
 (2680, 'of'),
 (2180, 'to'),
 (1440, 'a'),
 (1220, 'in'),
 (1200, 'is'),
 (1080, 'do'),
 (1080, 'for'),
 (1000, 'or'),
 (900, 'what'),
 (760, 'are')]

In [114]:
# Most frequent tokens from the frequency table, treated as stop words.
skipwords = ['the','you','�','and','your','of','to','a','in','is','do','for','or']
# The membership test runs once per word inside the filter; a frozenset lookup
# is constant-time, whereas scanning the list is linear in its length.
skipword_set = frozenset(skipwords)
new_rdd = rdd3_split_flat.filter(lambda x: x not in skipword_set)
new_rdd.take(15)

['business',
 'plan',
 'v-4',
 'this',
 'business',
 'plan',
 'consists',
 'narrative',
 'several',
 'financial',
 'spreadsheets.',
 'narrative',
 'template',
 'body',
 'business']

In [115]:
new_rdd.count()

66240

In [149]:
new_rdd2 = new_rdd.distinct()

In [150]:
new_rdd2.count()

1494

In [151]:
# Group the distinct words by their first three characters; words shorter than
# three characters become their own key, since slicing past the end is allowed.
new_rdd3 = new_rdd2.groupBy(lambda w: w[0:3])
# Each group's values arrive as an iterable, so wrap them in list() to print the words.
for key, value in new_rdd3.take(10):
    print(key, list(value))

bus ['business', 'business.', 'business:', 'business,', 'businesses,', 'businesses.', 'business?', 'businesses']
v-4 ['v-4']
150 ['150']
whe ['when', 'where', 'whether', 'when.']
are ['are', 'areas,', 'area.', 'areas']
val ['value', 'valuable']
res ['research', 'research:', 'restaurants)', 'research,', 'responsible', 'research.']
thi ['think', 'thinking', 'this', 'things']
but ['but']
tit ['titled']


In [152]:
# Sample without replacement, keeping each element with probability 0.1.
# A fixed seed makes the draw repeatable across Restart & Run All (given the
# same input partitioning); without it every run yields a different sample.
sample_rdd = new_rdd2.sample(False, 0.1, seed=42)
sample_rdd.count()

150

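Make a sample RDD containing roughly 10% of the distinct words. `sample()` keeps each element with probability 0.1, so the resulting size is only approximately 10% of the input.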

In [153]:
sample_rdd.take(5)

['edit', 'them', 'depending', 'quality', 'finally,']