In [None]:
from pyspark import SparkContext
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext() in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

## Word Count

#### Load the RDD from the text file

In [None]:
# Read the text file as an RDD with one element per line.
rdd_input = sc.textFile('data/HP.txt')
# Tokenise every line on whitespace. flatMap (rather than map) flattens the
# per-line lists, giving a single RDD of words instead of an RDD of lists.
words = rdd_input.flatMap(lambda text_line: text_line.split())
words.collect()

#### Map each word to a `(word, 1)` pair so that the words can be counted later by aggregation

In [None]:
# Pair each word with an initial count of 1 -> (word, 1).
words_map = words.map(lambda word: (word, 1))

#### Apply reduceByKey to complete the counting

In [None]:
# Add up the 1s of identical words to obtain (word, total) pairs.
words_count = words_map.reduceByKey(lambda left, right: left + right)
words_count.collect()

#### Sort them in the order of counts

In [None]:
# Order the (word, count) pairs by their count, smallest first.
# For descending order, pass ascending=False to sortBy (or negate the key).
words_count_sorted_freq = words_count.sortBy(lambda pair: pair[1])
words_count_sorted_freq.collect()

#### ----

### A different scenario

In [None]:
# name of the file is GoT.txt
# NOTE(review): the path below is spelled 'GOT.txt', which differs in case from
# the name stated above; on a case-sensitive filesystem only one spelling can
# match -- confirm which one is correct.

rdd_got = sc.textFile('data/GOT.txt')
rdd_got.collect()

In [None]:
# Data cleaning and preparation, done in a single map pass:
#   1. drop every '.' character using Python's str.replace
#   2. convert the line to lowercase
rdd_got = rdd_got.map(lambda text_line: text_line.replace('.', '').lower())
rdd_got.collect()

### Now let's try to calculate the frequency of the words "per line" instead of counting them across the whole dataset

In [None]:
# map (not flatMap) keeps one list of words per line, so each line stays
# a separate record.
rdd_parsed = rdd_got.map(lambda text_line: text_line.split())
rdd_parsed.collect()

In [None]:
def count_words_in_docs(doc_record):
    """Tally how often each word occurs within a single document record.

    Args:
        doc_record: An iterable of hashable items (the words of one record).

    Returns:
        A list of (word, count) tuples, one per distinct word.
    """
    tally = {}
    # Looping, why? The record holds many words and each one must be visited
    # to build the per-record tally.
    for token in doc_record:
        tally[token] = tally.get(token, 0) + 1
    return list(tally.items())

In [None]:
# Run the per-record word counter on every tokenised line.
rdd_loop_counts = rdd_parsed.map(count_words_in_docs)
rdd_loop_counts.collect()

In [None]:
# Generate the positional index of each record (record ids are hard to track
# sometimes): zipWithIndex pairs every element with its index in the RDD.
rdd_counts_docids = rdd_loop_counts.zipWithIndex()
rdd_counts_docids.collect()

### User Exercise: Try to calculate the frequency of the term 'csc' in each document, "across all" documents

The output should be an RDD (containing 4 elements) which looks like this (Only one element shown here):

    [(('csc', 'doc2'), 2),
    ..........
    ..........
    ..........]

In [None]:
# Ten raw records shaped '<term>-<doc id>-na'; the term appears in mixed case.
rdd_csc = sc.parallelize([
    'Csc-doc1-na',
    'csc-doc2-na',
    'cSc-doc2-na',
    'csC-doc3-na',
    'csc-doc3-na',
    'Csc-doc3-na',
    'csc-doc4-na',
    'cSc-doc4-na',
    'CSC-doc4-na',
    'csc-doc4-na',
])

#### NOTE: Please get rid of the term 'na' after you have done the pre-processing and splitting

In [None]:
# Normalise the case, then break each record apart on '-'.
rdd_csc_parsed = rdd_csc.map(lambda record: record.lower().split('-'))
rdd_csc_parsed.collect()

In [None]:
# Keep only the first two fields (term, doc id); the trailing 'na' is dropped.
rdd_csc_relevant = rdd_csc_parsed.map(lambda fields: (fields[0], fields[1]))
rdd_csc_relevant.collect()

In [None]:
# Use the (term, doc id) tuple as the key and attach an initial count of 1.
rdd_csc_parsed_1 = rdd_csc_relevant.map(lambda term_doc: (term_doc, 1))
rdd_csc_parsed_1.collect()

In [None]:
# Sum the 1s per (term, doc id) key to get the term frequency in each document.
rdd_csc_counted = rdd_csc_parsed_1.reduceByKey(lambda left, right: left + right)
rdd_csc_counted.collect()