### Palindromes

In [2]:
# This allows us to download web pages
import requests

# Spark cannot read a file directly from a URL, so the file is downloaded here,
# on the driver, and turned into a list of lines.
# This is only acceptable as an example with a small file: a huge file would
# not fit in the memory of a single machine.
response = requests.get('https://raw.githubusercontent.com/forons/BigDataExamples/master/files/data.txt')
# Fail right away on an HTTP error instead of processing an error page as data
response.raise_for_status()
# '.text' decodes the body, so every line is a text string rather than raw bytes
file_content = response.text.splitlines()


def reverse(s):
  """Return the sequence `s` with its elements in reverse order.

  A slice with a negative step does the reversal in a single pass and returns
  an object of the same type as the input, so both text strings and byte
  strings are supported (prepending element by element is quadratic and breaks
  on bytes, whose elements are integers).
  """
  return s[::-1]

# Distribute the downloaded lines as an RDD, one element per line
words = sc.parallelize(file_content)

# A palindrome reads the same in both directions:
# keep only the elements that are equal to their own reversal
palindroms = words.filter(lambda word: reverse(word) == word)

# Bring the matching elements back to the driver
palindroms.collect()

### Words that occur exactly 5 times

In [4]:
# This allows us to download web pages
import requests
# This allows the usage of add, which is a shortcut for 'lambda x, y: x + y'
from operator import add

# Spark cannot read a file directly from a URL, so the file is downloaded here,
# on the driver, and turned into a list of lines.
# This is only acceptable as an example with a small file: a huge file would
# not fit in the memory of a single machine.
response = requests.get('https://raw.githubusercontent.com/forons/BigDataExamples/master/files/inferno.txt')
# Fail right away on an HTTP error instead of processing an error page as data
response.raise_for_status()
# '.text' decodes the body, so every line is a text string rather than raw bytes
file_content = response.text.splitlines()
# Distribute the lines as an RDD, one element per line of the file
lines = sc.parallelize(file_content)

# Break every element of the 'lines' rdd (a line of the file) into words.
# split() with no argument splits on any run of whitespace and never yields
# empty strings, so indentation or repeated spaces are not counted as a word.
words = lines.flatMap(lambda x: x.split())

# Pair each word with a counter initialized to one
word_pairs = words.map(lambda x: (x, 1))

# Sum the counters of all the pairs sharing the same key (the word)
word_count = word_pairs.reduceByKey(add)

# In each (word, count) pair x[0] is the word and x[1] is its count:
# keep only the words that occur exactly 5 times
word_count.filter(lambda x: x[1] == 5).collect()

### Group by occurrences

In [6]:
separator = '######'

# Group the (word, count) pairs by their count and bring the groups to the driver
result = word_count.groupBy(lambda pair: pair[1]).collect()

# Print the separator before each group, then one line per element of the
# group followed by the number of occurrences shared by the whole group
for occurrences, group in result:
  print(separator)
  for entry in group:
    print(entry, occurrences)

### Group Anagrams

In [8]:
def charsAndSort(x):
  """Build the (key, [word]) pair used to group anagrams.

  The key is made of the characters of `x`, lower-cased and sorted, joined
  back into a single string: two words made of the same letters get the same
  key regardless of case. The value is a one-element list holding `x` itself,
  unchanged.
  """
  key = ''.join(sorted(x.lower()))
  return (key, [x])

# Lower-case the words *before* removing the duplicates: otherwise 'Word' and
# 'word' would both survive distinct(), get the same sorted-characters key and
# be reported as anagrams of each other.
# split() with no argument splits on any run of whitespace and never yields
# empty strings.
distinct = lines.flatMap(lambda x: x.split()).map(lambda w: w.lower()).distinct()
# Turn each word into a (sorted characters, [word]) pair
pairs = distinct.map(charsAndSort)
# Concatenate the lists of the words sharing the same key.
# Here it could have been done:
# from operator import add
# and instead of 'lambda x, y: x + y' we could have done just 'add'
partial = pairs.reduceByKey(lambda x, y: x + y)
# A key with a single word has no anagram: keep only the keys with more than one
anagrams = partial.filter(lambda x: len(x[1]) > 1)

anagrams.collect()