# (Py)Spark Exercises

_Giovanni Simonini - Giuseppe Fiameni_

----
# Exercises
### Try to solve the following exercises using the API introduced above

In [None]:
from pyspark import SparkConf, SparkContext
conf = SparkConf()
# getOrCreate reuses the already-running context when this cell is executed
# again in the same kernel; constructing a second SparkContext would raise.
sc = SparkContext.getOrCreate(conf=conf)

## A. Wordcount
    1. read the file "DivineComedy.txt", containing "THE DIVINE COMEDY"
    2. select the 10 most frequent words, excluding stopwords

In [None]:
# %%sh
# pip install -U nltk

In [None]:
import nltk
# nltk.download("stopwords")

In [None]:
# Load the NLTK stopword list as a set; fall back to an empty set when NLTK
# (ImportError) or its "stopwords" corpus (LookupError) is not available.
# The fallback must be a set because the word-count cell calls
# `stopwords.union(...)` on it.
try:
    import nltk.corpus as corpus
    stopwords = set(corpus.stopwords.words())
except (ImportError, LookupError):
    stopwords = set()

In [None]:
# Ten most frequent non-stopword tokens in "THE DIVINE COMEDY".

# Add a few corpus-specific tokens to the stopword set before filtering.
stopwords = stopwords.union(["dante","etc._:","dante's","_the","(_inf._"])
rdd = sc.textFile("./data/DivineComedy.txt")
# Tokenise on whitespace, normalise case, drop stopwords, count occurrences
# per token, and order the (token, count) pairs by descending count.
rdd_sw = (
    rdd.flatMap(lambda text: text.split())
       .map(lambda token: token.strip().lower())
       .filter(lambda token: token not in stopwords)
       .map(lambda token: (token, 1))
       .reduceByKey(lambda left, right: left + right)
       .sortBy(lambda pair: pair[1], ascending=False)
)
res = rdd_sw.take(10)
# Alternative to sortBy/take: swap each pair to (count, word) and call .top(10).
res

## B. Estimating PI
This code estimates π by "throwing darts" at a circle.

1. We pick random points in the unit square ((0, 0) to (1, 1)) and count how many fall inside the unit circle (x² + y² < 1).
2. That fraction should be close to π / 4, so multiplying it by 4 gives our estimate.

In [None]:
# SOLUTION
from random import random
NUM_SAMPLES = 100

def sample(p):
    """Throw one random dart at the unit square.

    `p` is ignored; it exists only so the function can be passed to `map`.
    Returns 1 when the point (x, y) satisfies x*x + y*y < 1, otherwise 0.
    """
    x = random()
    y = random()
    if x*x + y*y < 1:
        return 1
    return 0

# Distribute one element per dart throw across the cluster.
sample_nums = sc.parallelize(range(0, NUM_SAMPLES))
# Keep the mapped RDD under its own name: rebinding `sample` here would
# replace the function with an RDD and break any later call to `sample(...)`.
hits = sample_nums.map(sample)
# Each element is 0 or 1, so the sum is the number of darts inside the circle.
count = hits.reduce(lambda a, b: a + b)

print("Pi is roughly {0:.2f}".format(4.0 * count / NUM_SAMPLES))

>The elements of a parallelized collection are copied to form a distributed dataset that can be operated on in parallel.

## C. TMax

This code calculates the maximum temperature registered within a set of measurements.

In [None]:
import re
import sys

In [None]:
def extractData(line):
    """Extract a (year, temperature) pair from one fixed-width record.

    Fields are taken by position from the stripped line: characters 15-18
    are the year, 87-91 the temperature, and 92 the quality code.

    Returns a one-element list ``[(year, temp)]`` (both strings) for a valid
    record, or an empty list when the temperature is the "+9999" marker or
    the quality code is not one of 0, 1, 4, 5, 9. Returning a list lets the
    function be used directly with ``flatMap`` to drop invalid records.
    """
    record = line.strip()
    year = str(record[15:19])
    temp = str(record[87:92])
    quality = str(record[92:93])
    if temp == "+9999" or not re.match("[01459]", quality):
        return []
    return [(year, temp)]

In [None]:
# Create an RDD of text lines from the weather records in ./data/1902.txt.
# NOTE(review): the path is relative, so whether it resolves to the local
# filesystem or to HDFS depends on the Spark configuration — confirm.
weatherData = sc.textFile("./data/1902.txt")

In [None]:
# Extract the valid (year, temperature) pairs, then keep the highest reading
# recorded for each year.
def _higher_reading(a, b):
    """Return the temperature string with the larger integer value (b on ties)."""
    return b if int(a) <= int(b) else a

temperature_per_year = weatherData.flatMap(extractData)
max_temperature_per_year = temperature_per_year.reduceByKey(_higher_reading)

In [None]:
#temperature_per_year.sortByKey().takeSample(False,10)
# List the distinct years that have at least one valid reading; the year is
# the key of each (year, temperature) pair.
years = temperature_per_year.keys()
years.distinct().collect()

In [None]:
# Save the per-year maxima as text under ./data/output.txt.
# saveAsTextFile does not overwrite: the target path must not already exist,
# so remove any previous output before re-running this cell.
max_temperature_per_year.saveAsTextFile("./data/output.txt")
# HDFS variant of the same save:
#max_temperature_per_year.saveAsTextFile("hdfs:///output")

## _pyspark_ doesn't support overwrite or append.

- The function `saveAsTextFile` is a wrapper around `saveAsHadoopFile`, and it is not possible to overwrite existing files.

### in scala
It is however trivial to do this using HDFS directly from Scala:
```scala
val hadoopConf = new org.apache.hadoop.conf.Configuration()

val hdfs = org.apache.hadoop.fs.FileSystem.get(new java.net.URI("hdfs://localhost:9000"), hadoopConf)
```
### in shell
- If you need to merge HDFS files into a single file, remember to use:
[hadoop getMerge](https://hadoop.apache.org/docs/r2.4.1/hadoop-project-dist/hadoop-common/FileSystemShell.html#getmerge)
- If you simply want to delete it:
```bash
% hdfs dfs -rm -R "hdfs:///output"
```

In [None]:
# HDFS variant of the same read:
#weatherData_ = sc.textFile("hdfs:///output")
# Read the previously saved results back from ./data/output.txt as an RDD of text lines.
weatherData_ = sc.textFile("./data/output.txt")

In [None]:
# Bring every saved line back to the driver and display it as the cell output.
weatherData_.collect()

In [None]:
%%bash
# Remove only the output written by saveAsTextFile so the save cell can be re-run.
# The explicit path avoids deleting unrelated entries in data/ that start with "o";
# -f keeps the cell from failing when the output does not exist.
rm -rf data/output.txt
#hdfs dfs -ls /output

In [None]:
#%%bash
#hdfs dfs -rm -R "hdfs:///output"

# http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/FileSystemShell.html#rm

## End of this chapter