### Create a SparkContext object

In [1]:
import findspark
import random
import pyspark
from pyspark import SparkContext

In [3]:
# Create the SparkContext, or reuse the one that is already active in this kernel.
# Calling the SparkContext constructor a second time (e.g. when this cell is
# re-run) raises an error because only one context may be active at a time;
# getOrCreate makes the cell safe to execute repeatedly.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("basic_operations"))

## Filter Transformation
Keep only the lines that contain the word "google".

### Loading data
I use **textFile** to load all the files present inside the input path

In [4]:
# Location of the input log files and of the folder that will receive the result
inputPath, outputPath = "../", "../"

# Load the input as an RDD of strings, one element per line of text
logRDD = sc.textFile(inputPath)

### Filter transformation
I define a lambda function to use as the filter predicate.
- First I apply the `lower` function to convert the whole line to lowercase
- Then I apply the `find` function, which returns the index of the first occurrence of the searched word (in this case "google") inside the line, or -1 if the word is not found

In [5]:
# Keep a line only when "google" occurs in its lowercase form:
# find() yields -1 when the word is absent, any other value means a match
googleRDD = logRDD.filter(
    lambda entry: entry.lower().find("google") != -1
)

I use the collect() method here only for debugging purposes, to check that everything is OK

In [6]:
# Debug only: collect() brings every element of googleRDD back to the driver
# as a Python list
test = googleRDD.collect()
# Bare last expression so that the notebook displays the collected lines
test

['66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET http://www.google.com/bot.html”',
 '66.249.69.97 - - [24/Sep/2014:22:26:44 +0000] "GET http://www.google.com/how.html”',
 '71.19.157.179 - - [24/Sep/2014:22:30:12 +0000] "GET http://www.google.com/faq.html”']

### Store the result in the output folder
I use the **saveAsTextFile** method to save the result in the output folder

In [7]:
# Store the selected lines as text files in the output folder
# NOTE(review): outputPath is assigned the same "../" value as inputPath in the
# loading cell — confirm it is replaced by a dedicated output directory before running.
googleRDD.saveAsTextFile(outputPath)

I stop the SparkContext instance

In [8]:
# Shut down the SparkContext that was created at the top of the notebook
sc.stop()

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## Filter+Map Transformation
Select the IP addresses of the lines that contain "www.google.com"

In [None]:
# The previous section ends with sc.stop(), so sc can no longer be used as is.
# getOrCreate returns the active SparkContext if there is one, otherwise it
# creates a new one; this keeps the section runnable under "Run All".
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("basic_operations"))

# Read the content of the input file (one RDD element per line of text)
inputPath = "../"
outputPath = "../"
logRDD = sc.textFile(inputPath)

### Filter Transformation

In [None]:
# Keep a line only when "www.google.com" occurs in its lowercase form
# (find() yields -1 when the text is absent)
googleRDD = logRDD.filter(
    lambda entry: entry.lower().find("www.google.com") != -1
)
# Debug only: display the selected lines
googleRDD.collect()

### Map Transformation

In [None]:
# Use map to select only the IP address: it is the first whitespace-separated
# field of each log line. Splitting on '-' instead would keep the blank that
# follows the address and would truncate host names that contain a hyphen.
IPsRDD = googleRDD.map(lambda logLine: logLine.split()[0])
# Debug only: display the extracted addresses
IPsRDD.collect()

### Distinct Transformation

In [None]:
# Remove duplicates: distinct() returns a new RDD in which each value of
# IPsRDD appears only once
distinctIPsRDD = IPsRDD.distinct()

### saveAsTextFile Action

In [None]:
# Store the distinct IP addresses as text files in the output folder
distinctIPsRDD.saveAsTextFile(outputPath)
# Stop the SparkContext once the result of this section has been written
sc.stop()

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## FlatMap Transformation
Select the IP addresses of the lines that contain "www.google.com"

In [None]:
# The previous section ends with sc.stop(), so sc can no longer be used as is.
# getOrCreate returns the active SparkContext if there is one, otherwise it
# creates a new one; this keeps the section runnable under "Run All".
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("basic_operations"))

# Read the content of the input file (one RDD element per line of text)
inputPath = "../"
outputPath = "../"
logRDD = sc.textFile(inputPath)

### Defining a personalized function

In [None]:
def filterAndExtractIP(line):
    """Return the client address of a log line that mentions www.google.com.

    The function is meant to be passed to flatMap: it combines the filter and
    the map steps by returning a list with zero or one element.

    Args:
        line: one text line of the web log; the client IP (or host name) is
            its first whitespace-separated field.

    Returns:
        A one-element list holding the first field of the line when the line
        contains "www.google.com" (case-insensitive), otherwise an empty list.
    """
    # Lines that do not mention the target host contribute nothing
    if "www.google.com" not in line.lower():
        return []

    # Split on whitespace rather than on '-': splitting on '-' keeps the blank
    # that follows the address and truncates host names containing a hyphen.
    return [line.split()[0]]

### FlatMap Transformation
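The function passed to flatMap returns a **list of elements** for each input line; flatMap then flattens all these lists into a single RDD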

In [None]:
# Apply filterAndExtractIP to every log line and flatten the returned lists:
# a line that contains "www.google.com" contributes its IP address,
# any other line returns an empty list and therefore contributes nothing.
IPsRDD = logRDD.flatMap(filterAndExtractIP)

### Distinct Transformation

In [None]:
# Remove duplicates: distinct() returns a new RDD in which each value of
# IPsRDD appears only once
distinctIPsRDD = IPsRDD.distinct()

### saveAsTextFile Action

In [None]:
# Store the distinct IP addresses as text files in the output folder
distinctIPsRDD.saveAsTextFile(outputPath)

In [None]:
# Stop the SparkContext once the result of this section has been written
sc.stop()

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## Map + Reduce
Extract the maximum PM10 value and print it

In [None]:
# The previous section ends with sc.stop(), so sc can no longer be used as is.
# getOrCreate returns the active SparkContext if there is one, otherwise it
# creates a new one; this keeps the section runnable under "Run All".
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("basic_operations"))

# Read the content of the input file (one RDD element per line of text)
inputPath = "../"
outputPath = "../"
readingsRDD = sc.textFile(inputPath)

### Map Transformation

In [None]:
# Parse every reading: split the comma-separated line and convert its
# third field (index 2), the PM10 value, to float
pm10ValuesRDD = readingsRDD.map(
    lambda reading: float(reading.split(",")[2])
)

### Reduce Action

In [None]:
# Reduce the PM10 values pairwise, keeping the larger of the two at each step,
# so that the final result is the maximum PM10 value
maxPM10Value = pm10ValuesRDD.reduce(
    lambda left, right: max(left, right)
)

In [None]:
# Print the result on the standard output of the Driver program/notebook.
# The variable itself must be printed: quoting its name would only print
# the literal text "maxPM10Value" instead of the computed maximum.
print(maxPM10Value)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## TakeOrdered - Top
Extract the highest PM10 value

In [None]:
# Input location and output folder for this section
inputPath, outputPath = "../", "../"

# Load the readings as an RDD of strings, one element per line of text
readingsRDD = sc.textFile(inputPath)

### Map Transformation

In [None]:
# Parse every reading: split the comma-separated line and convert its
# third field (index 2), the PM10 value, to float
pm10ValuesRDD = readingsRDD.map(
    lambda reading: float(reading.split(",")[2])
)

### takeOrdered - top Action

In [None]:
# Option 1 - takeOrdered: it returns the smallest elements first, so the values
# are ordered by their negation to make the largest PM10 value come first
maxPM10Value = pm10ValuesRDD.takeOrdered(1, key=lambda value: -value)[0]

# Option 2 - top: it returns the largest elements directly
# (this assignment overwrites the one above)
maxPM10Value = pm10ValuesRDD.top(1)[0]

In [None]:
# Print the maximum PM10 value computed in the previous cell on the standard
# output of the Driver program/notebook
print(maxPM10Value)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## TakeOrdered + filter
Extract all the records whose PM10 value is equal to the highest PM10 value

In [None]:
# Input location and output folder for this section
inputPath, outputPath = "../", "../"

# Load the readings as an RDD of strings, one element per line of text
readingsRDD = sc.textFile(inputPath)

### Map Transformation

In [None]:
# Parse every reading: split the comma-separated line and convert its
# third field (index 2), the PM10 value, to float
pm10ValuesRDD = readingsRDD.map(
    lambda reading: float(reading.split(",")[2])
)

### takeOrdered Action

In [None]:
# takeOrdered returns the smallest elements first, so the values are ordered
# by their negation: the first (and only) element taken is the maximum
maxPM10Value = pm10ValuesRDD.takeOrdered(1, key=lambda value: -value)[0]

### Filter Transformation

In [None]:
# Go back to the original lines of readingsRDD and keep only the record(s)
# whose PM10 field (third field) is equal to maxPM10Value
selectedRecordsRDD = readingsRDD.filter(
    lambda reading: float(reading.split(",")[2]) == maxPM10Value
)

### saveAsTextFile Action

In [None]:
# Store the selected records as text files in the output folder
selectedRecordsRDD.saveAsTextFile(outputPath)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## Extract dates corresponding to the highest pm10 value

In [None]:
# Input location and output folder for this section
inputPath, outputPath = "../", "../"

# Load the readings as an RDD of strings, one element per line of text
readingsRDD = sc.textFile(inputPath)

In [None]:
# Parse every reading: split the comma-separated line and convert its
# third field (index 2), the PM10 value, to float
pm10ValuesRDD = readingsRDD.map(
    lambda reading: float(reading.split(",")[2])
)

In [None]:
# Reduce the PM10 values pairwise, keeping the larger of the two at each step,
# so that the final result is the maximum PM10 value
maxPM10Value = pm10ValuesRDD.reduce(
    lambda left, right: max(left, right)
)

In [None]:
# Go back to the original lines of readingsRDD and keep only the record(s)
# whose PM10 field (third field) is equal to maxPM10Value
selectedRecordsRDD = readingsRDD.filter(
    lambda reading: float(reading.split(",")[2]) == maxPM10Value
)

In [None]:
# Keep only the date of each selected record: it is the second
# comma-separated field (index 1) of the line
datesRDD = selectedRecordsRDD.map(
    lambda reading: reading.split(",")[1]
)

In [None]:
# Remove duplicates, if any: distinct() returns a new RDD in which each date
# of datesRDD appears only once
distinctDatesRDD = datesRDD.distinct()

In [None]:
# Store the distinct dates as text files in the output folder
distinctDatesRDD.saveAsTextFile(outputPath)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## Compute the average pm10 value

In [None]:
# Input location and output folder for this section
inputPath, outputPath = "../", "../"

# Load the readings as an RDD of strings, one element per line of text
readingsRDD = sc.textFile(inputPath)

In [None]:
# Turn every reading into a pair (PM10 value, 1):
# the PM10 value is the third comma-separated field of the line, and the
# constant 1 is used later to count the number of readings
pm10ValuesRDD = readingsRDD.map(
    lambda reading: (float(reading.split(",")[2]), 1)
)

In [None]:
# Add the pairs component by component with the reduce action: the result is
# (sum of the PM10 values, number of input lines), the latter being the sum of ones
sumPM10ValuesCountLines = pm10ValuesRDD.reduce(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

In [None]:
# Compute and print the average PM10 value on the driver
# sumPM10ValuesCountLines[0] is equal to the sum of the input PM10 values
# sumPM10ValuesCountLines[1] is equal to the number of input lines/input values
# average = sum of the values / number of values
print("Average=", sumPM10ValuesCountLines[0]/sumPM10ValuesCountLines[1])

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++