###################################################<br>
File: Getting Started with PySpark <br>
Desc: Introduction to PySpark<br>
Auth: Shreenidhi Bharadwaj<br>
Date: 9/29/2019<br>
ALL RIGHTS RESERVED | DO NOT DISTRIBUTE<br>
###################################################

In [3]:
# Let's ignore warnings for now
import warnings
# Install a blanket "ignore" filter so warnings raised from here on are suppressed
warnings.filterwarnings('ignore')

In [4]:
!pip install pyspark



In [5]:
# Import Libraries
import pyspark
from pyspark import SparkConf, SparkContext
import os

In [6]:
# Initialize Spark Context
# Run Spark locally under the application name "My App".
conf = SparkConf().setMaster("local").setAppName("My App")
# getOrCreate() returns the already-running context if there is one, so
# re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once". When a context already exists
# it is reused as-is and `conf` is not applied again.
sc = SparkContext.getOrCreate(conf=conf)

In [7]:
# Build an RDD from a small in-memory list (note the duplicated 3)
data = sc.parallelize([1, 2, 3, 3])
# Fetch the first 4 elements back to the driver
data.take(4)

[1, 2, 3, 3]

In [8]:
# map transformation: a lambda returns the cube of each element
data.map(lambda value: value * value * value).take(4)

[1, 8, 27, 27]

In [9]:
#use a function instead of a lambda function - we will get the square root of each element
import math
def convert(x):
    """Return the square root of ``x`` (computed with ``math.sqrt``)."""
    root = math.sqrt(x)
    return root
# Apply convert to every element of data and fetch up to 4 of the results
data.map(convert).take(4)

[1.0, 1.4142135623730951, 1.7320508075688772, 1.7320508075688772]

In [10]:
# filter keeps only the elements for which the predicate is true - here, the odd numbers
data.filter(lambda value: value % 2 != 0).take(4)  # only 3 of the elements are odd

[1, 3, 3]

In [11]:
#let us remove duplicates - distinct() keeps a single copy of each value
data.distinct().take(4)

[1, 2, 3]

In [12]:
# Take a random sample - note you may not get the same answer on every run.
# withReplacement=False means no replacement; fraction=0.2 is the probability
# of each element being chosen.
data.sample(withReplacement=False, fraction=0.2).take(4)

[1, 3]

In [13]:
from __future__ import print_function
# Two overlapping RDDs for trying out a few set-like operations
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([3, 5, 6, 7, 8])

# union concatenates both RDDs; values present in both (3 and 5) are kept twice
rdd1.union(rdd2).collect()

[1, 2, 3, 4, 5, 3, 5, 6, 7, 8]

In [14]:
#intersection - the elements present in both rdd1 and rdd2
rdd1.intersection(rdd2).collect()

[3, 5]

In [15]:
#subtract - the elements of rdd1 that do not appear in rdd2
#(the collected result may come back in a different order than the input)
rdd1.subtract(rdd2).collect()

[2, 4, 1]

In [16]:
#cartesian product - every (a, b) pair with a taken from rdd1 and b from rdd2
rdd1.cartesian(rdd2).collect()

[(1, 3),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (2, 3),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (3, 3),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (4, 3),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (5, 3),
 (5, 5),
 (5, 6),
 (5, 7),
 (5, 8)]

In [17]:
#let us look at some actions
rdd1.collect() #returns all the elements of the RDD as a Python list

[1, 2, 3, 4, 5]

In [18]:
#count() returns the number of elements in the RDD
rdd1.count()

5

In [19]:
#Let us create a union and check to see how many times each element occurs in the RDD
u = rdd1.union(rdd2)
#countByValue() returns a dict-like mapping of value -> number of occurrences
u.countByValue()

defaultdict(int, {1: 1, 2: 1, 3: 2, 4: 1, 5: 2, 6: 1, 7: 1, 8: 1})

In [20]:
#first() returns the first element of the RDD
u.first()

1

In [21]:
#take(3) returns the first 3 elements as a list
u.take(3)

[1, 2, 3]

In [22]:
#top(3) returns the 3 largest elements, in descending order
u.top(3)

[8, 7, 6]

In [23]:
# rdd1 is [1,2,3,4,5]. reduce combines the elements pairwise - here by
# multiplication, which yields their product
product = rdd1.reduce(lambda left, right: left * right)
print(product)

120


In [24]:
# Transformations on paired (key, value) RDDs.
# Each tuple is (day, sales value); a day may have several sales values and
# not all days of the week are shown.
paired_RDD = sc.parallelize([
    ("Mon", 200.00),
    ("Tue", 1215.50),
    ("Mon", 300.25),
    ("Wed", 100.00),
    ("Mon", 100.00),
])
# Total sales by day: add up the values that share a key
sales_by_day = paired_RDD.reduceByKey(lambda total, amount: total + amount)
sales_by_day.collect()

[('Mon', 600.25), ('Tue', 1215.5), ('Wed', 100.0)]

In [25]:
#let us group by key - get each key paired with an iterable of its values
grouped_RDD = paired_RDD.groupByKey()
grouped_RDD.collect() #note that each value is a ResultIterable (an iterable object, not a plain list)

[('Mon', <pyspark.resultiterable.ResultIterable at 0x1122fffd0>),
 ('Tue', <pyspark.resultiterable.ResultIterable at 0x1122ff350>),
 ('Wed', <pyspark.resultiterable.ResultIterable at 0x1122ff0d0>)]

In [26]:
#let us get a list instead of a ResultIterable - mapValues(list) converts each group's values to a list
list_RDD = paired_RDD.groupByKey().mapValues(list)
list_RDD.collect()

[('Mon', [200.0, 300.25, 100.0]), ('Tue', [1215.5]), ('Wed', [100.0])]

In [27]:
# Flatten list_RDD back into one (key, value) pair per list element,
# which recovers the pairs of the paired RDD that we started with
original_RDD = list_RDD.flatMapValues(lambda values: values)
original_RDD.collect()

[('Mon', 200.0),
 ('Mon', 300.25),
 ('Mon', 100.0),
 ('Tue', 1215.5),
 ('Wed', 100.0)]

In [28]:
# Sort our original RDD by key in descending order
# (sortByKey sorts in ascending order by default)
sorted_RDD = original_RDD.sortByKey(ascending=False)
sorted_RDD.collect()

[('Wed', 100.0),
 ('Tue', 1215.5),
 ('Mon', 200.0),
 ('Mon', 300.25),
 ('Mon', 100.0)]

In [29]:
# Sort by value: the key function picks the second item of each (key, value) pair.
# Pass ascending=False to sort in descending order instead.
sort_by_value = original_RDD.sortBy(lambda kv: kv[1], ascending=True)
sort_by_value.collect()

[('Mon', 100.0),
 ('Wed', 100.0),
 ('Mon', 200.0),
 ('Mon', 300.25),
 ('Tue', 1215.5)]

In [30]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
# Load and parse the data
def parsePoint(line):
    """Turn one ';'-separated text record into a LabeledPoint.

    Every field except the last is converted to float and used as a feature.
    The last field is used as a key into ``labels`` (defined outside this
    function, not shown here) to obtain the label.
    """
    fields = line.split(";")
    raw_label = fields[-1]
    # Convert the features first, then resolve the label
    feature_values = list(map(float, fields[:-1]))
    point_label = labels[raw_label]

    return LabeledPoint(point_label, feature_values)

# Read the file as an RDD of raw text lines (this rebinds `data` used by the earlier cells)
data = sc.textFile("../data/winequality-white.csv")
# Peek at the first 3 lines. The first line is a quoted, ';'-separated header row.
# NOTE(review): parsePoint's float() conversion would fail on that header -
# confirm it is filtered out before the lines are parsed.
data.take(3)


['"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"',
 '7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6',
 '6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6']