Pyspark

In [6]:
import json
import shutil

In [3]:
import pyspark

In [4]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build the Spark configuration and obtain the context/session.
# getOrCreate() reuses an already-running SparkContext/SparkSession, so this cell can be
# re-run safely; constructing SparkContext directly a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
from pyspark.sql import functions as F

Wordcount

In [3]:
# Read the input file and break every line into words.
# str.split() without an argument splits on any run of whitespace and never returns
# empty strings, so repeated spaces, tabs and blank lines are not counted as a bogus
# '' word (which line.split(" ") would produce).
words = sc.textFile("data/wordcount.txt").flatMap(lambda line: line.split())

In [4]:
# Pair every word with a count of 1, then add the counts up per distinct word.
wordCounts = (
    words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [5]:
# saveAsTextFile refuses to write into a directory that already exists, which makes this
# cell fail on every run after the first. Remove the previous run's output beforehand;
# ignore_errors=True makes the very first run (no directory yet) a no-op.
shutil.rmtree("data/output/", ignore_errors=True)
wordCounts.saveAsTextFile("data/output/")

Joins

In [6]:
# Two tiny keyed datasets used by the join examples below.
# They share the key 'abc'; 'def' exists only in the first, 'xyz' only in the second.
dataset1 = [
    {'key': 'abc', 'val11': 1.1, 'val12': 1.2},
    {'key': 'def', 'val11': 3.0, 'val12': 3.4},
]
dataset2 = [
    {'key': 'abc', 'val21': 2.1, 'val22': 2.2},
    {'key': 'xyz', 'val21': 3.1, 'val22': 3.2},
]

# Distribute both datasets as RDDs and turn each RDD into a DataFrame.
rdd1 = sc.parallelize(dataset1)
rdd2 = sc.parallelize(dataset2)
df1 = spark.createDataFrame(rdd1)
df2 = spark.createDataFrame(rdd2)

# Display both frames, each preceded by its name.
print('df1')
df1.show()
print('df2')
df2.show()



df1
+---+-----+-----+
|key|val11|val12|
+---+-----+-----+
|abc|  1.1|  1.2|
|def|  3.0|  3.4|
+---+-----+-----+

df2
+---+-----+-----+
|key|val21|val22|
+---+-----+-----+
|abc|  2.1|  2.2|
|xyz|  3.1|  3.2|
+---+-----+-----+



In [7]:
# Inner join on the shared 'key' column: only keys present in both frames are kept.
df = df1.join(df2, on='key', how='inner')
df.show()

+---+-----+-----+-----+-----+
|key|val11|val12|val21|val22|
+---+-----+-----+-----+-----+
|abc|  1.1|  1.2|  2.1|  2.2|
+---+-----+-----+-----+-----+



In [None]:
# Full outer join on 'key': keys from either frame are kept, missing values become null.
df = df1.join(df2, on='key', how='outer')
df.show()

In [None]:
# Left join on 'key': every row of df1 is kept, df2 columns are null where no match exists.
df = df1.join(df2, on='key', how='left')
df.show()

In [None]:
# Right join on 'key': every row of df2 is kept, df1 columns are null where no match exists.
df = df1.join(df2, on='key', how='right')
df.show()

In [None]:
# Left semi join on 'key': rows of df1 that have a match in df2, with df1's columns only.
df = df1.join(df2, on='key', how='left_semi')
df.show()

In [None]:
# Left anti join on 'key': rows of df1 that have NO match in df2, with df1's columns only.
df = df1.join(df2, on='key', how='left_anti')
df.show()

In [None]:
# Inner joins driven by explicit Column expressions instead of a shared column name.
# Each printed description states exactly the condition that the following join uses.

# Equality on the key columns.
print('Inner join with condition df1.key == df2.key')
df  = df1.join(df2, df1.key == df2.key, how='inner')
df.show()

# Non-equi join: string comparison of the keys.
print('Inner join with condition df1.key > df2.key')
df  = df1.join(df2, df1.key > df2.key, how='inner')
df.show()

# A list of conditions is combined with AND.
print('Inner join with multiple conditions [df1.val11 < df2.val21, df1.val12 < df2.val22]')
df  = df1.join(df2, [df1.val11 < df2.val21, df1.val12 < df2.val22], how='inner')
df.show()

# Conditions combined with OR. The description previously showed the comparison
# operators reversed relative to the expression that is actually evaluated.
print('Inner join with multiple or conditions (df1.val11 < df2.val21) | (df1.val12 > df2.val22)')
df  = df1.join(df2, [(df1.val11 < df2.val21) | (df1.val12 > df2.val22)], how='inner')
df.show()

Map vs Flatmap

In [8]:
# Basic map / flatMap comparison on an RDD of two sentences spread over 2 partitions.
x = sc.parallelize(["spark rdd example", "sample example"], 2)

# map keeps one output element per input element, so splitting each sentence
# gives a list of word lists.
y = x.map(lambda sentence: sentence.split(' '))
print(y.collect())
# [['spark', 'rdd', 'example'], ['sample', 'example']]

# flatMap flattens the per-sentence lists into a single sequence of words.
y = x.flatMap(lambda sentence: sentence.split(' '))
print(y.collect())
# ['spark', 'rdd', 'example', 'sample', 'example']

[['spark', 'rdd', 'example'], ['sample', 'example']]
['spark', 'rdd', 'example', 'sample', 'example']


Filter

In [None]:
# Numbers 1..10 spread over 2 partitions.
x = sc.parallelize([1,2,3,4,5,6,7,8,9,10], 2)

# filter keeps only the elements for which the predicate is true; here the even numbers.
# The predicate previously tested `% 5 == 0`, which yields [5, 10] and contradicted the
# documented result below.
y = x.filter(lambda n: n % 2 == 0)
y.collect()
# [2, 4, 6, 8, 10]

Reduce

In [None]:
def cumulativeSum(accum, n):
    """Return the running total ``accum`` combined with the next element ``n``."""
    return accum + n


# Numbers 1..10 spread over 2 partitions.
x = sc.parallelize(list(range(1, 11)), 2)

# Sum of all elements, with an inline lambda as the reducer.
cSum = x.reduce(lambda total, value: total + value)
print(cSum)
# 55

# Product of all elements.
cMul = x.reduce(lambda product, value: product * value)
print(cMul)
# 3628800

# Same sum again, this time passing the named function as the reducer.
cSum = x.reduce(cumulativeSum)
print(cSum)
# 55

Reduce by key

In [None]:
# Basic reduceByKey example.
# Build a pair RDD x of (key, 1) tuples over 3 partitions: three "a" and five "b".
x = sc.parallelize(
    [
        ("a", 1), ("b", 1), ("a", 1), ("a", 1),
        ("b", 1), ("b", 1), ("b", 1), ("b", 1),
    ],
    3,
)
 


In [None]:
# Add up the values of x per key.
y = x.reduceByKey(lambda total, value: total + value)
print(y.collect())
# [('b', 5), ('a', 3)]

In [9]:
## Basic groupByKey example: (country, value) pairs spread over 3 partitions
x = sc.parallelize(
    [
        ("USA", 1), ("USA", 2), ("India", 1),
        ("UK", 1), ("India", 4), ("India", 9),
        ("USA", 8), ("USA", 3), ("India", 4),
        ("UK", 6), ("UK", 9), ("UK", 5),
    ],
    3,
)

## Group without specifying a partition count, then report how many partitions resulted
y = x.groupByKey()
print('Output: ', y.getNumPartitions())
## Output: 3

## Group again, this time requesting 2 partitions explicitly
y = x.groupByKey(2)
print('Output: ', y.getNumPartitions())
## Output: 2

## Print every key together with the list of its grouped values
for country, values in y.collect():
    print(country, list(values))

## USA [1, 2, 8, 3]
## India [1, 4, 9, 4]
## UK [1, 6, 9, 5]

Output:  3
Output:  2
USA [1, 2, 8, 3]
India [1, 4, 9, 4]
UK [1, 6, 9, 5]


### Let's load the ecommerce data and transform it into an RDD
#### the file contains JSON objects of the form

```
{
  "": "9",
  "Clothing ID": "1077",
  "Age": "34",
  "Title": "Such a fun dress!",
  "Review Text": "I'm 5\"5' and 125 lbs. i ordered the s petite to make sure the length wasn't too long. i typically wear an xs regular in retailer dresses. if you're less busty (34b cup or smaller), a s petite will fit you perfectly (snug, but not tight). i love that i could dress it up for a party, or down for work. i love that the tulle is longer then the fabric underneath.",
  "Rating": "5",
  "Recommended IND": "1",
  "Positive Feedback Count": "0",
  "Division Name": "General",
  "Department Name": "Dresses",
  "Class Name": "Dresses"
}
```

In [5]:
# Load the raw e-commerce records as an RDD of text lines.
# NOTE(review): "./data/" is a directory, and earlier cells of this notebook read
# data/wordcount.txt from it and write data/output/ into it -- presumably this should
# point at the e-commerce JSON file itself; confirm the file name.
ecommerce_data = sc.textFile("./data/")

In [None]:
### First 

In [8]:
# Parse every text line of the RDD as a JSON document.
# NOTE(review): assumes each line holds one complete JSON object -- confirm the file layout.
ecommerce_json = ecommerce_data.map(lambda raw_line: json.loads(raw_line))