### Create Spark Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

In [2]:
pyspark = SparkSession.builder \
.master("local[4]") \
.appName("Create a RDD") \
.config("spark.executor.memory", "4g") \
.config("spark.driver.memory", "2g") \
.getOrCreate()

23/04/14 19:15:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
sc = pyspark.sparkContext
sc.setLogLevel("Error")

In [9]:
ages = [("Batuhan", 26), ("Mert", 30), ("Damla", 15)]
ages_rdd = sc.parallelize(ages)
ages_rdd.filter(lambda kv: kv[1] < 30).take(3)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

[('Batuhan', 26), ('Damla', 15)]

In [12]:
ages_rdd.filter(lambda kv: kv[0] == "Batuhan").take(3)

[('Batuhan', 26)]

In [16]:
ex_list = [(1,2),(3,4),(7,8), (3,8), (1,4)]
rdd = sc.parallelize(ex_list)
rdd.reduceByKey(lambda x,y: x + y).take(3)

[(1, 6), (3, 12), (7, 8)]

In [18]:
rdd.groupByKey().take(5)

[(1, <pyspark.resultiterable.ResultIterable at 0x7f479e6ea2f0>),
 (3, <pyspark.resultiterable.ResultIterable at 0x7f479e6eada0>),
 (7, <pyspark.resultiterable.ResultIterable at 0x7f479e6e8100>)]

In [27]:
rdd.combineByKey(
    lambda value: (value, 1),
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])
).collect()


[(1, (6, 2)), (3, (12, 2)), (7, (8, 1))]

In [20]:
rdd.mapValues(lambda x: x*100).collect()

[(1, 200), (3, 400), (7, 800), (3, 800), (1, 400)]

In [21]:
rdd.keys().collect()

[1, 3, 7, 3, 1]

In [22]:
rdd.values().collect()

[2, 4, 8, 8, 4]

In [23]:
rdd.sortByKey().collect()

[(1, 2), (1, 4), (3, 4), (3, 8), (7, 8)]

In [28]:
rdd2 = sc.parallelize([(3,8)])
rdd.subtractByKey(rdd2).collect()

[(1, 2), (1, 4), (7, 8)]

In [29]:
rdd.join(rdd2).collect()

[(3, (4, 8)), (3, (8, 8))]

In [30]:
rdd.rightOuterJoin(rdd2).collect()

[(3, (4, 8)), (3, (8, 8))]

In [31]:
rdd.leftOuterJoin(rdd2).collect()

[(1, (2, None)), (1, (4, None)), (3, (4, 8)), (3, (8, 8)), (7, (8, None))]

In [32]:
rdd.cogroup(rdd2).collect()

[(1,
  (<pyspark.resultiterable.ResultIterable at 0x7f479e447bb0>,
   <pyspark.resultiterable.ResultIterable at 0x7f479e445600>)),
 (3,
  (<pyspark.resultiterable.ResultIterable at 0x7f479e445300>,
   <pyspark.resultiterable.ResultIterable at 0x7f479e4455d0>)),
 (7,
  (<pyspark.resultiterable.ResultIterable at 0x7f479e447760>,
   <pyspark.resultiterable.ResultIterable at 0x7f479e445750>))]