# Umgang mit Pair-RDDs

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("pair-rdd").master("local[4]").getOrCreate()
sc = spark.sparkContext
spark

23/09/08 17:23:36 WARN Utils: Your hostname, pupil-a resolves to a loopback address: 127.0.1.1; using 167.235.141.210 instead (on interface eth0)
23/09/08 17:23:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/08 17:23:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/08 17:23:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# bekannt und gut
numbersRDD = sc.parallelize( [1, 2, 3, 4] )

In [5]:
numbersRDD.collect()

[1, 2, 3, 4]

In [6]:
import math
sqrt_rdd = numbersRDD.map(lambda x: math.sqrt(x))
sqrt_rdd.take(10)

                                                                                

[1.0, 1.4142135623730951, 1.7320508075688772, 2.0]

**Uncool!** jetzt sind es nur komische doubles

In [8]:
numbersWithSquareRoot = numbersRDD.map(
    lambda x : (x, math.sqrt(x)))
numbersWithSquareRoot.collect()

                                                                                

[(1, 1.0), (2, 1.4142135623730951), (3, 1.7320508075688772), (4, 2.0)]

## Aufgabe
1. Lade TaxiZones.csv
1. Splitte die Zeilen an Kommas
1. Erzeuge einen PairRDD der die Location Id als Key und die gesamte Zeile als Value enthält.

## Group By und aufsummieren

Nun gruppieren wir nach dem Bezirk (2. Spalte) und schauen wie viele Zonen in jedem Bezirk sind 

In [15]:
taxi_zones_splitted_rdd = sc.textFile("TaxiZones.csv", 4).map(lambda x: x.split(","))
taxi_zones_splitted_rdd.take(3)

[['1', 'EWR', 'Newark Airport', 'EWR'],
 ['2', 'Queens', 'Jamaica Bay', 'Boro Zone'],
 ['3', 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone']]

In [23]:
taxi_zones_by_bezirk = taxi_zones_splitted_rdd.map(lambda row: (row[1],1))
taxi_zones_by_bezirk.take(10)

[('EWR', 1),
 ('Queens', 1),
 ('Bronx', 1),
 ('Manhattan', 1),
 ('Staten Island', 1),
 ('Staten Island', 1),
 ('Queens', 1),
 ('Queens', 1),
 ('Queens', 1),
 ('Queens', 1)]

In [25]:
taxi_zones_by_bezirk.reduceByKey(lambda x,y: x + y).collect()

                                                                                

[('Bronx', 43),
 ('Staten Island', 20),
 ('EWR', 1),
 ('Manhattan', 69),
 ('Brooklyn', 61),
 ('Unknown', 2),
 ('Queens', 69)]

## Nun ein wenig Standardmethoden auf Pair-RDDs

In [None]:
taxi_zones_by_bezirk.sortByKey().collect()

[('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Bronx', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn', 1),
 ('Brooklyn',

In [None]:
taxi_zones_by_bezirk.keys().collect()

['EWR',
 'Queens',
 'Bronx',
 'Manhattan',
 'Staten Island',
 'Staten Island',
 'Queens',
 'Queens',
 'Queens',
 'Queens',
 'Brooklyn',
 'Manhattan',
 'Manhattan',
 'Brooklyn',
 'Queens',
 'Queens',
 'Brooklyn',
 'Bronx',
 'Queens',
 'Bronx',
 'Brooklyn',
 'Brooklyn',
 'Staten Island',
 'Manhattan',
 'Brooklyn',
 'Brooklyn',
 'Queens',
 'Queens',
 'Brooklyn',
 'Queens',
 'Bronx',
 'Bronx',
 'Brooklyn',
 'Brooklyn',
 'Brooklyn',
 'Brooklyn',
 'Brooklyn',
 'Queens',
 'Brooklyn',
 'Brooklyn',
 'Manhattan',
 'Manhattan',
 'Manhattan',
 'Staten Island',
 'Manhattan',
 'Bronx',
 'Bronx',
 'Manhattan',
 'Brooklyn',
 'Manhattan',
 'Bronx',
 'Brooklyn',
 'Queens',
 'Brooklyn',
 'Brooklyn',
 'Queens',
 'Queens',
 'Bronx',
 'Bronx',
 'Bronx',
 'Brooklyn',
 'Brooklyn',
 'Brooklyn',
 'Queens',
 'Brooklyn',
 'Brooklyn',
 'Brooklyn',
 'Manhattan',
 'Bronx',
 'Queens',
 'Brooklyn',
 'Brooklyn',
 'Queens',
 'Manhattan',
 'Manhattan',
 'Brooklyn',
 'Brooklyn',
 'Bronx',
 'Manhattan',
 'Brooklyn',
 'Bron

In [None]:
taxi_zones_by_bezirk.keys().distinct().collect()

['Bronx', 'Staten Island', 'EWR', 'Manhattan', 'Brooklyn', 'Unknown', 'Queens']

In [None]:
taxi_zones_by_bezirk.distinct().collect()

[('Bronx', 1),
 ('Staten Island', 1),
 ('Queens', 1),
 ('EWR', 1),
 ('Manhattan', 1),
 ('Brooklyn', 1),
 ('Unknown', 1)]