# Jobs, Stages and Tasks

## Setting up Spark

In [1]:
from pyspark.sql import SparkSession

In [None]:
# Stop a session left over from a previous run of this notebook so the next
# cell starts from a clean state. On a fresh kernel `spark` does not exist
# yet, so there is nothing to stop and the NameError is deliberately ignored.
try:
    spark.stop()
except NameError:
    pass

In [2]:
# Build (or reuse) the session for this notebook: application name
# "jobs-stages-tasks", master "local[4]".
spark = (
    SparkSession.builder
    .appName("jobs-stages-tasks")
    .master("local[4]")
    .getOrCreate()
)

23/08/21 05:50:57 WARN Utils: Your hostname, pupil-a resolves to a loopback address: 127.0.1.1; using 167.235.141.210 instead (on interface eth0)
23/08/21 05:50:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 05:50:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/21 05:51:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Bare expression as the last line of the cell: the notebook displays the session object.
spark

In [4]:
# Keep a handle on the session's SparkContext; the RDD examples below read files via sc.textFile.
sc = spark.sparkContext

## Reading the file

In [33]:
# Read the CSV as an RDD of raw text lines, passing 4 as the partition count.
# The lines are not split into columns here, so the RDD gets the plain
# `zones_rdd` name that the later cells also use for the unsplit lines.
zones_rdd = sc.textFile("TaxiZones.csv", 4)

# Fetch the first 10 lines for inspection
zones_rdd.take(10)

['1,EWR,Newark Airport,EWR',
 '2,Queens,Jamaica Bay,Boro Zone',
 '3,Bronx,Allerton/Pelham Gardens,Boro Zone',
 '4,Manhattan,Alphabet City,Yellow Zone',
 '5,Staten Island,Arden Heights,Boro Zone',
 '6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone',
 '7,Queens,Astoria,Boro Zone',
 '8,Queens,Astoria Park,Boro Zone',
 '9,Queens,Auburndale,Boro Zone',
 '10,Queens,Baisley Park,Boro Zone']

Überlege, wie viele der folgenden Elemente erzeugt wurden:

 * Jobs
 * Transformationen
 * Actions
 * Stages

## Reading and splitting

In [34]:
# Load the file as text lines, passing 4 as the partition count
zones_rdd = sc.textFile("TaxiZones.csv", minPartitions=4)

# Turn every line into the list of its comma-separated fields
zones_with_cols_rdd = zones_rdd.map(lambda line: line.split(","))

# Show the first 10 split rows
zones_with_cols_rdd.take(10)

[['1', 'EWR', 'Newark Airport', 'EWR'],
 ['2', 'Queens', 'Jamaica Bay', 'Boro Zone'],
 ['3', 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone'],
 ['4', 'Manhattan', 'Alphabet City', 'Yellow Zone'],
 ['5', 'Staten Island', 'Arden Heights', 'Boro Zone'],
 ['6', 'Staten Island', 'Arrochar/Fort Wadsworth', 'Boro Zone'],
 ['7', 'Queens', 'Astoria', 'Boro Zone'],
 ['8', 'Queens', 'Astoria Park', 'Boro Zone'],
 ['9', 'Queens', 'Auburndale', 'Boro Zone'],
 ['10', 'Queens', 'Baisley Park', 'Boro Zone']]

In [39]:
# Did the number of partitions change over the course of the transformations?
# Compare the RDD straight after reading with the RDD after the map.
print(f"After reading the file: {zones_rdd.getNumPartitions()}")
print(f"After applying map: {zones_with_cols_rdd.getNumPartitions()}")

After reading the file: 4
After applying map: 4


## Reading and creating a pair RDD

In [50]:
# Read the raw lines, passing 4 as the partition count
zones_rdd = sc.textFile("TaxiZones.csv", 4)
# Split each line on "," into a list of column values
zones_with_cols_rdd = zones_rdd.map(lambda x: x.split(","))

# Pair RDD keyed by borough with a constant 1 as value, as preparation for
# counting records per borough. The borough is the second column (index 1);
# index 0 holds the zone id, which is not what we want to group by.
zones_pair_rdd = zones_with_cols_rdd.map(lambda x: (x[1], 1))

# Three separate count() calls, one per RDD in the lineage
print(zones_rdd.count())
print(zones_with_cols_rdd.count())
print(zones_pair_rdd.count())

# Finally show the first 10 split rows
zones_with_cols_rdd.take(10)

                                                                                

265
265
265


[['1', 'EWR', 'Newark Airport', 'EWR'],
 ['2', 'Queens', 'Jamaica Bay', 'Boro Zone'],
 ['3', 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone'],
 ['4', 'Manhattan', 'Alphabet City', 'Yellow Zone'],
 ['5', 'Staten Island', 'Arden Heights', 'Boro Zone'],
 ['6', 'Staten Island', 'Arrochar/Fort Wadsworth', 'Boro Zone'],
 ['7', 'Queens', 'Astoria', 'Boro Zone'],
 ['8', 'Queens', 'Astoria Park', 'Boro Zone'],
 ['9', 'Queens', 'Auburndale', 'Boro Zone'],
 ['10', 'Queens', 'Baisley Park', 'Boro Zone']]

### Now also finding distinct records

In [56]:
# Load the raw lines, passing 4 as the partition count
zones_rdd = sc.textFile("TaxiZones.csv", 4)

# One list of column values per line
zones_with_cols_rdd = zones_rdd.map(lambda row: row.split(","))

# Key every record by its borough (column index 1) with a constant value of 1
zones_pair_rdd = zones_with_cols_rdd.map(lambda cols: (cols[1], 1))

# Keep each (borough, 1) pair only once
distinct_zones_pair_rdd = zones_pair_rdd.distinct()

# Record counts of the three RDDs built before distinct()
print(zones_rdd.count())
print(zones_with_cols_rdd.count())
print(zones_pair_rdd.count())

# Bring all distinct pairs back to the driver
distinct_zones_pair_rdd.collect()

                                                                                

265


                                                                                

265
265


                                                                                

[('Bronx', 1),
 ('Staten Island', 1),
 ('Queens', 1),
 ('EWR', 1),
 ('Manhattan', 1),
 ('Brooklyn', 1),
 ('Unknown', 1)]

In [66]:
# Load the raw lines, passing 4 as the partition count
zones_rdd = sc.textFile("TaxiZones.csv", 4)

# One list of column values per line
zones_with_cols_rdd = zones_rdd.map(lambda row: row.split(","))

# Key every record by its borough (column index 1) with a constant value of 1
zones_pair_rdd = zones_with_cols_rdd.map(lambda cols: (cols[1], 1))

# Add up the 1s per borough key to get the number of records per borough
counted = zones_pair_rdd.reduceByKey(lambda left, right: left + right)

# Show up to 10 (borough, count) pairs
counted.take(10)

                                                                                

[('Bronx', 43),
 ('Staten Island', 20),
 ('EWR', 1),
 ('Manhattan', 69),
 ('Brooklyn', 61),
 ('Unknown', 2),
 ('Queens', 69)]

In [67]:
# Partition count of the pair RDD, i.e. the input to reduceByKey (not of the `counted` result).
zones_pair_rdd.getNumPartitions()

4