### Mesleğe Göre Ortalama Maaş Bulma

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext
import os

In [2]:
# Build (or reuse) a Spark session running in local mode.
# NOTE: getOrCreate() returns an already-running session when one exists; the
# warning captured below this cell says that in that case only runtime SQL
# configurations take effect.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .appName("averageSallary")
    .getOrCreate()
)

23/04/15 13:44:18 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Restrict Spark's log output to ERROR level, then keep a handle to the
# underlying SparkContext for the RDD API used in the following cells.
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

In [5]:
# Load the CSV located in the current working directory as an RDD of raw
# text lines, then preview the first five lines.
file = os.getcwd() + "/simple_data.csv"
raw_dataRDD = sc.textFile(file)
raw_dataRDD.take(5)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

['sirano,isim,yas,meslek,sehir,aylik_gelir',
 '1,Cemal,35,Isci,Ankara,3500',
 '2,Ceyda,42,Memur,Kayseri,4200',
 '3,Timur,30,Müzisyen,Istanbul,9000',
 '4,Burcu,29,Pazarlamaci,Ankara,4200']

In [8]:
# Drop the CSV header by comparing every line with the file's actual first
# line. A substring test such as `"sirano" in line` would also discard any
# data row that merely contains that text somewhere in one of its fields.
# take(1) returns an empty list for an empty RDD, so an empty input simply
# yields an empty result instead of raising.
header_rows = raw_dataRDD.take(1)
raw_dataRDD_without_header = raw_dataRDD.filter(
    lambda row: row not in header_rows
)
raw_dataRDD_without_header.take(10)

['1,Cemal,35,Isci,Ankara,3500',
 '2,Ceyda,42,Memur,Kayseri,4200',
 '3,Timur,30,Müzisyen,Istanbul,9000',
 '4,Burcu,29,Pazarlamaci,Ankara,4200',
 '5,Yasemin,23,Pazarlamaci,Bursa,4800',
 '6,Ali,33,Memur,Ankara,4250',
 '7,Dilek,29,Pazarlamaci,Istanbul,7300',
 '8,Murat,31,Müzisyen,Istanbul,12000',
 '9,Ahmet,33,Doktor,Ankara,18000',
 '10,Muhittin,46,Berber,Istanbul,12000']

In [16]:
def job_salary(line: str):
    """Extract the ``(job, salary)`` pair from one CSV data row.

    The row is expected to follow the column layout
    ``sirano,isim,yas,meslek,sehir,aylik_gelir``.

    Args:
        line: A single comma-separated data row (not the header).

    Returns:
        A tuple ``(job, salary)`` where ``job`` is the 4th column with
        surrounding whitespace removed and ``salary`` is the 6th column
        converted to ``float``.

    Raises:
        IndexError: If the row has fewer than six columns.
        ValueError: If the salary column cannot be converted to ``float``.
    """
    # Split once and reuse the fields instead of re-splitting per column.
    fields = line.split(",")
    # Strip the job so that " Memur" and "Memur" end up under the same key
    # when the pairs are grouped by job.
    job = fields[3].strip()
    salary = float(fields[5])

    return (job, salary)

In [17]:
# Turn every header-free data row into a (job, salary) pair via job_salary,
# then preview the first five pairs.
job_salaryRDD = raw_dataRDD_without_header.map(job_salary)
job_salaryRDD.take(5)

[('Isci', 3500.0),
 ('Memur', 4200.0),
 ('Müzisyen', 9000.0),
 ('Pazarlamaci', 4200.0),
 ('Pazarlamaci', 4800.0)]

In [20]:
# Pair every salary with a count of 1 so that totals and counts can be
# accumulated together per job.
job_salaryRDD2 = job_salaryRDD.mapValues(lambda salary: (salary, 1))

In [21]:
# Preview the first five (job, (salary, 1)) records.
job_salaryRDD2.take(5)

[('Isci', (3500.0, 1)),
 ('Memur', (4200.0, 1)),
 ('Müzisyen', (9000.0, 1)),
 ('Pazarlamaci', (4200.0, 1)),
 ('Pazarlamaci', (4800.0, 1))]

In [24]:
# Per job, add up the salaries (index 0) and the counts (index 1).
job_salaryRDD3 = job_salaryRDD2.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

In [25]:
# Preview up to five (job, (salary_total, count)) records.
job_salaryRDD3.take(5)

[('Memur', (12200.0, 3)),
 ('Pazarlamaci', (16300.0, 3)),
 ('Tuhafiyeci', (4800.0, 1)),
 ('Tornacı', (4200.0, 1)),
 ('Isci', (3500.0, 1))]

In [26]:
# Average salary per job: accumulated total divided by accumulated count.
job_salaryRDD4 = job_salaryRDD3.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)

In [28]:
# Bring every (job, average_salary) pair back to the driver for display.
job_salaryRDD4.collect()

[('Memur', 4066.6666666666665),
 ('Pazarlamaci', 5433.333333333333),
 ('Tuhafiyeci', 4800.0),
 ('Tornacı', 4200.0),
 ('Isci', 3500.0),
 ('Müzisyen', 9900.0),
 ('Doktor', 16125.0),
 ('Berber', 12000.0)]