In [13]:
# findspark.init() is called before the pyspark imports below; it is meant to
# make the local Spark installation importable from this kernel.
import findspark
findspark.init()

# NOTE(review): SparkConf and SparkContext are imported but not referenced in the
# cells visible here; the context is obtained from the session instead.
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

In [15]:
# Build (or reuse, via getOrCreate) a SparkSession on the "local[4]" master.
# NOTE: the session object is bound to the name `pyspark`; the name is kept
# unchanged because other cells refer to it.
pyspark = (
    SparkSession.builder
    .master("local[4]")
    .appName("RDD-Olusturmak")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# SparkContext of the session; entry point for the RDD calls in the cells below.
sc = pyspark.sparkContext

In [27]:
# Location of the input CSV (a relative path; presumably resolved against the
# kernel's working directory -- TODO confirm).
path = "sources/simple_data.csv"

In [28]:
# Read the CSV as an RDD of text lines; each element is one raw line of the file.
people = sc.textFile(path)

In [29]:
# Preview the first five raw lines (the header row is still present at this point).
people.take(5)

['sirano,isim,yas,meslek,sehir,aylik_gelir',
 '1,Cemal,35,Isci,Ankara,3500',
 '2,Ceyda,42,Memur,Kayseri,4200',
 '3,Timur,30,Müzisyen,Istanbul,9000',
 '4,Burcu,29,Pazarlamaci,Ankara,4200']

In [31]:
# Drop the CSV header row.
# The header is recognised by its leading "sirano," column name instead of a
# substring search over the whole line, so a data row that merely contains the
# text "sirano" in one of its fields is not discarded by mistake. The filter is
# idempotent: re-running this cell on the already-filtered RDD changes nothing.
people = people.filter(lambda line: not line.startswith("sirano,"))
people.take(5)

['1,Cemal,35,Isci,Ankara,3500',
 '2,Ceyda,42,Memur,Kayseri,4200',
 '3,Timur,30,Müzisyen,Istanbul,9000',
 '4,Burcu,29,Pazarlamaci,Ankara,4200',
 '5,Yasemin,23,Pazarlamaci,Bursa,4800']

In [41]:
# Applied to every element of the people RDD; each element is one comma-separated record.
def get_job_salary(line):
    """Extract the (job, salary) pair from one comma-separated record.

    Args:
        line: A record string whose 4th field is the job ("meslek") and whose
            6th field is the monthly income ("aylik_gelir").

    Returns:
        A ``(job, salary)`` tuple; the salary is converted to ``float``.

    Raises:
        IndexError: If the line has fewer than six comma-separated fields.
        ValueError: If the salary field cannot be parsed as a float.
    """
    fields = line.split(",")
    return fields[3], float(fields[5])

In [42]:
# Convert every record line into a (job, salary) pair.
people_job_salary = people.map(get_job_salary)

In [43]:
# Preview the first five (job, salary) pairs.
people_job_salary.take(5)

[('Isci', 3500.0),
 ('Memur', 4200.0),
 ('Müzisyen', 9000.0),
 ('Pazarlamaci', 4200.0),
 ('Pazarlamaci', 4800.0)]

In [53]:
# mapValues transforms only the value of each (key, value) pair, so the job stays
# the key and every salary becomes (salary, 1): a running total plus a count of one.
# A plain map would receive the whole pair and would have to rebuild the key by hand.
people_job_salary_tuple = people_job_salary.mapValues(
    lambda salary: (salary, 1)
)
people_job_salary_tuple.take(5)

[('Isci', (3500.0, 1)),
 ('Memur', (4200.0, 1)),
 ('Müzisyen', (9000.0, 1)),
 ('Pazarlamaci', (4200.0, 1)),
 ('Pazarlamaci', (4800.0, 1))]

In [54]:
# Per job, add up the salaries and the counts element-wise:
# (total_a, count_a) combined with (total_b, count_b) -> (total_a + total_b, count_a + count_b).
# The name is rebound to the reduced RDD of (job, (salary_total, count)) pairs.
people_job_salary_tuple = people_job_salary_tuple.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
people_job_salary_tuple.take(5)

[('Memur', (12200.0, 3)),
 ('Pazarlamaci', (16300.0, 3)),
 ('Tuhafiyeci', (4800.0, 1)),
 ('Tornacı', (4200.0, 1)),
 ('Isci', (3500.0, 1))]

In [55]:
# Average, method 1: mapValues divides each job's salary total by its count,
# leaving the job key untouched.
average1 = people_job_salary_tuple.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
average1.take(8)

[('Memur', 4066.6666666666665),
 ('Pazarlamaci', 5433.333333333333),
 ('Tuhafiyeci', 4800.0),
 ('Tornacı', 4200.0),
 ('Isci', 3500.0),
 ('Müzisyen', 9900.0),
 ('Doktor', 16125.0),
 ('Berber', 12000.0)]

In [58]:
# Average, method 2: the same computation with a plain map over the whole
# (job, (total, count)) pair, rebuilding the (job, average) pair explicitly.
average2 = people_job_salary_tuple.map(
    lambda pair: (pair[0], pair[1][0] / pair[1][1])
)
average2.take(8)

[('Memur', 4066.6666666666665),
 ('Pazarlamaci', 5433.333333333333),
 ('Tuhafiyeci', 4800.0),
 ('Tornacı', 4200.0),
 ('Isci', 3500.0),
 ('Müzisyen', 9900.0),
 ('Doktor', 16125.0),
 ('Berber', 12000.0)]