In [1]:
# findspark puts the PySpark libraries of a local Spark installation on
# sys.path so that `import pyspark` works inside this kernel.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to
# one machine's directory layout; fall back to the path this notebook was
# originally written against when the variable is not set.
findspark.init(os.environ.get("SPARK_HOME", "/usr/local/spark/"))

In [2]:
# SparkSession comes from the pyspark.sql package.
from pyspark.sql import SparkSession

# getOrCreate() hands back the SparkSession that is already active, or builds
# a new one from the builder options below when none exists yet.
# NOTE(review): with master("local") the work presumably runs inside the
# driver process, so "spark.executor.memory" may have no effect here - confirm.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Employees_Homework")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# The SparkContext behind the session; the later cells use it to create RDDs.
sc = spark.sparkContext

In [3]:
def parseLine(line):
    """Turn one comma-separated record into a ``(name, salary)`` tuple.

    The line is split on every ``,`` (no quoting or escaping is understood).
    The name is the first field; the salary is the fifth field (index 4)
    converted with ``int``.

    Raises:
        IndexError: if the line has fewer than five fields.
        ValueError: if the fifth field is not a valid integer.
    """
    columns = line.split(',')
    salary_text = columns[4]
    return (str(columns[0]), int(salary_text))

In [4]:
# Read the CSV as an RDD of text lines.
data = sc.textFile("./data/employees.csv")
# The first line of the file is the column header.
header = data.first()
# Drop every line equal to the header with a narrow filter. The previous
# parallelize([header]) + subtract() gave the same set of lines but shuffled
# the whole dataset to remove one known value, and rebound `header` from a
# string to an RDD. The default argument freezes the header text into the
# lambda, so rebinding `header` later cannot change what gets filtered.
lines = data.filter(lambda row, header_line=header: row != header_line)
# (name, salary) pairs, one per remaining record.
rdd = lines.map(parseLine)

In [5]:
# Headcount per department: the department is read from the eighth
# comma-separated field (index 7) of every record.
department = lines.map(lambda x: x.split(",")[7])
result = dict(department.countByValue())
print("Number of employees in each department", result)

# Lowest- and highest-paid employees as (name, salary) pairs. RDD.min/max with
# a key do a single reduce each, instead of fully sorting the dataset twice
# just to read the first element. When several employees share the extreme
# salary, which of them is returned is not guaranteed to match the sort.
min_salary = rdd.min(key=lambda x: x[1])
max_salary = rdd.max(key=lambda x: x[1])

# Reuse the salaries already parsed by parseLine rather than re-splitting the
# raw lines with a second copy of the column index.
salary = rdd.map(lambda x: x[1])
# Integer sum and count keep the mean identical to the plain sum / count.
sum_salary = salary.sum()
count = salary.count()
mean_salary = sum_salary / count

print("Minimum Salary:", min_salary)
print("Maximum Salary:", max_salary)
print("Mean Salary:", mean_salary)

Number of employees in each department {'Marketing': 74, 'Finance': 80, 'Product': 83, 'Engineering': 79, 'Business Development': 88, 'Legal': 68, 'Human Resources': 76, 'Sales': 72, 'Client Services': 85, 'Distribution': 60}
Minimum Salary: ('Michael', 35013)
Maximum Salary: ('Katherine', 149908)
Mean Salary: 90444.1477124183
