In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder. \
    appName("pyspark-1"). \
    getOrCreate()

### Read data

In [17]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

schema = StructType([
    StructField('Job ID', StringType(), True),
    StructField('Agency', StringType(), True),
    StructField('Posting Type', StringType(), True),
    StructField('# Of Positions', IntegerType(), True),
    StructField('Business Title', StringType(), True),
    StructField('Civil Service Title', StringType(), True),
    StructField('Title Code No', StringType(), True),
    StructField('Level', StringType(), True),
    StructField('Job Category', StringType(), True),
    StructField('Full-Time/Part-Time indicator', StringType(), True),
    StructField('Salary Range From', DoubleType(), True),
    StructField('Salary Range To', DoubleType(), True),
    StructField('Salary Frequency', StringType(), True),
    StructField('Work Location', StringType(), True),
    StructField('Division/Work Unit', StringType(), True),
    StructField('Job Description', StringType(), True),
    StructField('Minimum Qual Requirements', StringType(), True),
    StructField('Preferred Skills', StringType(), True),
    StructField('Additional Information', StringType(), True),
    StructField('To Apply', StringType(), True),
    StructField('Hours/Shift', StringType(), True),
    StructField('Work Location 1', StringType(), True),
    StructField('Recruitment Contact', StringType(), True),
    StructField('Residency Requirement', StringType(), True),
    StructField('Posting Date', StringType(), True),
    StructField('Post Until', StringType(), True),
    StructField('Posting Updated', StringType(), True),
    StructField('Process Date', StringType(), True)
])

In [19]:
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, schema=schema)
df.printSchema()

root
 |-- Job ID: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locati

### Sample function

In [28]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col,max,min

In [20]:
def get_salary_frequency(df: DataFrame) -> list:
    row_list = df.select('Salary Frequency').distinct().collect()
    return [row['Salary Frequency'] for row in row_list]

**Whats the number of jobs posting per category (Top 10)?**

In [21]:
def get_top10_jobs_posting_per_category(df: DataFrame) -> list:
    job_posting_per_category = df.groupBy('Job Category').count().orderBy(col('count').desc())
    return [(row[0], row[1]) for row in job_posting_per_category.take(10)]

In [22]:
get_top10_jobs_posting_per_category(df)

[('Engineering, Architecture, & Planning', 504),
 ('Technology, Data & Innovation', 313),
 ('Legal Affairs', 226),
 ('Public Safety, Inspections, & Enforcement', 182),
 ('Building Operations & Maintenance', 181),
 ('Finance, Accounting, & Procurement', 169),
 ('Administration & Human Resources', 134),
 ('Constituent Services & Community Programs', 129),
 ('Health', 125),
 ('Policy, Research & Analysis', 124)]

**Whats the salary distribution per job category?**

In [32]:
def get_salary_distribution_per_job_category(df: DataFrame) -> list:
    salary_distribution_per_job_category = df.groupBy(col('Job Category'))\
                                            .agg(min(col('Salary Range From')).alias('Salary Range From')\
                                                 , max(col('Salary Range To')).alias('Salary Range To'))

    return [(row[0], row[1], row[2]) for row in salary_distribution_per_job_category.collect()]

In [33]:
get_salary_distribution_per_job_category(df)

[('Administration & Human Resources Communications & Intergovernmental Affairs Engineering, Architecture, & Planning Policy, Research & Analysis',
  90000.0,
  100000.0),
 ('Health Policy, Research & Analysis Public Safety, Inspections, & Enforcement',
  82008.0,
  180000.0),
 ('Administration & Human Resources Building Operations & Maintenance Policy, Research & Analysis',
  54100.0,
  83981.0),
 ('Information Technology & Telecommunications Policy & Analysis Social Services',
  68239.0,
  85644.0),
 ('Finance, Accounting, & Procurement Public Safety, Inspections, & Enforcement',
  55659.0,
  70390.0),
 ('Engineering, Architecture, & Planning Building Operations & Maintenance Public Safety, Inspections, & Enforcement',
  539.12,
  118610.0),
 ('Legal Affairs Policy, Research & Analysis Public Safety, Inspections, & Enforcement',
  54165.0,
  168433.0),
 ('Administration & Human Resources Finance, Accounting, & Procurement Building Operations & Maintenance Policy, Research & Analysis',

**Is there any correlation between the higher degree and the salary?**

In [71]:
from pyspark.sql.functions import when,instr
df_target = df.withColumn('Degree Level', when(instr(col('Minimum Qual Requirements'), 'master''s degree') > 0, 2)\
                                        .when(instr(col('Minimum Qual Requirements'), 'baccalaureate degree') > 0, 1)\
                                         .otherwise(0))

#df_target.select('Minimum Qual Requirements', 'Degree Level').show(truncate=False)

In [72]:
df_target.corr('Degree Level', 'Salary Range To')

0.16851524924303374

In [73]:
df_target.corr('Degree Level', 'Salary Range From')

0.21567669052524574

The above exercise shows positive correlation between higher degree and the salary

### Example of test function

In [24]:
mock_data = [('A', 'Annual'), ('B', 'Daily')]
expected_result = ['Annual', 'Daily']

In [25]:
def test_get_salary_frequency(mock_data: list, 
                              expected_result: list,
                              schema: list = ['id', 'Salary Frequency']):  
    mock_df = spark.createDataFrame(data = mock_data, schema = schema)
    assert get_salary_frequency(mock_df) == expected_result

**Test : get_top10_jobs_posting_per_category**

In [39]:
def test_get_top10_jobs_posting_per_category(mock_data: list, 
                              expected_result: list,
                              schema: list = ['Job Category', 'Job Id']):
    mock_df = spark.createDataFrame(data = mock_data, schema = schema)
    assert sorted(get_top10_jobs_posting_per_category(mock_df)) == sorted(expected_result)

In [40]:
mock_data = [('C1', 1), ('C1', 2), ('C1', 3),
            ('C2', 4), ('C2', 5),
            ('C3', 6), ('C3', 7), ('C3', 8),
            ('C4', 9), ('C4', 10), ('C4', 11),
            ('C5', 12), ('C5', 13), ('C5', 14),('C5', 15),
            ('C6', 16), ('C6', 17), ('C6', 18),
            ('C7', 19), ('C7', 20),
            ('C8', 21), ('C8', 22), ('C8', 23),
            ('C9', 24), ('C9', 25), ('C9', 26),('C9', 27), ('C9', 28),
            ('C10', 29), ('C10', 30), ('C10', 31),
            ('C11', 32)]
expected_result = [('C9', 5),
                  ('C5', 4),
                  ('C6', 3),
                  ('C4', 3),
                  ('C10', 3),
                  ('C3', 3),
                  ('C8', 3),
                  ('C1', 3),
                  ('C7', 2),
                  ('C2', 2)]

schema = list = ['Job Category', 'Job Id']
test_get_top10_jobs_posting_per_category(mock_data, expected_result, schema)

**Test: get_salary_distribution_per_job_category**

In [44]:
def test_get_salary_distribution_per_job_category(mock_data: list, 
                              expected_result: list,
                              schema: StructType([StructField('Job Category', StringType(), True),StructField('Salary Range From', DoubleType(), True),
    StructField('Salary Range To', DoubleType(), True),
])):
    mock_df = spark.createDataFrame(data = mock_data, schema = schema)
    assert sorted(get_salary_distribution_per_job_category(mock_df)) == sorted(expected_result)

In [45]:
mock_data = [('C1', 10.00, 20.00), ('C1', 15.00, 30.00), ('C1', 30.00, 40.00),
            ('C2', 52.50, 60.00), ('C2', 63.00, 70.00), ('C2', 22.14, 30.00), ('C2', 21.14, 75.50),
            ('C3', 900.00, 950.00), ('C3', 700.00, 760.00), ('C3', 810.00, 900.00)]
expected_result = [('C1', 10.00, 40.00),
                  ('C2', 21.14, 75.50),
                  ('C3', 700.00, 950.00)]

schema = StructType([
    StructField('Job Category', StringType(), True),
    StructField('Salary Range From', DoubleType(), True),
    StructField('Salary Range To', DoubleType(), True),
])
test_get_salary_distribution_per_job_category(mock_data, expected_result, schema)