In [1]:
import findspark
# Initialise findspark before the pyspark import in the next cell.
# NOTE(review): presumably this makes the local Spark installation importable — confirm against the findspark docs.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build the Spark session (application name "pyspark-1") used by the cells below.
spark = (
    SparkSession.builder
    .appName("pyspark-1")
    .getOrCreate()
)

### Read data

In [3]:
# Load the NYC jobs CSV, taking the column names from the header line.
# No schema is supplied, and the schema printed below shows every column as a string.
df = spark.read.option("header", True).csv("/dataset/nyc-jobs.csv")
df.printSchema()

root
 |-- Job ID: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: string (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: string (nullable = true)
 |-- Salary Range To: string (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locatio

### Sample function

In [4]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

In [23]:
def get_salary_frequency(df: DataFrame) -> list:
    """Return the distinct values found in the 'Salary Frequency' column.

    Args:
        df: DataFrame that contains a 'Salary Frequency' column.

    Returns:
        A list with one entry per distinct value, in the order the rows
        are collected.
    """
    frequency_column = 'Salary Frequency'
    distinct_rows = df.select(frequency_column).distinct().collect()

    frequencies = []
    for record in distinct_rows:
        frequencies.append(record[frequency_column])
    return frequencies

**What is the number of job postings per category (Top 10)?**

In [35]:
def get_top10_jobs_posting_per_category(df: DataFrame) -> list:
    """Return the ten job categories with the most postings.

    Args:
        df: DataFrame with a 'Job Category' column; every row is counted
            as one posting.

    Returns:
        Up to ten ``(category, count)`` tuples, highest count first.
        Categories with equal counts are ordered by category name so that
        repeated runs return the same rows in the same order.
    """
    category_counts = df.groupBy('Job Category').count()
    # Sorting on the count alone leaves ties in arbitrary order, which can
    # change both the order and the membership of the top ten between runs.
    ranked = category_counts.orderBy(col('count').desc(),
                                     col('Job Category').asc())
    return [(row['Job Category'], row['count']) for row in ranked.take(10)]

In [36]:
# Run the aggregation on the DataFrame loaded from nyc-jobs.csv; the returned list is the cell output.
get_top10_jobs_posting_per_category(df)

[('Engineering, Architecture, & Planning', 504),
 ('Technology, Data & Innovation', 313),
 ('Legal Affairs', 226),
 ('Public Safety, Inspections, & Enforcement', 182),
 ('Building Operations & Maintenance', 181),
 ('Finance, Accounting, & Procurement', 169),
 ('Administration & Human Resources', 134),
 ('Constituent Services & Community Programs', 129),
 ('Health', 125),
 ('Policy, Research & Analysis', 124)]

### Example of test function

In [24]:
# Two mock rows of (id, salary frequency) and the distinct frequencies expected from them.
# Note: both names are reassigned further down for the top-10 category test.
mock_data = [('A', 'Annual'), ('B', 'Daily')]
expected_result = ['Annual', 'Daily']

In [25]:
def test_get_salary_frequency(mock_data: list,
                              expected_result: list,
                              schema: list = ['id', 'Salary Frequency']):
    """Check get_salary_frequency against a small in-memory DataFrame.

    Args:
        mock_data: Rows used to build the mock DataFrame.
        expected_result: Distinct 'Salary Frequency' values expected back,
            in any order.
        schema: Column names for the mock DataFrame (not mutated here).

    Raises:
        AssertionError: If the returned values differ from ``expected_result``.
    """
    mock_df = spark.createDataFrame(data=mock_data, schema=schema)
    actual_result = get_salary_frequency(mock_df)
    # distinct() does not guarantee row order, so compare order-insensitively
    # instead of relying on the order in which rows happen to be collected.
    assert sorted(actual_result) == sorted(expected_result), (
        "get_salary_frequency returned %r, expected %r"
        % (actual_result, expected_result)
    )

Test: `get_top10_jobs_posting_per_category`

In [45]:
# Mock rows of (job category, job id) covering eleven categories, C1..C11.
# C11 has a single posting, the fewest of all, so it is absent from the
# expected top ten below.
mock_data = [('C1', 1), ('C1', 2), ('C1', 3),
            ('C2', 4), ('C2', 5),
            ('C3', 6), ('C3', 7), ('C3', 8),
            ('C4', 9), ('C4', 10), ('C4', 11),
            ('C5', 12), ('C5', 13), ('C5', 14),('C5', 15),
            ('C6', 16), ('C6', 17), ('C6', 18),
            ('C7', 19), ('C7', 20),
            ('C8', 21), ('C8', 22), ('C8', 23),
            ('C9', 24), ('C9', 25), ('C9', 26),('C9', 27), ('C9', 28),
            ('C10', 29), ('C10', 30), ('C10', 31),
            ('C11', 32)]
# Expected (category, posting count) pairs; the test sorts both sides before
# comparing, so the order of entries with equal counts does not matter.
expected_result = [('C9', 5),
                  ('C5', 4),
                  ('C6', 3),
                  ('C4', 3),
                  ('C10', 3),
                  ('C3', 3),
                  ('C8', 3),
                  ('C1', 3),
                  ('C7', 2),
                  ('C2', 2)]

def test_get_top10_jobs_posting_per_category(
        mock_data: list,
        expected_result: list,
        schema: list = ['Job Category', 'Job Id']):
    """Check get_top10_jobs_posting_per_category on an in-memory DataFrame.

    Args:
        mock_data: Rows used to build the mock DataFrame.
        expected_result: Expected ``(category, count)`` pairs, in any order.
        schema: Column names for the mock DataFrame.

    Raises:
        AssertionError: If the sorted pairs differ from the sorted expectation.
    """
    mock_df = spark.createDataFrame(schema=schema, data=mock_data)
    # Both sides are sorted so the comparison ignores the returned order.
    actual_pairs = sorted(get_top10_jobs_posting_per_category(mock_df))
    expected_pairs = sorted(expected_result)
    assert actual_pairs == expected_pairs

In [46]:
# Column names for the mock DataFrame. This is a plain assignment: a chained
# `schema = list = [...]` would rebind the built-in `list` for the rest of
# the kernel session.
schema = ['Job Category', 'Job Id']
test_get_top10_jobs_posting_per_category(mock_data, expected_result, schema)