# Regression, Classification, and Topic Insights

# Load Dataset

In [19]:

from pyspark.sql import SparkSession
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt



# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("JobPostingsAnalysis")
    .getOrCreate()
)

# Job-postings CSV, resolved relative to the working directory.
file_path = "lightcast_job_postings.csv"

# Reader settings: the first row holds column names, column types are inferred,
# a record may span several lines, and a double quote is the escape character
# inside quoted fields.
csv_reader = spark.read.options(
    header="true",
    inferSchema="true",
    multiLine="true",
    escape="\"",
)
df = csv_reader.csv(file_path)



                                                                                

# Multiple Regression

In [20]:
from pyspark.sql.functions import col, lower, when

# Regex of AI-related terms; it is matched against lower-cased text below.
ai_pattern = r'\b(ai|artificial intelligence|machine learning|deep learning|generative ai|neural network|nlp|computer vision)\b'

# A posting counts as an AI role when either the cleaned title or the
# specialized occupation name contains one of the terms.
title_is_ai = lower(col("TITLE_CLEAN")).rlike(ai_pattern)
occupation_is_ai = lower(col("LOT_V6_SPECIALIZED_OCCUPATION_NAME")).rlike(ai_pattern)

# IS_AI_ROLE: 1 = AI job, 0 = non-AI job (also 0 when neither match is true).
df = df.withColumn(
    "IS_AI_ROLE",
    when(title_is_ai | occupation_is_ai, 1).otherwise(0),
)

# Preview five rows, flagged postings first.
preview_cols = ["TITLE_CLEAN", "LOT_V6_SPECIALIZED_OCCUPATION_NAME", "IS_AI_ROLE"]
df.select(*preview_cols).orderBy(col("IS_AI_ROLE").desc()).show(5, truncate=False)


[Stage 87:>                                                         (0 + 1) / 1]

+-------------------------------------------------------+----------------------------------+----------+
|TITLE_CLEAN                                            |LOT_V6_SPECIALIZED_OCCUPATION_NAME|IS_AI_ROLE|
+-------------------------------------------------------+----------------------------------+----------+
|data engineering lead data technology data analytics ai|Data Analyst                      |1         |
|ai ml governance analyst                               |Data Analyst                      |1         |
|data engineering lead data technology data analytics ai|Data Analyst                      |1         |
|data engineering lead data technology data analytics ai|Data Analyst                      |1         |
|ai ml governance analyst                               |Data Analyst                      |1         |
+-------------------------------------------------------+----------------------------------+----------+
only showing top 5 rows



                                                                                

In [21]:
# Missing Value Treatment
from pyspark.sql import Window
from pyspark.sql.functions import col, when, isnan, count, expr, median
from pyspark.sql import functions as F

# Approximate overall median of SALARY (relative error 0.01); used as the
# last-resort fill value below.
overall_median_salarly = df.approxQuantile("SALARY", [0.5], 0.01)[0]

# Approximate median SALARY per EMPLOYMENT_TYPE and per EMPLOYMENT_TYPE_NAME.
salary_median_sql = "percentile_approx(SALARY, 0.5)"
median_by_employment_type = (
    df.groupBy("EMPLOYMENT_TYPE")
    .agg(expr(salary_median_sql).alias("median_salary_emp_type"))
)
median_by_employment_type_name = (
    df.groupBy("EMPLOYMENT_TYPE_NAME")
    .agg(expr(salary_median_sql).alias("median_salary_emp_type_name"))
)

# Attach both group medians to every posting (left joins keep all rows of df).
df_salary_imputed = (
    df
    .join(median_by_employment_type, on="EMPLOYMENT_TYPE", how="left")
    .join(median_by_employment_type_name, on="EMPLOYMENT_TYPE_NAME", how="left")
)

# Fill missing SALARY with the first non-null of: the EMPLOYMENT_TYPE median,
# the EMPLOYMENT_TYPE_NAME median, then the overall median. Rows that already
# have a SALARY keep it.
df_salary_imputed = df_salary_imputed.withColumn(
    "SALARY",
    F.coalesce(
        col("SALARY"),
        col("median_salary_emp_type"),
        col("median_salary_emp_type_name"),
        F.lit(overall_median_salarly),
    ),
)

                                                                                

In [22]:
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import BooleanType, StringType, IntegerType
from pyspark.sql.functions import regexp_replace, trim

# Columns carried into the regression dataset. A row is dropped when any of
# these columns is null.
regression_columns = [
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING",
    "median_salary_emp_type_name", "IS_AI_ROLE",
]

# Filter incomplete rows, keep only the modelling columns, then cast DURATION
# to an integer.
regression_df = (
    df_salary_imputed
    .na.drop(subset=regression_columns)
    .select(*regression_columns)
    .withColumn("DURATION", col("DURATION").cast(IntegerType()))
)


In [23]:
# Columns that are string-indexed and one-hot encoded in the feature pipeline.
categorical_cols = [ "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"]

# Cast boolean columns to integer
regression_df = regression_df.withColumn("IS_INTERNSHIP", col("IS_INTERNSHIP").cast(IntegerType()))
regression_df = regression_df.withColumn("COMPANY_IS_STAFFING", col("COMPANY_IS_STAFFING").cast(IntegerType()))


# Clean Remote Type Name: map the raw labels to short category names; any
# label not listed is kept unchanged.
regression_df = regression_df.withColumn(
    "REMOTE_TYPE_NAME",
    when(col("REMOTE_TYPE_NAME") == "Remote", "Remote")
    .when(col("REMOTE_TYPE_NAME") == "[None]", "Undefined")
    .when(col("REMOTE_TYPE_NAME") == "Not Remote", "On Premise")
    .when(col("REMOTE_TYPE_NAME") == "Hybrid Remote", "Hybrid")
    # Defensive default: null REMOTE_TYPE_NAME rows are already removed by the
    # earlier dropna on regression_df, so this only applies if that filter changes.
    .when(col("REMOTE_TYPE_NAME").isNull(), "On Premise")
    .otherwise(col("REMOTE_TYPE_NAME"))
)

# Clean Employment Type Name: map the raw labels to short category names; any
# label not listed is kept unchanged.
regression_df = regression_df.withColumn(
    "EMPLOYMENT_TYPE_NAME",
    when(col("EMPLOYMENT_TYPE_NAME") == "Part-time / full-time", "Flexible")
    # The part-time label contains a "≤" sign. Depending on how the file was
    # decoded it can appear either correctly or as the mojibake "â‰¤"
    # (UTF-8 bytes read as cp1252), so both spellings are accepted; otherwise
    # part-time rows would silently keep their raw label.
    .when(
        col("EMPLOYMENT_TYPE_NAME").isin(
            "Part-time (â‰¤ 32 hours)",
            "Part-time (≤ 32 hours)",
        ),
        "Parttime",
    )
    .when(col("EMPLOYMENT_TYPE_NAME") == "Full-time (> 32 hours)", "Fulltime")
    # Defensive default: null EMPLOYMENT_TYPE_NAME rows are already removed by
    # the earlier dropna on regression_df.
    .when(col("EMPLOYMENT_TYPE_NAME").isNull(), "Fulltime")
    .otherwise(col("EMPLOYMENT_TYPE_NAME"))
)

# Clean Education Levels: strip square brackets and newlines, then trim
# surrounding whitespace. Quotes and commas inside the value are left as-is.
regression_df = regression_df.withColumn(
    "EDUCATION_LEVELS_NAME",
    trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\[\]\n]", ""))
)


regression_df.show(5, truncate=False)

[Stage 93:>                                                         (0 + 1) / 1]

+--------+--------------------+--------------------+---------------------------------------+--------------------+----------------+--------+-------------+-------------------+---------------------------+----------+
|SALARY  |MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|EDUCATION_LEVELS_NAME                  |EMPLOYMENT_TYPE_NAME|REMOTE_TYPE_NAME|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|median_salary_emp_type_name|IS_AI_ROLE|
+--------+--------------------+--------------------+---------------------------------------+--------------------+----------------+--------+-------------+-------------------+---------------------------+----------+
|117500.0|3                   |3                   |"Bachelor's degree",  "Master's degree"|Flexible            |Undefined       |14      |0            |0                  |100000                     |0         |
|100000.0|3                   |3                   |"Bachelor's degree"                    |Flexible            |Undefined       |42      |0        

                                                                                

In [26]:
# One StringIndexer and one OneHotEncoder per categorical column. Rows whose
# category is invalid for the fitted indexer are skipped (handleInvalid="skip").
# The loop variable is deliberately not called `col`, so it cannot be confused
# with pyspark.sql.functions.col.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_idx", handleInvalid="skip")
    for name in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol=f"{name}_idx", outputCol=f"{name}_vec")
    for name in categorical_cols
]

# Feature vector = numeric columns followed by the one-hot encoded categoricals.
numeric_features = ["MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "DURATION", "IS_AI_ROLE"]
encoded_features = [f"{name}_vec" for name in categorical_cols]
assembler = VectorAssembler(
    inputCols=numeric_features + encoded_features,
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
fitted_pipeline = pipeline.fit(regression_df)
regression_data = fitted_pipeline.transform(regression_df)

# Preview the target next to the assembled feature vector.
regression_data.select("SALARY", "features").show(5, truncate=False)

[Stage 244:>                                                        (0 + 1) / 1]

+--------+------------------------------------------------------+
|SALARY  |features                                              |
+--------+------------------------------------------------------+
|117500.0|(29,[0,1,2,6,24,27,28],[3.0,3.0,14.0,1.0,1.0,1.0,1.0])|
|100000.0|(29,[0,1,2,4,24,27,28],[3.0,3.0,42.0,1.0,1.0,1.0,1.0])|
|100000.0|(29,[0,1,2,4,24,27,28],[3.0,3.0,20.0,1.0,1.0,1.0,1.0])|
|100000.0|(29,[0,1,2,9,24,27,28],[1.0,1.0,42.0,1.0,1.0,1.0,1.0])|
|162050.0|(29,[0,1,2,5,27,28],[5.0,5.0,33.0,1.0,1.0,1.0])       |
+--------+------------------------------------------------------+
only showing top 5 rows



                                                                                

In [27]:
# 80/20 train/test split with a fixed seed so the split is repeatable.
regression_train, regression_test = regression_data.randomSplit([0.8, 0.2], seed=42)

# Report (row count, column count) for the full, training and test sets, in that order.
for frame in (regression_data, regression_train, regression_test):
    print((frame.count(), len(frame.columns)))

                                                                                

(5039, 22)


                                                                                

(4070, 22)


[Stage 276:>                                                        (0 + 1) / 1]

(969, 22)


                                                                                