# Regression, Classification, and Topic Insights

# Load Dataset

In [58]:

from pyspark.sql import SparkSession
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt



spark = SparkSession.builder \
    .appName("JobPostingsAnalysis") \
    .getOrCreate()

file_path = "lightcast_job_postings.csv"

df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("multiLine", "true") \
    .option("escape", "\"") \
    .csv(file_path)
df.show(5)



                                                                                

+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+------

# Multiple Regression

In [59]:
from pyspark.sql.functions import col, lower, when

# Create AI Role flag with VERY specific patterns
df = df.withColumn(
    "IS_AI_ROLE",
    when(
        # Specific AI/ML terms (these are safe)
        lower(col("TITLE_CLEAN")).rlike(r'\b(machine learning|deep learning|artificial intelligence|generative ai|neural network|computer vision|data scientist)\b') |
        lower(col("TITLE_RAW")).rlike(r'\b(machine learning|deep learning|artificial intelligence|generative ai|neural network|computer vision|data scientist)\b') |
        lower(col("LOT_V6_SPECIALIZED_OCCUPATION_NAME")).rlike(r'\b(machine learning|deep learning|artificial intelligence|generative ai|neural network|computer vision|data scientist)\b') |
        
        # AI as a standalone word (with spaces or punctuation around it)
        lower(col("TITLE_CLEAN")).rlike(r'(\s|^)ai(\s|$|/|-)') |
        lower(col("TITLE_RAW")).rlike(r'(\s|^)ai(\s|$|/|-)') |
        lower(col("LOT_V6_SPECIALIZED_OCCUPATION_NAME")).rlike(r'(\s|^)ai(\s|$|/|-)') |
        
        # ML Engineer/Scientist variants
        lower(col("TITLE_CLEAN")).rlike(r'\bml\s+(engineer|scientist|developer|analyst)\b') |
        lower(col("TITLE_RAW")).rlike(r'\bml\s+(engineer|scientist|developer|analyst)\b') |
        
        # NLP specifically
        lower(col("TITLE_CLEAN")).rlike(r'\bnlp\b') |
        lower(col("TITLE_RAW")).rlike(r'\bnlp\b'),
        1
    ).otherwise(0)
)

# Verify
df.select(
    "TITLE_CLEAN",
    "TITLE_RAW",
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    "IS_AI_ROLE"
).orderBy(col("IS_AI_ROLE").desc()) \
 .show(10, truncate=False)


#df.groupBy("IS_AI_ROLE").count().orderBy("IS_AI_ROLE").show()



[Stage 1167:>                                                       (0 + 1) / 1]

+--------------------------------------------------------+--------------------------------------------------------------+----------------------------------+----------+
|TITLE_CLEAN                                             |TITLE_RAW                                                     |LOT_V6_SPECIALIZED_OCCUPATION_NAME|IS_AI_ROLE|
+--------------------------------------------------------+--------------------------------------------------------------+----------------------------------+----------+
|sr bi analyst data scientist                            |Sr BI Analyst/Data Scientist                                  |Data Analyst                      |1         |
|ai ml governance analyst                                |AI/ML Governance Analyst                                      |Data Analyst                      |1         |
|data engineering lead data technology data analytics ai |Data Engineering Lead, Data & Technology, Data Analytics & AI |Data Analyst                      |1   

                                                                                

In [60]:
# Missing Value Treatment
from pyspark.sql import Window
from pyspark.sql.functions import col, when, isnan, count, expr, median
from pyspark.sql import functions as F

# Calculate overall median salary
overall_median_salarly = df.approxQuantile("SALARY", [0.5], 0.01)[0]

median_by_employment_type = df.groupBy("EMPLOYMENT_TYPE").agg(expr("percentile_approx(SALARY, 0.5)").alias("median_salary_emp_type"))
median_by_employment_type_name = df.groupBy("EMPLOYMENT_TYPE_NAME").agg(expr("percentile_approx(SALARY, 0.5)").alias("median_salary_emp_type_name"))

# Join median values back to the original dataframe
df_salary_imputed = df.join(median_by_employment_type, on="EMPLOYMENT_TYPE", how = "left").join(median_by_employment_type_name, on="EMPLOYMENT_TYPE_NAME", how = "left")


# Replace missing SALARY values
df_salary_imputed=df_salary_imputed.withColumn("SALARY", when(col("SALARY").isNull(), 
                                when (col("median_salary_emp_type").isNotNull(), col("median_salary_emp_type"))
                                .when(col("median_salary_emp_type_name").isNotNull(), col("median_salary_emp_type_name"))
                                .otherwise(overall_median_salarly)
).otherwise(col("SALARY"))) 

                                                                                

In [61]:
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import BooleanType, StringType, IntegerType
from pyspark.sql.functions import regexp_replace, trim

# Drop rows with NA values 
regression_df = df_salary_imputed.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING", "IS_AI_ROLE", 
]).select(
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING", "IS_AI_ROLE", 
   
)

# Cast Duration to integer
regression_df = regression_df.withColumn("DURATION", col("DURATION").cast(IntegerType()))


In [62]:
# Categorical columns
categorical_cols = [ "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"]

# Cast boolean columns to integer
regression_df = regression_df.withColumn("IS_INTERNSHIP", col("IS_INTERNSHIP").cast(IntegerType()))
regression_df = regression_df.withColumn("COMPANY_IS_STAFFING", col("COMPANY_IS_STAFFING").cast(IntegerType()))


# Clean Remote Type Name
regression_df = regression_df.withColumn(
    "REMOTE_TYPE_NAME",
    when(col("REMOTE_TYPE_NAME") == "Remote", "Remote")
    .when(col("REMOTE_TYPE_NAME") == "[None]", "Undefined")
    .when(col("REMOTE_TYPE_NAME") == "Not Remote", "On Premise")
    .when(col("REMOTE_TYPE_NAME") == "Hybrid Remote", "Hybrid")
    .when(col("REMOTE_TYPE_NAME").isNull(), "On Premise")
    .otherwise(col("REMOTE_TYPE_NAME"))
)

# Clean Employment Type Name
regression_df = regression_df.withColumn(
    "EMPLOYMENT_TYPE_NAME",
    when(col("EMPLOYMENT_TYPE_NAME") == "Part-time / full-time", "Flexible")
    .when(col("EMPLOYMENT_TYPE_NAME") == "Part-time (â‰¤ 32 hours)", "Parttime")
    .when(col("EMPLOYMENT_TYPE_NAME") == "Full-time (> 32 hours)", "Fulltime")
    .when(col("EMPLOYMENT_TYPE_NAME").isNull(), "Fulltime")
    .otherwise(col("EMPLOYMENT_TYPE_NAME"))
)

# Clean Education Levels
regression_df = regression_df.withColumn(
    "EDUCATION_LEVELS_NAME",
    trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\[\]\n]", ""))
)


regression_df.show(5, truncate=False)

                                                                                

+--------+--------------------+--------------------+---------------------+--------------------+----------------+--------+-------------+-------------------+----------+
|SALARY  |MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|EDUCATION_LEVELS_NAME|EMPLOYMENT_TYPE_NAME|REMOTE_TYPE_NAME|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|IS_AI_ROLE|
+--------+--------------------+--------------------+---------------------+--------------------+----------------+--------+-------------+-------------------+----------+
|116500.0|2                   |2                   |"Bachelor's degree"  |Fulltime            |Undefined       |6       |0            |0                  |0         |
|116500.0|7                   |7                   |"No Education Listed"|Fulltime            |Undefined       |18      |0            |1                  |0         |
|116500.0|1                   |1                   |"No Education Listed"|Fulltime            |Undefined       |8       |0            |1                  |0         

In [63]:
# Index and One-Hot Encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid="skip") for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

#Assemble base features
assembler = VectorAssembler(
       inputCols=["MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", 
                  "DURATION", "IS_AI_ROLE"] + [f"{col}_vec" for col in categorical_cols],
       outputCol="features"
   )

pipeline = Pipeline(stages=indexers + encoders + [assembler])
regression_data =  pipeline.fit(regression_df).transform(regression_df)
regression_data.select("SALARY","features").show(5, truncate=False)

                                                                                

+--------+-------------------------------------------------------------+
|SALARY  |features                                                     |
+--------+-------------------------------------------------------------+
|116500.0|(29,[0,1,2,4,22,24,27,28],[2.0,2.0,6.0,1.0,1.0,1.0,1.0,1.0]) |
|116500.0|(29,[0,1,2,5,22,24,27],[7.0,7.0,18.0,1.0,1.0,1.0,1.0])       |
|116500.0|(29,[0,1,2,5,22,24,27],[1.0,1.0,8.0,1.0,1.0,1.0,1.0])        |
|116500.0|(29,[0,1,2,4,22,24,27,28],[1.0,1.0,32.0,1.0,1.0,1.0,1.0,1.0])|
|131100.0|(29,[0,1,2,4,22,24,27,28],[2.0,2.0,11.0,1.0,1.0,1.0,1.0,1.0])|
+--------+-------------------------------------------------------------+
only showing top 5 rows



In [64]:
# Split Data
regression_train, regression_test = regression_data.randomSplit([0.8, 0.2], seed=42)

print((regression_data.count(), len(regression_data.columns)))
print((regression_train.count(), len(regression_train.columns)))
print((regression_test.count(), len(regression_test.columns)))

                                                                                

(5039, 21)


                                                                                

(4070, 21)


[Stage 1264:>                                                       (0 + 1) / 1]

(969, 21)


                                                                                

In [65]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Train Multiple Linear Regression Model
lr = LinearRegression(featuresCol="features", labelCol="SALARY")
lr_model = lr.fit(regression_train)

# Make predictions
predictions = lr_model.transform(regression_test)

# Evaluate model
evaluator = RegressionEvaluator(labelCol="SALARY", predictionCol="prediction")
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})

print(f"RMSE: {rmse}")
print(f"R²: {r2}")
print(f"MAE: {mae}")



25/10/09 21:29:25 WARN Instrumentation: [64cbbb93] regParam is zero, which might cause numerical instability and overfitting.
25/10/09 21:29:34 WARN Instrumentation: [64cbbb93] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
[Stage 1311:>                                                       (0 + 1) / 1]

RMSE: 22058.821158120867
R²: 0.1915025455889091
MAE: 15535.593012196921


                                                                                

In [66]:
predictions.select("SALARY", "prediction", "IS_AI_ROLE") \
    .groupBy("IS_AI_ROLE") \
    .agg(
        F.count("*").alias("job_count"),
        F.avg("SALARY").alias("avg_actual_salary"),
        F.avg("prediction").alias("avg_predicted_salary")
    ).orderBy("IS_AI_ROLE") \
    .show()

[Stage 1318:>                                                       (0 + 1) / 1]

+----------+---------+------------------+--------------------+
|IS_AI_ROLE|job_count| avg_actual_salary|avg_predicted_salary|
+----------+---------+------------------+--------------------+
|         0|      955|113290.49214659687|  111576.33820922484|
|         1|       14|111894.64285714286|  117446.94495596984|
+----------+---------+------------------+--------------------+



                                                                                