# Regression, Classification, and Topic Insights

# Load Dataset

In [19]:

from pyspark.sql import SparkSession
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt



# Obtain the notebook's Spark session (getOrCreate reuses one if it exists).
spark = (
    SparkSession.builder
    .appName("JobPostingsAnalysis")
    .getOrCreate()
)

file_path = "lightcast_job_postings.csv"

# Read the job-postings CSV with a header row, inferred column types,
# multi-line record support, and a double quote as the escape character.
df = (
    spark.read
    .options(
        header="true",
        inferSchema="true",
        multiLine="true",
        escape="\"",
    )
    .csv(file_path)
)
# To preview the loaded rows, run: df.show(5)



                                                                                

# Multiple Regression

In [20]:
from pyspark.sql.functions import col, lower, when

# Regex alternation of AI-related keywords, anchored on word boundaries.
# It is applied to lowercased text below, so the keywords are lowercase.
ai_pattern = r'\b(ai|artificial intelligence|machine learning|deep learning|generative ai|neural network|nlp|computer vision)\b'

# Per-row match conditions: one for the cleaned title, one for the
# specialized occupation name.
title_mentions_ai = lower(col("TITLE_CLEAN")).rlike(ai_pattern)
occupation_mentions_ai = lower(col("LOT_V6_SPECIALIZED_OCCUPATION_NAME")).rlike(ai_pattern)

# IS_AI_ROLE = 1 when either field matches the pattern, otherwise 0.
df = df.withColumn(
    "IS_AI_ROLE",
    when(title_mentions_ai | occupation_mentions_ai, 1).otherwise(0),
)

# Show five rows, flagged rows first, without truncating long titles.
preview_columns = ["TITLE_CLEAN", "LOT_V6_SPECIALIZED_OCCUPATION_NAME", "IS_AI_ROLE"]
(
    df.select(*preview_columns)
    .orderBy(col("IS_AI_ROLE").desc())
    .show(5, truncate=False)
)


[Stage 30:>                                                         (0 + 1) / 1]

+-------------------------------------------------------+----------------------------------+----------+
|TITLE_CLEAN                                            |LOT_V6_SPECIALIZED_OCCUPATION_NAME|IS_AI_ROLE|
+-------------------------------------------------------+----------------------------------+----------+
|data engineering lead data technology data analytics ai|Data Analyst                      |1         |
|ai ml governance analyst                               |Data Analyst                      |1         |
|data engineering lead data technology data analytics ai|Data Analyst                      |1         |
|data engineering lead data technology data analytics ai|Data Analyst                      |1         |
|ai ml governance analyst                               |Data Analyst                      |1         |
+-------------------------------------------------------+----------------------------------+----------+
only showing top 5 rows



                                                                                