# Regression, Classification, and Topic Insights

# Load Dataset

In [25]:

from pyspark.sql import SparkSession
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt



# Start a Spark session for this notebook (getOrCreate attaches to an
# existing session when one is already running).
spark = (
    SparkSession.builder
    .appName("JobPostingsAnalysis")
    .getOrCreate()
)

file_path = "lightcast_job_postings.csv"

# Configure the CSV reader: the first row is the header, column types are
# inferred from the data, multi-line records are enabled, and a double quote
# is used as the escape character.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
)

# Load the job postings into a Spark DataFrame.
df = csv_reader.csv(file_path)



                                                                                

# Multiple Regression

In [29]:
from pyspark.sql.functions import lower, col, when

# Regex for AI-related terms; it is applied to lower-cased text below.
ai_pattern = r'\b(ai|artificial intelligence|generative ai|conversational ai)\b'

# Maps each new indicator column to the text column it is derived from.
# Insertion order is the order in which the columns are appended to df.
ai_flag_sources = {
    "TITLE_AI": "TITLE_CLEAN",
    "SKILLS_AI": "SKILLS_NAME",
    "SPECIALIZED_AI": "SPECIALIZED_SKILLS_NAME",
    "COMMON_AI": "COMMON_SKILLS_NAME",
}

# Add one binary flag per source column: 1 when the lower-cased text matches
# ai_pattern, 0 otherwise.
for flag_name, text_column in ai_flag_sources.items():
    matches_ai = lower(col(text_column)).rlike(ai_pattern)
    df = df.withColumn(flag_name, when(matches_ai, 1).otherwise(0))

# Preview the cleaned title next to the four new flags.
preview_columns = ["TITLE_CLEAN", *ai_flag_sources]
df.select(*preview_columns).show(5, truncate=False)


+-------------------------------------------+--------+---------+--------------+---------+
|TITLE_CLEAN                                |TITLE_AI|SKILLS_AI|SPECIALIZED_AI|COMMON_AI|
+-------------------------------------------+--------+---------+--------------+---------+
|enterprise analyst ii iii                  |0       |0        |0             |0        |
|oracle consultant reports                  |0       |0        |0             |0        |
|data analyst                               |0       |0        |0             |0        |
|sr lead data mgmt analyst sas product owner|0       |0        |0             |0        |
|comisiones de por semana comiensa rapido   |0       |0        |0             |0        |
+-------------------------------------------+--------+---------+--------------+---------+
only showing top 5 rows

