# AI Job Market Analysis

## 1. Imports & Configuration

In [1]:
# --- 1. Core Libraries ---
import pandas as pd
import numpy as np
import os, json
import warnings

# --- 2. Visualization Libraries ---
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# --- 3. Scikit-Learn ---
from sklearn.preprocessing import MultiLabelBinarizer

# --- 4. Configuration ---
# The Plotly renderer is already selected right after the plotly.io import
# above, so it is not assigned a second time here.
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output
sns.set_style('whitegrid')

# --- 5. Global Aesthetics ---
# Define a consistent color palette (shared by seaborn, matplotlib and plotly)
palette = ['#023047', '#219ebc', '#ffb703', '#8ecae6', '#fb8500']

# Apply to libraries
sns.set_palette(palette)
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=palette)
px.defaults.template = "plotly_white"
px.defaults.color_discrete_sequence = palette

print("Setup Complete. Libraries loaded and palette applied.")

Setup Complete. Libraries loaded and palette applied.


## 2. Helper Functions

In [2]:
# --- A. Data Helpers ---

def get_counts(df, col, top_n=None):
    """Return value frequencies of ``df[col]`` with each value's share of the total.

    Args:
        df: DataFrame holding the column to summarise.
        col: Name of the column whose values are counted.
        top_n: If truthy, keep only the ``top_n`` most frequent values.

    Returns:
        DataFrame with columns ``[col, 'count', 'percent']`` in descending
        count order. ``percent`` is the share of all counted values (rounded
        to one decimal), not the share within the truncated top-N subset.
    """
    counts = df[col].value_counts().reset_index()
    counts.columns = [col, 'count']
    # Shares are computed against the full total *before* truncating;
    # otherwise the top-N rows would always add up to 100%.
    counts['percent'] = (counts['count'] / counts['count'].sum() * 100).round(1)
    if top_n:
        counts = counts.head(top_n)
    return counts

def get_time_trend(df, date_col, freq='M'):
    """Count rows per time bucket.

    Args:
        df: Source DataFrame; it is copied, so the caller's frame is untouched.
        date_col: Column parsed with ``pd.to_datetime`` and used as the time axis.
        freq: Offset alias handed to ``resample`` (e.g. M=Month, Q=Quarter, Y=Year).

    Returns:
        DataFrame with ``date_col`` and a ``'count'`` column holding the number
        of rows in each bucket.
    """
    working = df.copy()
    working[date_col] = pd.to_datetime(working[date_col])
    by_date = working.set_index(date_col)
    bucket_sizes = by_date.resample(freq).size()
    return bucket_sizes.reset_index(name='count')

# --- B. Plotting Functions ---

def plot_bar_chart(df, col, title, top_n=15, orientation='v', color=None):
    """Build a bar chart of value counts for ``col``, labelled with percentages.

    Args:
        df: Source DataFrame.
        col: Column whose values are counted via ``get_counts``.
        title: Chart title.
        top_n: Number of most frequent values to keep.
        orientation: ``'h'`` for horizontal bars (rows re-sorted ascending by
            count); any other value keeps the vertical layout.
        color: Optional column name forwarded to ``px.bar`` as ``color``.

    Returns:
        The Plotly figure (legend hidden, height 600).
    """
    data = get_counts(df, col, top_n)

    horizontal = orientation == 'h'
    if horizontal:
        data = data.sort_values('count', ascending=True)
    x_val, y_val = ('count', col) if horizontal else (col, 'count')

    fig = px.bar(
        data,
        x=x_val,
        y=y_val,
        title=title,
        text='percent',
        color=color,
        orientation=orientation,
    )
    fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
    fig.update_layout(showlegend=False, height=600)
    return fig

def plot_pie_chart(df, col, title):
    """Build a donut chart of the value counts of ``col``.

    Args:
        df: Source DataFrame.
        col: Categorical column to summarise via ``get_counts``.
        title: Chart title.

    Returns:
        The Plotly figure, with percent and label text drawn inside each slice.
    """
    freq_table = get_counts(df, col)
    donut_options = dict(
        names=col,
        values='count',
        title=title,
        hole=0.4,  # a non-zero hole turns the pie into a donut
        color_discrete_sequence=palette,
    )
    fig = px.pie(freq_table, **donut_options)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig

def plot_distribution(df, col, title, nbins=40):
    """Build a histogram of ``col`` with a marginal box plot.

    Args:
        df: Source DataFrame.
        col: Column to plot on the x-axis.
        title: Chart title.
        nbins: Bin count forwarded to ``px.histogram``.

    Returns:
        The Plotly figure, drawn in the first palette color with a 0.1 bar gap.
    """
    primary_color = palette[0]
    fig = px.histogram(
        df,
        x=col,
        nbins=nbins,
        title=title,
        marginal="box",
        color_discrete_sequence=[primary_color],
    )
    fig.update_layout(bargap=0.1)
    return fig

def plot_box_comparison(df, cat_col, num_col, title, color=None):
    """Build box plots of ``num_col`` for each category of ``cat_col``.

    Categories are laid out in ascending order of their median ``num_col``.

    Args:
        df: Source DataFrame.
        cat_col: Categorical column placed on the x-axis.
        num_col: Numerical column placed on the y-axis.
        title: Chart title.
        color: Optional column name forwarded to ``px.box`` as ``color``.

    Returns:
        The Plotly figure.
    """
    medians = df.groupby(cat_col)[num_col].median().sort_values()
    # category_orders is documented as a dict of lists, so hand it a plain
    # list instead of a pandas Index.
    order = medians.index.tolist()
    fig = px.box(df, x=cat_col, y=num_col, title=title, color=color,
                 category_orders={cat_col: order})
    return fig

def plot_heatmap(df, x_col, y_col, title):
    """Build a heatmap of co-occurrence counts between two columns.

    Args:
        df: Source DataFrame.
        x_col: Column whose values become the heatmap columns.
        y_col: Column whose values become the heatmap rows.
        title: Chart title.

    Returns:
        The Plotly figure; the color scale is labelled "Count".
    """
    contingency = pd.crosstab(df[y_col], df[x_col])
    axis_labels = dict(x=x_col, y=y_col, color="Count")
    fig = px.imshow(contingency, title=title, aspect='auto', labels=axis_labels)
    return fig

def plot_outlier_analysis(df, col, title="Outlier Analysis (Box + Hist)"):
    """Stack a box plot over a histogram of ``col`` and mark its mean and median.

    Args:
        df: Source DataFrame.
        col: Numerical column to analyse; missing values are dropped first.
        title: Figure title.

    Returns:
        The Plotly figure (two rows sharing the x-axis, height 700, no legend).
    """
    values = df[col].dropna()

    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        row_heights=[0.3, 0.7],
        vertical_spacing=0.05,
        specs=[[{"type": "box"}], [{"type": "xy"}]],
    )

    # Top row: box plot with outlier points; bottom row: 50-bin histogram.
    box_trace = go.Box(x=values, name='Box Plot', marker_color=palette[0], boxpoints='outliers')
    hist_trace = go.Histogram(x=values, nbinsx=50, name='Distribution', marker_color=palette[1])
    fig.add_trace(box_trace, row=1, col=1)
    fig.add_trace(hist_trace, row=2, col=1)

    mean_val, median_val = values.mean(), values.median()
    fig.add_vline(
        x=mean_val,
        line_dash='dash',
        line_color=palette[4],
        annotation_text=f'Mean: {mean_val:,.0f}',
    )
    fig.add_vline(
        x=median_val,
        line_dash='dot',
        line_color=palette[2],
        annotation_text=f'Median: {median_val:,.0f}',
        annotation_position='top left',
    )
    fig.update_layout(title=title, height=700, showlegend=False)
    return fig

def plot_grouped_histogram(df, x_col, group_col, title, top_n=10):
    """Build a grouped count histogram for the most frequent values of ``x_col``.

    Args:
        df: Source DataFrame.
        x_col: Categorical column on the x-axis; only its ``top_n`` most
            frequent values are kept.
        group_col: Column that splits each x value into side-by-side bars.
        title: Chart title.
        top_n: Number of most frequent ``x_col`` values to include.

    Returns:
        The Plotly figure (height 600, x tick labels rotated -45 degrees).
    """
    most_frequent = df[x_col].value_counts().index[:top_n]
    keep_rows = df[x_col].isin(most_frequent)
    fig = px.histogram(
        df[keep_rows],
        x=x_col,
        color=group_col,
        barmode='group',
        title=title,
        color_discrete_sequence=palette,
    )
    fig.update_layout(xaxis_tickangle=-45, yaxis_title="Count", height=600)
    return fig

def plot_grouped_box(df, x_col, y_col, group_col, title, top_n=10):
    """Build grouped box plots of ``y_col`` for the most frequent ``x_col`` values.

    Args:
        df: Source DataFrame.
        x_col: Categorical column on the x-axis; only its ``top_n`` most
            frequent values are kept.
        y_col: Numerical column whose distribution is drawn.
        group_col: Column that splits each x value into colored sub-boxes.
        title: Chart title.
        top_n: Number of most frequent ``x_col`` values to include.

    Returns:
        The Plotly figure (height 600, x tick labels rotated -45 degrees).
    """
    most_frequent = df[x_col].value_counts().index[:top_n]
    keep_rows = df[x_col].isin(most_frequent)
    fig = px.box(
        df[keep_rows],
        x=x_col,
        y=y_col,
        color=group_col,
        title=title,
        color_discrete_sequence=palette,
    )
    fig.update_layout(xaxis_tickangle=-45, height=600)
    return fig

def plot_choropleth_map(df, loc_col, title):
    """Build a world choropleth colored by the number of rows per country.

    Args:
        df: Source DataFrame.
        loc_col: Column holding country names (matched with
            ``locationmode="country names"``).
        title: Chart title.

    Returns:
        The Plotly figure (natural-earth projection, height 600).
    """
    tally = df[loc_col].value_counts().reset_index()
    tally.columns = ['country', 'count']

    fig = px.choropleth(
        tally,
        locations="country",
        locationmode="country names",
        color="count",
        hover_name="country",
        title=title,
    )
    geo_settings = dict(showframe=False, showcoastlines=True, projection_type='natural earth')
    fig.update_layout(geo=geo_settings, height=600)
    return fig

def plot_trend(df, x_col, y_col, title, agg_func='count', category_orders=None):
    """Build a bar chart of job counts or mean ``y_col`` per ``x_col`` value.

    Args:
        df: Source DataFrame.
        x_col: Column whose values become the bars (also used for bar color).
        y_col: Column averaged per ``x_col`` value; ignored when
            ``agg_func == 'count'``.
        title: Chart title.
        agg_func: ``'count'`` counts rows per ``x_col`` value; any other value
            computes the mean of ``y_col``.
        category_orders: Optional dict mapping column names to ordered lists of
            values, forwarded to ``px.bar`` to fix the category order.

    Returns:
        The Plotly figure (legend hidden).
    """
    if agg_func == 'count':
        data = df[x_col].value_counts().reset_index()
        data.columns = [x_col, 'value']
        y_label = 'Number of Jobs'
    else:
        data = df.groupby(x_col)[y_col].mean().reset_index()
        data.columns = [x_col, 'value']
        y_label = 'Avg Salary'

    # The ordering used to come from an undefined global (`orders_dict`), which
    # made every call fail with NameError; it is now an explicit, optional
    # argument.
    fig = px.bar(data, x=x_col, y='value',
                 title=title, text_auto='.2s',
                 color=x_col,
                 category_orders=category_orders)

    fig.update_layout(xaxis_title=x_col, yaxis_title=y_label, showlegend=False)
    return fig

def plot_time_trend(df, x_col, y_col, title, agg_func='count'):
    """Build a bar chart of a time-based column in chronological order.

    Args:
        df: Source DataFrame.
        x_col: Time-unit column (e.g. quarter, month name, day name, year).
        y_col: Column averaged per ``x_col`` value; ignored when
            ``agg_func == 'count'``.
        title: Chart title.
        agg_func: ``'count'`` counts rows per ``x_col`` value; any other value
            computes the mean of ``y_col``.

    Returns:
        The Plotly figure. Quarter labels (Q1-Q4), English month names and
        English day names follow calendar order (weeks start on Monday); any
        other column is sorted by its own values.
    """
    if agg_func == 'count':
        data = df[x_col].value_counts().reset_index()
        data.columns = [x_col, 'value']
        y_label = 'Number of Jobs'
    else:
        data = df.groupby(x_col)[y_col].mean().reset_index()
        data.columns = [x_col, 'value']
        y_label = 'Avg Salary'

    # Known calendar labels cannot be sorted alphabetically, so rank them
    # against an explicit chronological sequence.
    calendar_orders = (
        ['Q1', 'Q2', 'Q3', 'Q4'],
        ['January', 'February', 'March', 'April', 'May', 'June', 'July',
         'August', 'September', 'October', 'November', 'December'],
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
         'Sunday'],
    )
    labels = data[x_col]
    for known in calendar_orders:
        if labels.isin(known).all():
            position = {name: rank for rank, name in enumerate(known)}
            data = data.loc[labels.map(position).sort_values().index]
            break
    else:
        # Years, dates and other naturally ordered values sort by themselves.
        data = data.sort_values(x_col)

    fig = px.bar(data, x=x_col, y='value', title=title, text_auto='.2s',
                 category_orders={x_col: data[x_col].tolist()})
    fig.update_layout(xaxis_title=x_col, yaxis_title=y_label)
    return fig



## 3. Data Loading & Exploration

In [3]:
# Load Data
# The CSV path is relative to the directory the notebook is run from.
df = pd.read_csv('data/raw_ai_job_market.csv')

# Basic Inspection: (rows, columns) followed by the first five records
print(f"Dataset Shape: {df.shape}")
df.head()

Dataset Shape: (2000, 12)


Unnamed: 0,job_id,company_name,industry,job_title,skills_required,experience_level,employment_type,location,salary_range_usd,posted_date,company_size,tools_preferred
0,1,Foster and Sons,Healthcare,Data Analyst,"NumPy, Reinforcement Learning, PyTorch, Scikit...",Mid,Full-time,"Tracybury, AR",92860-109598,2025-08-20,Large,"KDB+, LangChain"
1,2,"Boyd, Myers and Ramirez",Tech,Computer Vision Engineer,"Scikit-learn, CUDA, SQL, Pandas",Senior,Full-time,"Lake Scott, CU",78523-144875,2024-03-22,Large,"FastAPI, KDB+, TensorFlow"
2,3,King Inc,Tech,Quant Researcher,"MLflow, FastAPI, Azure, PyTorch, SQL, GCP",Entry,Full-time,"East Paige, CM",124496-217204,2025-09-18,Large,"BigQuery, PyTorch, Scikit-learn"
3,4,"Cooper, Archer and Lynch",Tech,AI Product Manager,"Scikit-learn, C++, Pandas, LangChain, AWS, R",Mid,Full-time,"Perezview, FI",50908-123743,2024-05-08,Large,"TensorFlow, BigQuery, MLflow"
4,5,Hall LLC,Finance,Data Scientist,"Excel, Keras, SQL, Hugging Face",Senior,Contract,"North Desireeland, NE",98694-135413,2025-02-24,Large,"PyTorch, LangChain"


In [4]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_id            2000 non-null   int64 
 1   company_name      2000 non-null   object
 2   industry          2000 non-null   object
 3   job_title         2000 non-null   object
 4   skills_required   2000 non-null   object
 5   experience_level  2000 non-null   object
 6   employment_type   2000 non-null   object
 7   location          2000 non-null   object
 8   salary_range_usd  2000 non-null   object
 9   posted_date       2000 non-null   object
 10  company_size      2000 non-null   object
 11  tools_preferred   2000 non-null   object
dtypes: int64(1), object(11)
memory usage: 187.6+ KB


In [5]:
# Summary of the object-dtype columns (count, unique, top, freq), one row per column
df.describe(include=[object]).T

Unnamed: 0,count,unique,top,freq
company_name,2000,1909,Johnson LLC,4
industry,2000,7,Automotive,300
job_title,2000,8,Data Analyst,271
skills_required,2000,1985,"Power BI, C++, NumPy",2
experience_level,2000,3,Entry,702
employment_type,2000,4,Internship,574
location,2000,2000,"Tracybury, AR",1
salary_range_usd,2000,2000,92860-109598,1
posted_date,2000,680,2023-10-03,10
company_size,2000,3,Startup,672


In [6]:
# Low-cardinality categorical columns whose distinct values are worth listing in full
cols_to_print = ['industry', 'job_title', 'experience_level', 'employment_type', 'company_size']

# Print each column's distinct values under its own header
for col in cols_to_print:
    print(f"\n=== Unique values for: {col} ===")
    unique_vals = df[col].unique()
    print(unique_vals)


=== Unique values for: industry ===
['Healthcare' 'Tech' 'Finance' 'E-commerce' 'Automotive' 'Education'
 'Retail']

=== Unique values for: job_title ===
['Data Analyst' 'Computer Vision Engineer' 'Quant Researcher'
 'AI Product Manager' 'Data Scientist' 'ML Engineer' 'NLP Engineer'
 'AI Researcher']

=== Unique values for: experience_level ===
['Mid' 'Senior' 'Entry']

=== Unique values for: employment_type ===
['Full-time' 'Contract' 'Remote' 'Internship']

=== Unique values for: company_size ===
['Large' 'Startup' 'Mid']


In [7]:
# Number of missing values per column
df.isna().sum()

job_id              0
company_name        0
industry            0
job_title           0
skills_required     0
experience_level    0
employment_type     0
location            0
salary_range_usd    0
posted_date         0
company_size        0
tools_preferred     0
dtype: int64

In [8]:
# Number of rows that duplicate an earlier row across all columns
df.duplicated().sum()

0

## 4. Data Preprocessing & Engineering

### 4.1 Basic Cleaning

In [9]:
# Drop the job_id column; none of the analysis cells below use it
df = df.drop(columns=['job_id'])

### 4.2 Feature Engineering: Location (City/Country)

In [10]:
# Split "City, CODE": the city is the text before the first comma and the
# region code is the text after the last comma. Splitting on a bare ',' and
# stripping copes with separators that lack a following space.
location_parts = df['location'].str.split(',')
df['city'] = location_parts.str[0].str.strip()
# Rows without any comma carry no region code; leave it missing instead of
# failing on the whole column.
df['country_code'] = location_parts.str[-1].str.strip().where(location_parts.str.len() > 1)
df = df.drop(columns=['location'])


# Code -> name lookup; JSON is read as UTF-8 regardless of the platform default
with open('data/state_map.json', 'r', encoding='utf-8') as f:
    location_map = json.load(f)

# Map it using the JSON; codes missing from the lookup are kept as-is
df['country'] = df['country_code'].map(location_map).fillna(df['country_code'])
df = df.drop(columns=['country_code'])

### 4.3 Feature Engineering: Salary Parsing

In [11]:
# "low-high" salary text -> two float columns
salary_bounds = df['salary_range_usd'].str.split('-', n=1, expand=True).astype(float)
df[['min_salary', 'max_salary']] = salary_bounds

# Midpoint of the advertised range
df['avg_salary'] = df['min_salary'].add(df['max_salary']).div(2)

# The raw text column is no longer needed once parsed
df = df.drop('salary_range_usd', axis=1)

### 4.4 Feature Engineering: Date & Time

In [12]:
# Parse once, then derive every calendar feature from the parsed series
posted = pd.to_datetime(df['posted_date'])
df['posted_date'] = posted
df['year'] = posted.dt.year
df['month'] = posted.dt.month_name()
df['day_name'] = posted.dt.day_name()
df['quarter'] = 'Q' + posted.dt.quarter.astype(str)

### 4.5 Feature Engineering: Skills (One-Hot Encoding)

In [13]:
# Merge required skills and preferred tools into one cleaned list per posting
combined = df['skills_required'] + ',' + df['tools_preferred']
df['skills'] = pd.Series(
    [[token.strip() for token in entry.split(',') if token.strip()] for entry in combined],
    index=df.index,
)

# One indicator column per distinct skill
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['skills'])

skills_df = pd.DataFrame(skills_encoded, columns=mlb.classes_, index=df.index)

df = pd.concat([df, skills_df], axis=1)

# The text/list columns are superseded by the indicator columns
df = df.drop(columns=['skills_required', 'tools_preferred', 'skills'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   company_name            2000 non-null   object        
 1   industry                2000 non-null   object        
 2   job_title               2000 non-null   object        
 3   experience_level        2000 non-null   object        
 4   employment_type         2000 non-null   object        
 5   posted_date             2000 non-null   datetime64[ns]
 6   company_size            2000 non-null   object        
 7   city                    2000 non-null   object        
 8   country                 2000 non-null   object        
 9   min_salary              2000 non-null   float64       
 10  max_salary              2000 non-null   float64       
 11  avg_salary              2000 non-null   float64       
 12  year                    2000 non-null   int32   

## 5. Exploratory Data Analysis (EDA)

### 5.1 Market Overview

In [14]:
# 1. Who is hiring the most? (top 10 companies by number of postings)
plot_bar_chart(df, 'company_name', '1. Who is hiring the most?', top_n=10).show()

# 2. What are the most common roles? (grouped histogram, split by experience level)
plot_grouped_histogram(df, 'job_title', 'experience_level', '2. What are the most common roles? (by Experience)').show()

# 3. Which industries are dominating?
plot_bar_chart(df, 'industry', '3. Which industries are dominating?', top_n=10).show()

# 4. Which employment types are dominating? (donut chart)
plot_pie_chart(df, 'employment_type', '4. Which employment types are dominating?').show()

# 5. Which company sizes are dominating? (donut chart)
plot_pie_chart(df, 'company_size', '5. Which company sizes are dominating?').show()

### 5.2 Salary & Compensation

In [15]:
# 1. How are salaries distributed? (histogram + marginal box of the range midpoint)
plot_distribution(df, 'avg_salary', '1. How are salaries distributed?').show()

# 2. Does experience pay off? (Grouped Box: Job Title vs Salary colored by Experience)
plot_grouped_box(df, 'job_title', 'avg_salary', 'experience_level', '2. Does experience pay off? (Salary by Role & Exp)').show()

# 3. Who pays better? (Grouped Box: Company Size vs Salary colored by Experience)
plot_grouped_box(df, 'company_size', 'avg_salary', 'experience_level', '3. Who pays better? (Company Size & Exp)').show()

# 4. Salary vs employment type (boxes ordered by ascending median salary)
plot_box_comparison(df, 'employment_type', 'avg_salary', '4. Salary vs Employment Type', color='employment_type').show()

### 5.3 Location & Geography

In [16]:
# 1. Global AI Hubs (Map)
# NOTE(review): only this plot is guarded; the statements below read
# df['country'] unconditionally, so the guard does not protect the cell.
if 'country' in df.columns:
    plot_choropleth_map(df, 'country', 'Global AI Jobs Distribution').show()

# 2. Top Countries by Job Count
plot_bar_chart(df, 'country', 'Top 10 Countries by Job Count', top_n=10).show()

# 3. Location Impact on Salary (Top 10 Countries)
top_countries = df['country'].value_counts().head(10).index
df_top_countries = df[df['country'].isin(top_countries)]
plot_box_comparison(df_top_countries, 'country', 'avg_salary', 'Salary Distribution in Top 10 Countries').show()

# 4. Industry Concentration by Country (Heatmap)
# Restricted to the top-10 countries and the (up to) 10 most frequent industries
top_industries = df['industry'].value_counts().head(10).index
df_filtered_loc = df[df['country'].isin(top_countries) & df['industry'].isin(top_industries)]
plot_heatmap(df_filtered_loc, 'industry', 'country', 'Industry Concentration by Country').show()

### 5.4 Time Series Analysis

In [17]:
# 1. Jobs per Time Unit
# y_col is passed as None because plot_time_trend only reads it when averaging
plot_time_trend(df, 'quarter', None, 'Hiring Volume per Quarter', 'count').show()
plot_time_trend(df, 'month', None, 'Hiring Volume per Month', 'count').show()
plot_time_trend(df, 'day_name', None, 'Hiring Volume per Day', 'count').show()

# 2. Salary Trends (any agg_func other than 'count' averages y_col per time unit)
plot_time_trend(df, 'quarter', 'avg_salary', 'Average Salary per Quarter', 'mean').show()
plot_time_trend(df, 'month', 'avg_salary', 'Average Salary per Month', 'mean').show()

### 5.5 Skills Analysis

In [18]:
# %%
# Define key skills to analyze (ensure they exist in columns)
target_skills = ['AWS', 'Azure', 'BigQuery', 'C++', 'CUDA', 'Excel', 'FastAPI', 'Flask', 'GCP', 
                 'Hugging Face', 'KDB+', 'Keras', 'LangChain', 'MLflow', 'NumPy', 'Pandas', 
                 'Power BI', 'PyTorch', 'Python', 'R', 'Reinforcement Learning', 'SQL', 
                 'Scikit-learn', 'TensorFlow']

# Keep only the skills that were actually produced as indicator columns
existing_skills = [col for col in target_skills if col in df.columns]

# 1. Standard Tech Stack (Top Skills)
# Count = postings listing the skill; percent = share of all postings
skills_sum = df[existing_skills].sum().sort_values(ascending=False).reset_index()
skills_sum.columns = ['Skill', 'Count']
skills_sum['percent'] = (skills_sum['Count'] / len(df) * 100).round(1)

fig_stack = px.bar(skills_sum.head(20), x='Count', y='Skill', orientation='h', 
                   title='Top 20 In-Demand Skills', text='percent', color='Count')
fig_stack.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig_stack.update_layout(yaxis={'categoryorder':'total ascending'}, showlegend=False, height=600)
fig_stack.show()

# 2. Skill Definition per Role (Heatmap: rows = job titles, columns = skills)
job_skill_matrix = df.groupby('job_title')[existing_skills].sum()
fig_roles = px.imshow(job_skill_matrix, title='Skill Frequency per Job Title', aspect='auto',
                      labels=dict(x="Skill", y="Job Title", color="Count"))
fig_roles.show()

# 3. Skill Demand by Company Size (Heatmap: rows = company sizes, columns = skills)
size_skill_matrix = df.groupby('company_size')[existing_skills].sum()
fig_startups = px.imshow(size_skill_matrix, title='Skill Demand by Company Size', aspect='auto',
                         labels=dict(x="Skill", y="Company Size", color="Count"))
fig_startups.show()

# 4. Tech Stack Complexity
# Number of tracked skills listed per posting (row-wise sum of the indicators)
df['skills_count'] = df[existing_skills].sum(axis=1)
plot_distribution(df, 'skills_count', 'Tech Stack Complexity (Skills per Job)').show()

## 6. Conclusion

Based on our comprehensive analysis of the AI Job Market dataset, we have derived the following key insights that define the current landscape:

### 1. A Balanced Market Ecosystem
Contrary to markets dominated by a few tech giants, our analysis reveals a **highly balanced ecosystem**.
* **Hiring Volume:** There is no massive gap between top hiring companies; the demand is evenly distributed.
* **Industry & Company Size:** Similarly, job opportunities are spread consistently across various industries and company sizes (Startups vs. Large firms), indicating a healthy and diverse market.

### 2. The "Entry-Level" Boom
The data highlights a **strong preference for Entry-level talent** over Senior roles in terms of volume. This suggests the AI sector is still in an expansion phase, eager to onboard new talent to build foundational teams.

### 3. Location is the Key Differentiator
While industry and company size show uniform distributions, **Geography (Location)** emerges as the most significant factor influencing **Salary**. Where you work matters more than who you work for when it comes to compensation variance.

### 4. Strategic Timing (Seasonality)
For job seekers, timing plays a crucial role:
* **Best Quarter:** **Q1 (First Quarter)** is the most active hiring season, likely due to new fiscal year budgets.
* **Peak Month:** Surprisingly, **August** stands out as the single month with the highest hiring activity, marking a strategic window for applications.

### 5. The Must-Have Tech Stack
To remain competitive, candidates must master the core tools that appear in almost every job description. The top two non-negotiable skills are:
1. **TensorFlow**
2. **FastAPI**

---
**Final Recommendation:**
Job seekers should focus on mastering the core tech stack and applying aggressively during **Q1 and late summer (August)**. Since the market is friendly to entry-level roles, building a strong portfolio is key to seizing these abundant opportunities.