<a href="https://colab.research.google.com/github/clark1031/australian-ai-salary-analysis/blob/main/australian_ai_salary_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================================================================
# A Predictive and Interpretive Analysis of Salary Drivers
# in the Australian AI Skills Landscape
#
# Author: Jiansong Zhang
# Course: Big Data Analysis and Project
# Date: July 2025
# ==============================================================================

# --- Import Core Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

In [None]:
# ==============================================================================
# PHASE 1: DATA SOURCING, FILTERING & PRE-PROCESSING
# ==============================================================================
print("--- Phase 1: Data Sourcing, Filtering & Pre-processing ---")

# --- Step 1.1: Authenticate and Load Data from Kaggle ---
# Optional Colab-only authentication (left disabled). It expects the user to
# upload their own kaggle.json API key.
# try:
#     from google.colab import files
#     print("Please upload your kaggle.json file:")
#     files.upload()
#     !mkdir -p ~/.kaggle
#     !mv kaggle.json ~/.kaggle/
#     !chmod 600 ~/.kaggle/kaggle.json
#     kagglehub.login()
#     print("Kaggle Hub Authenticated.")
# except ImportError:
#     print("Not in a Colab environment. Assuming local Kaggle setup.")

# The dataset is read from a local CSV; point the path below at your copy.
# NOTE: to load through kagglehub instead of a local file, swap the
# read_csv call for the following (kept disabled here):
# print("\nLoading dataset from Kaggle...")
# df = kagglehub.load_dataset(
#     "bismasajjad/global-ai-job-market-and-salary-trends-2025",
#     file_path="ai_job_dataset.csv",
# )
df = pd.read_csv('path/to/your/ai_job_dataset.csv')
print(f"Dataset loaded successfully. Shape: {df.shape}")

# --- Step 1.2: Filter for Australian Data and Initial Cleaning ---
# Keep only postings whose company is located in Australia; .copy() detaches
# the subset from the global frame so later column assignments are safe.
df_au = df.loc[df['company_location'].eq('Australia')].copy()

# Skills and salary are both required downstream, so discard any row that is
# missing either of them.
df_cleaned_au = df_au.dropna(subset=['required_skills', 'salary_usd']).copy()
print(f"Filtered to {len(df_cleaned_au)} clean Australian records.")

In [None]:
# ==============================================================================
# PHASE 2: EXPLORATORY DATA ANALYSIS (EDA) & FEATURE ENGINEERING
# ==============================================================================
print("\n--- Phase 2: EDA & Feature Engineering ---")

# --- Step 2.1: Skill Clustering (Feature Engineering) ---
# Lookup table: raw (lower-case) skill name -> broader skill category.
# Skills that do not appear here are left as-is by the mapping step.
skill_map = {
    # Business intelligence / reporting tools
    'tableau': 'business_intelligence',
    'power bi': 'business_intelligence',
    'data visualization': 'business_intelligence',
    # Cloud platforms
    'aws': 'cloud_computing',
    'gcp': 'cloud_computing',
    'azure': 'cloud_computing',
    # Deep learning frameworks
    'pytorch': 'deep_learning_framework',
    'tensorflow': 'deep_learning_framework',
    'keras': 'deep_learning_framework',
    # Container tooling
    'docker': 'containerization',
    'kubernetes': 'containerization',
    # Application domains
    'nlp': 'natural_language_processing',
    'computer vision': 'computer_vision',
}

def apply_skill_mapping(skill_list, mapping=None):
    """Map a list of raw skills to their higher-level categories.

    Args:
        skill_list: Iterable of raw skill strings (the caller is expected to
            have lower-cased them already).
        mapping: Optional dict of raw skill -> category. When omitted, the
            module-level ``skill_map`` is used.

    Returns:
        A list of unique category names in alphabetical order. Skills with no
        entry in the mapping are passed through unchanged.
    """
    if mapping is None:
        mapping = skill_map

    mapped_set = set()
    for skill in skill_list:
        # Stray whitespace around a token would otherwise miss the lookup
        # (e.g. 'aws ' != 'aws'); empty tokens come from trailing commas.
        skill = skill.strip()
        if not skill:
            continue
        mapped_set.add(mapping.get(skill, skill))

    # Sort so the result is reproducible: plain set iteration order for
    # strings changes between interpreter runs (hash randomisation), which
    # would shuffle tie ordering in downstream value_counts().
    return sorted(mapped_set)

# Turn each posting's comma-separated skill string into a lower-cased list,
# then collapse the individual skills into their broader categories.
df_cleaned_au['skills_list'] = (
    df_cleaned_au['required_skills']
    .str.lower()
    .str.split(r',\s*')
)
df_cleaned_au['skills_mapped'] = df_cleaned_au['skills_list'].apply(apply_skill_mapping)
print("\nSkill clustering applied.")

# --- Step 2.2: EDA Visualizations ---
print("Generating EDA visualizations...")
# Use the white-grid seaborn theme for every figure that follows.
sns.set_style("whitegrid")

# --- New Skill Counting Function ---
def get_mapped_skill_counts(dataframe):
    """Count how many rows mention each mapped skill category.

    Args:
        dataframe: DataFrame holding a 'skills_mapped' column whose cells are
            lists of skill categories.

    Returns:
        Series indexed by skill category with the number of occurrences,
        ordered from most to least frequent.
    """
    # One row per (posting, skill) pair, then tally the skills.
    one_skill_per_row = dataframe['skills_mapped'].explode()
    return one_skill_per_row.value_counts()

# Frequency of every mapped skill category, and the ten most common ones.
mapped_au_skill_counts = get_mapped_skill_counts(df_cleaned_au)
top_10_australia_mapped = mapped_au_skill_counts.iloc[:10]

# Visualization 1: horizontal bar chart of the ten most frequent categories
plt.figure(figsize=(10, 8))
sns.barplot(
    x=top_10_australia_mapped.values,
    y=top_10_australia_mapped.index,
)
plt.xlabel('Number of Job Postings', fontsize=12)
plt.ylabel('Skill Category', fontsize=12)
plt.title('Figure 1: Top 10 Most In-Demand Skill Categories in Australia', fontsize=16)
plt.show()

# Visualization 2: histogram (with KDE overlay) of the salary column
plt.figure(figsize=(10, 6))
sns.histplot(data=df_cleaned_au['salary_usd'], bins=30, kde=True)
plt.xlabel('Annual Salary (USD)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Fig. A: Distribution of AI Salaries in Australia (USD)', fontsize=16)
plt.show()

# Visualization 3: salary box plots, one box per experience level
plt.figure(figsize=(10, 7))
# Fix the x-axis order to entry -> mid -> senior -> executive.
exp_order = ['EN', 'MI', 'SE', 'EX']
sns.boxplot(
    data=df_cleaned_au,
    x='experience_level',
    y='salary_usd',
    order=exp_order,
)
plt.xlabel('Experience Level (Entry, Mid, Senior, Executive)', fontsize=12)
plt.ylabel('Annual Salary (USD)', fontsize=12)
plt.title('Fig. B: Salary Distribution by Experience Level in Australia', fontsize=16)
plt.show()


In [None]:
# ==============================================================================
# PHASE 3: FINAL FEATURE PREPARATION FOR MODELLING
# ==============================================================================
print("\n--- Phase 3: Final Feature Preparation for Modelling ---")

# --- Step 3.1: One-Hot Encode Categorical Features ---
# Pin the category order explicitly instead of relying on get_dummies'
# alphabetical ordering of whatever levels happen to be present:
#   * 'EN' is always the dropped baseline (drop_first removes the first
#     category), so coefficients are always relative to entry level;
#   * exp_MI / exp_SE / exp_EX always exist, even if a level is absent from
#     the Australian subset, so Step 3.3 cannot fail with a KeyError.
# Rows whose level is missing or not in this list get all-zero dummies.
experience_levels = ['EN', 'MI', 'SE', 'EX']
df_model_ready = df_cleaned_au.assign(
    experience_level=pd.Categorical(
        df_cleaned_au['experience_level'], categories=experience_levels
    )
)
# dtype=int keeps the dummies numeric (0/1) regardless of pandas version,
# matching the has_<skill> indicator columns created below.
df_model_ready = pd.get_dummies(
    df_model_ready,
    columns=['experience_level'],
    prefix='exp',
    drop_first=True,
    dtype=int,
)
print("One-hot encoded 'experience_level'.")

# --- Step 3.2: Create Binary Skill Features ---
# One 0/1 indicator column per skill category used by the models.
top_skills_for_model = ['cloud_computing', 'deep_learning_framework', 'business_intelligence', 'containerization']
for skill in top_skills_for_model:
    # Bind the current skill as a default argument so the lambda does not
    # depend on the loop variable's value at call time.
    df_model_ready[f'has_{skill}'] = df_model_ready['skills_mapped'].apply(
        lambda skills, target=skill: int(target in skills)
    )
print("Created binary features for top skills.")

# --- Step 3.3: Define Final Feature Sets and Target Variable ---
y = df_model_ready['salary_usd']
feature_columns_full = ['years_experience', 'exp_MI', 'exp_SE', 'exp_EX'] + [f'has_{skill}' for skill in top_skills_for_model]
X_full = df_model_ready[feature_columns_full]

# --- Step 3.4: Split Data into Training and Testing Sets ---
# 80/20 split; the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.2, random_state=42)
print(f"\nData split into {len(X_train)} training records and {len(X_test)} testing records.")

In [None]:
# ==============================================================================
# PHASE 4: PREDICTIVE MODELLING & EVALUATION
# ==============================================================================
print("\n--- Phase 4: Predictive Modelling & Evaluation ---")

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error


def _test_metrics_row(model_name, y_true, y_hat):
    """Return one comparison-table row (name, R-squared, MAE) for a model."""
    return {
        "Model": model_name,
        "R-squared": r2_score(y_true, y_hat),
        "Mean Absolute Error (MAE)": mean_absolute_error(y_true, y_hat),
    }


# --- Step 4.1: Train and Evaluate Initial (Default) Models ---
# Baselines with default-style settings; each is fitted on the training
# split and scored on the held-out test split.
models_initial = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression (Default)": Ridge(alpha=1.0),
    "Random Forest (Default)": RandomForestRegressor(n_estimators=100, random_state=42),
}

results = []
for name, model in models_initial.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append(_test_metrics_row(name, y_test, y_pred))
print("\nInitial models trained and evaluated.")

# --- Step 4.2: Hyperparameter Tuning for Refined Models ---
print("Performing hyperparameter tuning...")

# Ridge: 5-fold grid search over the regularisation strength.
param_grid_ridge = {'alpha': [0.1, 1.0, 10.0, 100.0]}
grid_search_ridge = GridSearchCV(
    Ridge(),
    param_grid_ridge,
    cv=5,
    scoring='r2',
)
grid_search_ridge.fit(X_train, y_train)
tuned_ridge_model = grid_search_ridge.best_estimator_
print(f"Best alpha for Ridge Regression found: {grid_search_ridge.best_params_['alpha']}")

# Random forest: 3-fold grid search over tree count and depth, run on all
# available cores.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, None],
    'max_features': ['sqrt'],
}
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)
tuned_rf_model = grid_search_rf.best_estimator_
print(f"Best parameters for Random Forest found: {grid_search_rf.best_params_}")

# --- Step 4.3: Evaluate Tuned Models ---
# The best estimators come back from the grid search already fitted, so they
# are only scored here, on the same test split as the baselines.
models_tuned = {
    "Ridge Regression (Tuned)": tuned_ridge_model,
    "Random Forest (Tuned)": tuned_rf_model,
}

for name, model in models_tuned.items():
    y_pred = model.predict(X_test)
    results.append(_test_metrics_row(name, y_test, y_pred))
print("\nTuned models evaluated.")

# --- Step 4.4: Display Comprehensive Performance Comparison ---
df_performance = pd.DataFrame(results)
# Convert the metrics to display strings: 3-decimal R-squared, and MAE as a
# whole-dollar amount with thousands separators.
df_performance['R-squared'] = df_performance['R-squared'].map(lambda value: '{:.3f}'.format(value))
df_performance['Mean Absolute Error (MAE)'] = df_performance['Mean Absolute Error (MAE)'].map(
    lambda value: '${:,.0f}'.format(value)
)

print("\n--- Comprehensive Model Performance Comparison on Test Set ---")
print(df_performance.to_string())

In [None]:
# ==============================================================================
# PHASE 5: INTERPRETATION OF BEST MODEL
# ==============================================================================
print("\n--- Phase 5: Interpretation of Best Model (Linear Regression) ---")

# --- Step 5.1: Analyze Multicollinearity in Experience Features ---
# Pairwise correlations between years of experience and the experience-level
# dummies, to see how strongly these predictors overlap.
experience_features = ['years_experience', 'exp_MI', 'exp_SE', 'exp_EX']
experience_correlation = df_model_ready.loc[:, experience_features].corr()
print("\n--- Correlation Matrix of Experience-Related Features ---")
print(experience_correlation)

# --- Step 5.2: Extract and Display Model Coefficients ---
# Interpretation uses the plain (unpenalized) Linear Regression fitted in
# Phase 4; its coefficients are listed against the feature column names.
lr_model = models_initial["Linear Regression"]
coefficients = pd.DataFrame(
    data=lr_model.coef_,
    index=X_full.columns,
    columns=['Coefficient (Monetary Value)'],
)
print("\n--- Multiple Linear Regression Model Coefficients ---")
print(coefficients)

print("\n--- End of Analysis ---")