<a href="https://colab.research.google.com/github/eliaahadi/notebooks/blob/main/AI_Job_Market_Trends.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
abhishekjaiswal4896_ai_job_market_trends_path = kagglehub.dataset_download('abhishekjaiswal4896/ai-job-market-trends')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/abhishekjaiswal4896/ai-job-market-trends?dataset_version_number=1...


100%|██████████| 91.8k/91.8k [00:00<00:00, 29.4MB/s]

Extracting files...
Data source import complete.





In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from tabulate import tabulate

# display settings
sns.set(style='whitegrid', palette='muted', color_codes=True)

# predictive modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# set random seed
import random
random.seed(42)
np.random.seed(42)

import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv("/kaggle/input/ai-job-market-trends/ai_job_market.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/ai-job-market-trends/ai_job_market.csv'

In [None]:
df.tail()


In [None]:
df.dtypes


In [None]:
df.describe()


In [None]:
df.isnull().sum()


In [None]:
df.describe().T.plot(kind='bar')


In [None]:
numeric_cols = df.select_dtypes(include=['number']).columns

# Distribution plot for each numerical column
for col in numeric_cols:
    sns.histplot(x=col, data=df, kde=True)
    plt.show()

In [None]:
# Plot 1: Countplot of Industry Distribution
plt.figure(figsize=(12, 6))
sns.countplot(data=df, y='industry', order=df['industry'].value_counts().index)
plt.title('Number of Job Listings per Industry')
plt.xlabel('Count')
plt.ylabel('Industry')
plt.tight_layout()
plt.show()

# Plot 2: Countplot of Employment Type
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='employment_type', order=df['employment_type'].value_counts().index)
plt.title('Distribution of Employment Types')
plt.xlabel('Employment Type')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Plot 3: Histogram of Posted Dates
plt.figure(figsize=(10, 6))
sns.histplot(df['posted_date'].dropna(), bins=30, kde=False)
plt.title('Distribution of Job Posting Dates')
plt.xlabel('Posted Date')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# If the dataset had more numeric columns, we could create a correlation heatmap.
numeric_df = df.select_dtypes(include=[np.number])
if numeric_df.shape[1] >= 4:
    plt.figure(figsize=(10, 8))
    corr = numeric_df.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Numeric Features')
    plt.tight_layout()
    plt.show()
else:
    print('Not enough numeric features for a correlation heatmap.')

In [None]:
# For the predictive model, we will use a subset of features that we suspect have predictive power for 'experience_level'.
# Selected features: 'industry', 'employment_type', 'company_size'

# Check that the target and the features exist in the dataset
selected_features = ['industry', 'employment_type', 'company_size']

if all([col in df.columns for col in selected_features + ['experience_level']]):
    model_df = df[selected_features + ['experience_level']].copy()

    # Drop rows with missing values in these columns
    model_df.dropna(inplace=True)

    # Encoding categorical features using pandas' factorize method
    for col in selected_features + ['experience_level']:
        model_df[col] = pd.factorize(model_df[col])[0]

    # Split data into training and testing sets (80/20 split)
    X = model_df[selected_features]
    y = model_df['experience_level']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Instantiate and train a RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Prediction Accuracy:', accuracy)
    print('\nClassification Report:\n', classification_report(y_test, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix for Experience Level Prediction')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.show()

    # Permutation Importance Plot (simple bar plot for feature importances)
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(8, 6))
    plt.barh(range(len(indices)), importances[indices], align='center')
    plt.yticks(range(len(indices)), [selected_features[i] for i in indices])
    plt.xlabel('Feature Importance')
    plt.title('Permutation Importance')
    plt.tight_layout()
    plt.show()
else:
    print('Required columns for predictive modeling are missing from the dataset.')