<a href="https://colab.research.google.com/github/cheonghf/ML-P4-03/blob/main/V2_Project_SourceCode_P4_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Salary Data CSV from Kaggle to Colab as Dataframe

*   Dataset is sourced from https://www.kaggle.com/datasets/mohithsairamreddy/salary-data/data


In [None]:
import pandas as pd
import numpy as np
import kagglehub

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Pull Salary_Data.csv from the Kaggle dataset (latest published version)
dataset_ref = kagglehub.dataset_download(
    'mohithsairamreddy/salary-data',
    path='Salary_Data.csv',
)

# df_sd1 keeps the data exactly as loaded; df_sd2 is the working copy
# that the pre-processing cells below modify.
df_sd1 = pd.read_csv(dataset_ref)
df_sd2 = df_sd1.copy()

for label, frame in (('Original Data Frame:', df_sd1), ('Copied Data Frame:', df_sd2)):
    print(label, frame.shape)

#Step 1: Pre-Processing of Data

*   Run Everything



View Columns of 'Salary Data' Dataframe

In [None]:
# Show the column labels of the working copy before they are renamed in the next cell.
print(df_sd2.columns)

Rename columns in the 'Salary Data' DataFrame by simplifying them and removing spaces; this makes the columns easier to reference in code.

In [None]:
# Short, space-free aliases for the three multi-word column names
column_aliases = {
    "Education Level": "EduLevel",
    "Job Title": "JobTitle",
    "Years of Experience": "YrsExp",
}
df_sd2.rename(columns=column_aliases, inplace=True)
print("Updated Columns are:", df_sd2.columns)

## Step 1.1: What did we do if our Dataframe has Null values?

View the full 'Salary Data' Dataframe and Identify if it holds any Null Values

In [None]:
# Dump the working frame, followed by an end marker and a blank line
print(df_sd2, "===Data Frame End===", '', sep='\n')

# Per-column count of missing values in the working frame
print('Null Values are at:', df_sd2.isnull().sum(), '', sep='\n')

1. Remove identified Null Values from the Dataframe
2. Run a check on the DataFrame to confirm if the Null Values are removed
3. We will go a step further and check if hidden Null Values could be represented by a string 'Na'



In [None]:
# Remove rows with missing values
df_sd2.dropna(inplace=True)

# Confirm that no true nulls remain (every count should now be 0)
print(df_sd2.isnull().sum())
print('')

# Missing values can also hide as literal text. Compare case-insensitively and
# ignore surrounding whitespace so 'na', 'Na', 'NA' and ' na ' are all detected.
for column in df_sd2.columns:
    na_count = df_sd2[column].astype(str).str.strip().str.lower().eq('na').sum()
    if na_count > 0:
        print(f"Column '{column}' contains {na_count} rows with the string 'na'")
    else:
        print(f"Column '{column}' does not contain the string 'na'")

## Step 1.2: What did we do if our Dataframe has Duplicate Entries?

Check for Duplicate Entries in 'Salary Data' Dataframe

In [None]:
# Boolean mask: True for every row that repeats an earlier row
duplicates = df_sd2.duplicated()

# Derive the headline counts from the mask
total_rows = len(df_sd2)
num_duplicates = duplicates.sum()
num_non_duplicates = total_rows - num_duplicates
percentage_duplicates = (num_duplicates / total_rows) * 100

# Report the counts, followed by a blank line
print(
    f"Number of duplicate rows: {num_duplicates}",
    f"Number of non-duplicate rows: {num_non_duplicates}",
    f"Percentage of duplicate rows: {percentage_duplicates:.2f}%",
    '',
    sep='\n',
)

# List the repeated rows themselves when there are any
if num_duplicates > 0:
    print("Duplicate rows:")
    print(df_sd2.loc[duplicates])

Filter Duplicate Rows of Data into another Data Frame

In [None]:
# Group on every column so identical rows collapse into one group
all_columns = df_sd2.columns.tolist()

# How many times each repeated row re-appears (first occurrences excluded)
duplicate_counts = (
    df_sd2.loc[df_sd2.duplicated()]
    .groupby(all_columns)
    .size()
    .reset_index(name="Count")
)

# One row per distinct record together with how often it occurs; the
# "after removing duplicates" histograms below are drawn from this frame.
df_duplicates = (
    df_sd2
    .groupby(all_columns)
    .size()
    .reset_index(name='Duplicate Count')
)

Histogram Comparison for 'Age' Duplicate Filter Before Vs After

In [None]:
# Side-by-side Age histograms: working frame (left) vs de-duplicated rows (right)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax_before, df_sd2, 'blue', "Age Distribution Before Removing Duplicates"),
    (ax_after, df_duplicates, 'green', "Age Distribution After Removing Duplicates"),
)
for ax, frame, colour, title in panels:
    sns.histplot(frame['Age'].dropna(), bins=20, kde=True, color=colour, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Age")
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

Histogram Comparison for 'Salary' Duplicate Filter Before Vs After

In [None]:
# Side-by-side Salary histograms: working frame (left) vs de-duplicated rows (right)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax_before, df_sd2, 'blue', "Salary Distribution Before Removing Duplicates"),
    (ax_after, df_duplicates, 'green', "Salary Distribution After Removing Duplicates"),
)
for ax, frame, colour, title in panels:
    sns.histplot(frame['Salary'].dropna(), bins=20, kde=True, color=colour, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Salary")
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

Histogram Comparison for 'YrsExp' Duplicate Filter Before Vs After

In [None]:
# Side-by-side YrsExp histograms: working frame (left) vs de-duplicated rows (right)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax_before, df_sd2, 'blue', "Years of Experience Distribution Before Removing Duplicates"),
    (ax_after, df_duplicates, 'green', "Years of Experience Distribution After Removing Duplicates"),
)
for ax, frame, colour, title in panels:
    sns.histplot(frame['YrsExp'].dropna(), bins=20, kde=True, color=colour, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Years of Experience")
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

Removing Duplicates from the Dataframe

*   We chose to remove them to prevent our model from overfitting



In [None]:
# Keep the first occurrence of every repeated row and renumber the index from 0
df_sd3 = df_sd2.drop_duplicates(keep='first').reset_index(drop=True)

# Compare the row/column counts before and after de-duplication
for label, frame in (
    ("Original DataFrame shape:", df_sd2),
    ("DataFrame shape after removing duplicates:", df_sd3),
):
    print(label, frame.shape)

## Step 1.3: Identifying whether there is any negligible data to remove from the 'Salary Data' Dataframe

Understanding values inside 'Salary Data' Dataframe

* Data Type in each columns
* Summary Statistic for each numerical columns
* Frequency count for unique data in each columns



In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
df_sd3.info()
print('')

# Summary statistics for the numerical columns
print(df_sd3.describe())
print('')

# Frequency table for every text (object-dtype) column
string_columns = df_sd3.select_dtypes(include=['object']).columns

for column in string_columns:
    frequency_table = df_sd3[column].value_counts().to_frame()
    print(f"Column: {column}")
    print(frequency_table)
    print("\n")

Determine whether 'Other' in the 'Gender' column is significant, i.e. whether it accounts for less than 2% of the rows

In [None]:
# Share of rows whose Gender is 'Other'. Series.get falls back to 0 when the
# category is absent (for example when this cell is re-run after the removal
# cell below), where plain indexing would raise KeyError.
gender_counts = df_sd3['Gender'].value_counts()
other_percentage = (gender_counts.get('Other', 0) / len(df_sd3)) * 100
print(f"Percentage of 'Other' in Gender: {other_percentage:.2f}%")

Remove 'Other' and its relevant row of data from the column 'Gender'

In [None]:
# Keep every row whose Gender is not 'Other', then renumber the index from 0
keep_mask = df_sd3['Gender'] != 'Other'
df_sd3 = df_sd3.loc[keep_mask].reset_index(drop=True)

# Show which Gender values remain and the resulting frame size
print('Updated Unique Values under "Gender":', df_sd3['Gender'].unique())
print("DataFrame shape after cleaning Gender:", df_sd3.shape)

Standardise naming of Data under the column 'EduLevel'

In [None]:
# Each canonical label with the raw spellings that should collapse into it
canonical_to_variants = {
    "BachelorDegree": ("Bachelor's Degree", "Bachelor's"),
    "MasterDegree": ("Master's Degree", "Master's"),
    "PhD": ("phD",),
    "HighSchool": ("High School",),
}

# Flatten into the raw -> canonical lookup that Series.replace expects
education_mapping = {
    variant: canonical
    for canonical, variants in canonical_to_variants.items()
    for variant in variants
}

# Values that are not keys of the lookup are left untouched by replace()
df_sd3['EduLevel'] = df_sd3['EduLevel'].replace(education_mapping)

# Show the labels that remain after standardisation
print('Updated Unique Values in Data Frame:', df_sd3['EduLevel'].unique())

# Step 2: Analysing the DataSet (Run Step 1 Before this)

Grouped Bar Chart: Gender Breakdown by Education Level

In [None]:
# Grouped bars: one cluster per education level, one bar per gender
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='EduLevel', hue='Gender', data=df_sd3, ax=ax)
ax.set(
    title='Breakdown of Gender by Education Level',
    xlabel='Education Level',
    ylabel='Count',
)
# Slanted, right-anchored tick labels so the category names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

Histogram: Age Distribution of Dataset

In [None]:
# Age histogram (20 bins) with a kernel-density curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_sd3['Age'], bins=20, kde=True, ax=ax)
ax.set(title='Age Distribution', xlabel='Age', ylabel='Frequency')
fig.tight_layout()
plt.show()

BoxPlot: Salary Distribution by Gender

In [None]:
# One salary box per gender
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Gender', y='Salary', data=df_sd3, ax=ax)
ax.set(title='Salary Distribution by Gender', xlabel='Gender', ylabel='Salary')
fig.tight_layout()
plt.show()

Box Plot Salary Distribution by Education Level

In [None]:
# One salary box per education level
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='EduLevel', y='Salary', data=df_sd3, ax=ax)
ax.set(
    title='Salary Distribution by Education Level',
    xlabel='Education Level',
    ylabel='Salary',
)
# Slanted, right-anchored tick labels so the category names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

Scatterplot of Salary Distribution by Years of Experience

In [None]:
# Salary against years of experience, one point per row
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='YrsExp', y='Salary', data=df_sd3, ax=ax)
ax.set(
    title='Salary Distribution by Years of Experience',
    xlabel='Years of Experience',
    ylabel='Salary',
)
fig.tight_layout()
plt.show()

Scatterplot of Salary Distribution by Age

In [None]:
# Salary against age, one point per row
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Age', y='Salary', data=df_sd3, ax=ax)
ax.set(title='Salary Distribution by Age', xlabel='Age', ylabel='Salary')
fig.tight_layout()
plt.show()

Heatmap for numerical feature relationships of Dataset

In [None]:
# Pairwise correlation between the three numerical columns
df_corr = df_sd3[['Age', 'YrsExp', 'Salary']]
corrmat = df_corr.corr()

# Annotated heatmap of the correlation matrix; the title goes on the axes
# that seaborn drew into.
ax = sns.heatmap(corrmat, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
ax.set_title("Numerical Correlation Heatmap")
plt.show()

Heatmap for categorical features relationships of Dataset

In [None]:
def cramers_v(confusion_matrix):
    """Calculate Cramér's V statistic for association between two categorical variables.

    Parameters
    ----------
    confusion_matrix : array-like of shape (rows, cols)
        Contingency table of observed counts, e.g. the result of
        ``pd.crosstab``. DataFrames, NumPy arrays and nested lists are accepted.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (k - 1)))`` where ``chi2`` is the chi-square
        statistic, ``n`` the total count and ``k`` the smaller table dimension.
        ``nan`` when the statistic is undefined: the input is not 2-D, or
        fewer than two non-empty rows or columns remain.
    """
    observed = np.asarray(confusion_matrix)
    if observed.ndim != 2:
        return np.nan

    # Categories with no observations carry no information and would make
    # chi2_contingency fail on a zero expected frequency, so drop them first.
    non_empty_rows = observed.sum(axis=1) > 0
    non_empty_cols = observed.sum(axis=0) > 0
    observed = observed[non_empty_rows][:, non_empty_cols]

    k = min(observed.shape)  # Minimum of row/column count
    if k < 2:
        # With a single row or column the denominator n * (k - 1) is zero.
        return np.nan

    chi2 = stats.chi2_contingency(observed)[0]  # Get Chi-square value
    n = observed.sum()  # Total observations
    return np.sqrt(chi2 / (n * (k - 1)))

# Categorical columns to compare pairwise
categorical_vars = ['Gender', 'EduLevel', 'JobTitle']

# Square, label-indexed matrix that will hold Cramér's V for every pair
n_vars = len(categorical_vars)
cramers_v_matrix = pd.DataFrame(
    np.zeros((n_vars, n_vars)),
    index=categorical_vars,
    columns=categorical_vars,
)

for var1 in categorical_vars:
    for var2 in categorical_vars:
        if var1 == var2:
            # A variable is perfectly associated with itself
            cramers_v_matrix.loc[var1, var2] = 1.0
            continue
        contingency_table = pd.crosstab(df_sd3[var1], df_sd3[var2])
        cramers_v_matrix.loc[var1, var2] = cramers_v(contingency_table)

# Annotated heatmap of the association matrix
plt.figure(figsize=(6, 5))
ax = sns.heatmap(cramers_v_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
ax.set_title("Cramér's V Heatmap for Categorical Variables")
plt.show()


# Step 3: Machine Learning Algorithm Implementation with Raw Data (Run Step 1 Before this)

1. We will encode the relevant columns of data that fit our problem statement
2. Next, we will split the Dataset
3. Lastly, we will run 5 different types of Machine Learning Algorithms to identify best two


Copy Dataframe for Step 3

In [None]:
# Independent copy so Step 3 never modifies the pre-processed frame
df_sd4 = df_sd3.copy()

for label, frame in (('Original Data Frame:', df_sd3), ('Copied Data Frame:', df_sd4)):
    print(label, frame.shape)
print()

# Cell output: the columns available for modelling
df_sd4.columns

Encoding raw data from pre-processed 'Salary Data' Dataframe

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Drop 'JobTitle' (high cardinality, too specific)
df_sd5 = df_sd4.drop(columns=['JobTitle'])

# Separate features and target
X = df_sd5.drop(columns=['Salary'])
y = df_sd5['Salary']

# Identify feature types
categorical_ordinal = ['EduLevel']  # Ordinal categories
categorical_nominal = ['Gender']  # Nominal categories
numerical_cols = ['Age', 'YrsExp']

# Rank order (lowest first) of each ordinal feature. LabelEncoder numbers
# classes alphabetically, which would rank 'HighSchool' above 'BachelorDegree',
# so the order is spelled out explicitly instead.
ordinal_levels = {
    'EduLevel': ['HighSchool', 'BachelorDegree', 'MasterDegree', 'PhD'],
}

# Encode ordinal categorical variables as integer ranks
for col in categorical_ordinal:
    levels = ordinal_levels.get(col, [])
    if set(X[col].unique()) <= set(levels):
        X[col] = X[col].map({level: rank for rank, level in enumerate(levels)})
    else:
        # Unexpected labels present: fall back to alphabetical label encoding
        # rather than producing NaN ranks.
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

# Apply OneHotEncoding to nominal categorical variables.
# NOTE: every category gets its own column (no `drop=` is set), so the
# encoded columns are linearly dependent.
ohe = OneHotEncoder(sparse_output=False)
X_encoded = ohe.fit_transform(X[categorical_nominal])
encoded_feature_names = ohe.get_feature_names_out(categorical_nominal)

# Scale numerical variables.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - consider fitting on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_cols])

# Combine all transformed features
X_processed = np.hstack((X_scaled, X[categorical_ordinal].values, X_encoded))

# Convert to DataFrame for clarity
X_final = pd.DataFrame(X_processed, columns=numerical_cols + categorical_ordinal + list(encoded_feature_names))

X_final.columns

Splitting of Training Data (Must Run before Algorithm)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

## Step 3.1: Among the five, we will pick the best two performing Machine Learning Algorithms to improve on it

**Algorithm 1**: LinearRegression (Train), **Algorithm 2**: DecisionTreeRegressor (Train), **Algorithm 3**: RandomForestRegressor (Train), **Algorithm 4**: XGBoost, **Algorithm 5**: GradientBoostingRegressor

In [None]:
import os
import time
import psutil
import tracemalloc
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def measure_training_time(model, X_train, y_train):
    """Fit ``model`` and report wall-clock time, CPU utilisation and peak traced memory.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``
        Fitted in place.
    X_train, y_train :
        Training features and targets, passed straight to ``model.fit``.

    Returns
    -------
    tuple
        ``(training_time, avg_cpu, peak_ram)``: seconds spent in ``fit``,
        system-wide CPU utilisation (%) over that interval, and the peak
        memory traced by ``tracemalloc`` in MB.
    """
    # cpu_percent(interval=None) reports utilisation since the previous call.
    # This call only resets that baseline; its return value covers an
    # unrelated period and is discarded.
    psutil.cpu_percent(interval=None)
    tracemalloc.start()
    try:
        # perf_counter is monotonic, unlike time.time, so the interval cannot
        # be distorted by system clock adjustments.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time
        # Utilisation since the baseline call above, i.e. during fit().
        avg_cpu = psutil.cpu_percent(interval=None)
        _, peak_memory = tracemalloc.get_traced_memory()
    finally:
        # Stop tracing even if fit() raises, so later measurements start clean.
        tracemalloc.stop()

    peak_ram = peak_memory / (1024 * 1024)  # Convert to MB

    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Average CPU Usage: {avg_cpu:.2f}%")
    print(f"Peak RAM Usage: {peak_ram:.2f} MB")

    return training_time, avg_cpu, peak_ram

def measure_inference_time(model, X_test):
    """Time ``model.predict`` on ``X_test`` and print the measurements.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : sized collection of samples
        Passed straight to ``model.predict``; ``len(X_test)`` is the sample count.

    Returns
    -------
    tuple
        ``(total_time, avg_time, y_pred)``: total seconds spent predicting,
        seconds per sample (0.0 when ``X_test`` is empty) and the predictions.
    """
    # perf_counter is monotonic and high-resolution; time.time can jump with
    # system clock adjustments.
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    total_time = time.perf_counter() - start_time

    # Guard the per-sample average against an empty test set.
    n_samples = len(X_test)
    avg_time = total_time / n_samples if n_samples else 0.0

    print(f"Total Inference Time: {total_time:.2f} seconds")
    print(f"Average Inference Time per Sample: {avg_time:.6f} seconds")
    return total_time, avg_time, y_pred

def measure_memory_usage():
    """Print and return the current process's resident set size in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    memory_usage = rss_bytes / (1024 * 1024)
    print(f"Memory Usage: {memory_usage:.2f} MB")
    return memory_usage

def evaluate_model(y_test, y_pred, model_name):
    """Print and return ``(mae, mse, r2)`` for predictions against the true targets.

    ``model_name`` is used only as the heading of the printed report.
    """
    metric_functions = (mean_absolute_error, mean_squared_error, r2_score)
    scores = tuple(metric(y_test, y_pred) for metric in metric_functions)
    mae, mse, r2 = scores

    report_lines = (
        f"{model_name} Performance:",
        f"Mean Absolute Error (MAE): {mae:.2f}",
        f"Mean Squared Error (MSE): {mse:.2f}",
        f"R² Score: {r2:.4f}",
    )
    for line in report_lines:
        print(line)
    return scores

def run_models(X_train, y_train, X_test, y_test):
    """Train and evaluate the five candidate regressors on one train/test split.

    Returns a dict keyed by model name; each value is a dict of the timing,
    resource and accuracy measurements collected for that model.
    """
    candidates = [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree", DecisionTreeRegressor(max_depth=5, random_state=42)),
        ("Random Forest", RandomForestRegressor(random_state=42)),
        ("XGBoost", XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)),
        ("Gradient Boosting", GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=42)),
    ]
    metric_names = (
        "Training Time (s)",
        "Average CPU Usage (%)",
        "Peak RAM Usage (MB)",
        "Total Inference Time (s)",
        "Average Inference Time per Sample (s)",
        "MAE",
        "MSE",
        "R² Score",
        "Memory Usage (MB)",
    )

    results = {}
    for label, estimator in candidates:
        print(f"Running {label}...")
        fit_seconds, cpu_pct, peak_mb = measure_training_time(estimator, X_train, y_train)
        predict_seconds, per_sample_seconds, y_pred = measure_inference_time(estimator, X_test)
        mae, mse, r2 = evaluate_model(y_test, y_pred, label)
        rss_mb = measure_memory_usage()

        # Values are listed in the same order as metric_names above
        measurements = (
            fit_seconds, cpu_pct, peak_mb,
            predict_seconds, per_sample_seconds,
            mae, mse, r2, rss_mb,
        )
        results[label] = dict(zip(metric_names, measurements))
        print()
    return results

# Uses the split produced by the "Splitting of Training Data" cell
results = run_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

# One row per model, one column per metric
results_df = pd.DataFrame.from_dict(data=results, orient='index')

display(results_df)


Overall Table and Comparison Chart for ease of determining each Algorithm strengths

In [None]:
# Re-import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Rebuild the table from `results`, moving the model names from the index
# into a regular 'Model' column.
results_df = (
    pd.DataFrame.from_dict(results, orient='index')
    .reset_index()
    .rename(columns={'index': 'Model'})
)

# Everything after the leading 'Model' column must be numeric for plotting
metric_columns = results_df.columns[1:]
for col in metric_columns:
    results_df[col] = pd.to_numeric(results_df[col], errors='coerce')

# 3x3 grid: one bar chart per metric
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 12))
fig.suptitle("Comparison of Model Performance Metrics", fontsize=16)
axes = axes.flatten()

metrics_to_plot = [
    "Training Time (s)", "Average CPU Usage (%)", "Peak RAM Usage (MB)",
    "Total Inference Time (s)", "Average Inference Time per Sample (s)",
    "MAE", "MSE", "R² Score", "Memory Usage (MB)"
]

for ax, metric in zip(axes, metrics_to_plot):
    if metric not in results_df.columns:
        # Leave the panel empty when a metric was not collected
        continue
    sns.barplot(data=results_df, x="Model", y=metric, hue="Model", dodge=False, ax=ax, palette="viridis")
    ax.set_title(metric, fontsize=12)
    ax.set_xlabel("")
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)
    # Start the y-axis at 0 and leave 10% headroom above the tallest bar
    ax.set_ylim([0, results_df[metric].max() * 1.1])

# Reserve the top 4% of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

#Step 4: Additional Features Extraction (Run Step 1 Before this)

*   Run Everything


Copy Dataframe for Step 4

In [None]:
# Independent copy so the engineered features never touch the pre-processed frame
df_sd6 = df_sd3.copy()

for label, frame in (('Original Data Frame:', df_sd3), ('Copied Data Frame:', df_sd6)):
    print(label, frame.shape)
print()

# How often each job title occurs
job_title_counts = df_sd6['JobTitle'].value_counts()
print(job_title_counts)

## Step 4.1: Additional Feature 1 ('Generation'), Clustering 'Age'

In [None]:
# Year the ages are measured against (the dataset's last update)
reference_year = 2023

# Birth year implied by each person's age
df_sd6['BirthYear'] = reference_year - df_sd6['Age']

# Birth-year intervals (right edge inclusive) and the generation each one maps to
bins = [0, 1943, 1964, 1979, 1994, float('inf')]
labels = ['SilentGeneration', 'BabyBoomer', 'GenX', 'Millennials', 'GenZ']
df_sd6['Generation'] = pd.cut(df_sd6['BirthYear'], bins=bins, labels=labels, right=True)

# Frequency of each generation label, followed by a blank line
generation_counts = df_sd6['Generation'].value_counts()
print("Frequency of each generation type:", generation_counts, '', sep='\n')

# Pie chart of the generations that actually occur (zero-count labels omitted)
filtered_counts = generation_counts[generation_counts > 0]
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    filtered_counts,
    labels=filtered_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 12},
)
ax.set_title('Distribution of Generations')
plt.show()
print('')

print(df_sd6.shape)
print('')
print("Updated Columns of Data Set", df_sd6.columns)

## Step 4.2: Additional Feature 2 ('Seniority'), Clustering 'YrsExp'

In [None]:
def assign_seniority(years):
    """Map years of experience to a seniority band.

    Bands (upper bounds inclusive): up to 3 -> "Entry", up to 6 -> "Junior",
    up to 10 -> "Mid", up to 15 -> "Senior", above 15 -> "Executive".

    Each band is tested only by its upper bound, so fractional values such as
    3.5 land in the next band up ("Junior") instead of falling through the gap
    between integer ranges and being labelled "Executive".
    """
    if years <= 3:
        return "Entry"
    if years <= 6:
        return "Junior"
    if years <= 10:
        return "Mid"
    if years <= 15:
        return "Senior"
    return "Executive"

# Derive the 'Seniority' band of every row from its years of experience
df_sd6['Seniority'] = df_sd6['YrsExp'].map(assign_seniority)

# Frequency of each seniority band, followed by a blank line
seniority_counts = df_sd6['Seniority'].value_counts()
print("Frequency of each Seniority:", seniority_counts, '', sep='\n')

# Cell output: columns now present in the frame
df_sd6.columns

## Step 4.3: Additional Feature 3 ('Industry'), Clustering of 'JobTitle'

* Imports relevant libraries
* Cleans job titles (lowercase, removes special characters)
* Tokenises job titles into words

In [None]:
!pip install gensim
import gensim
from gensim.models import Word2Vec
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, pairwise_distances
from scipy.cluster.hierarchy import linkage, dendrogram
from collections import Counter

def preprocess_job_title(title):
    """Return the lowercase word tokens of a job title.

    The title is lowercased, every character other than a-z and whitespace is
    deleted, and the remainder is split on whitespace.
    """
    letters_and_spaces = re.sub(r'[^a-z\s]', '', title.lower())
    return letters_and_spaces.split()

# Tokenise every job title; astype(str) makes sure each value is text first
job_titles_as_text = df_sd6["JobTitle"].astype(str)
df_sd6["Cleaned_Job_Title"] = job_titles_as_text.map(preprocess_job_title)

* Trains a Word2Vec model on job titles
* Converts job titles into vector embeddings

In [None]:
# Step 2: Train Word2Vec for Job Title Embeddings
# A fixed seed together with a single worker thread keeps the embeddings - and
# therefore the clusters and 'Industry' labels derived from them - repeatable
# between runs; multi-threaded training is subject to scheduling jitter.
# NOTE(review): repeatability across interpreter launches may additionally
# require PYTHONHASHSEED to be set - confirm against the gensim documentation.
word2vec_model = Word2Vec(
    sentences=df_sd6["Cleaned_Job_Title"].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

def get_vector(title_tokens):
    """Return the mean Word2Vec vector of a tokenised job title.

    Tokens missing from the model vocabulary are skipped. A title with no
    known tokens maps to a zero vector of the model's dimensionality.
    """
    vectors = [word2vec_model.wv[word] for word in title_tokens if word in word2vec_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec_model.vector_size)

# Generate word embeddings: one row per job title
df_sd6["Job_Title_Vector"] = df_sd6["Cleaned_Job_Title"].apply(get_vector)
X = np.vstack(df_sd6["Job_Title_Vector"].values)

* Uses Elbow Method (inertia) and Silhouette Score to find the best number of clusters

In [None]:
# Step 3: Determine the Optimal Number of Clusters
# Fit K-Means for k = 2..10 and record inertia and silhouette score for each k
cluster_range = range(2, 11)
inertia = []
silhouette_scores = []

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, cluster_labels))

# Elbow curve (left) and silhouette curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (axes[0], inertia, "Inertia", "Elbow Method for Optimal Clusters", {}),
    (axes[1], silhouette_scores, "Silhouette Score", "Silhouette Score for Cluster Quality", {"color": 'red'}),
)
for ax, values, quantity, title, line_style in panels:
    ax.plot(cluster_range, values, marker='o', linestyle='-', label=quantity, **line_style)
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel(quantity)
    ax.set_title(title)
    ax.legend()
    ax.grid()

plt.show()

* Selects the best cluster count based on the highest Silhouette Score
* Assigns job titles to clusters

In [None]:
# Step 4: Apply K-Means Clustering Using Optimal Cluster Count
# Pick the k with the highest silhouette score (the first such k on a tie)
optimal_clusters, _ = max(zip(cluster_range, silhouette_scores), key=lambda pair: pair[1])

# Refit with that k and store each job title's cluster id
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
df_sd6["Cluster"] = kmeans.fit_predict(X)

* Finds the most common words in each cluster to infer industry labels

In [None]:
# Step 5: Extract Common Words from Each Cluster to Infer Industry Labels
def get_most_common_words(cluster_number):
    """Return the three most frequent title tokens in the given cluster."""
    in_cluster = df_sd6["Cluster"] == cluster_number
    token_lists = df_sd6.loc[in_cluster, "Cleaned_Job_Title"].tolist()
    word_counts = Counter(word for tokens in token_lists for word in tokens)
    return [word for word, _ in word_counts.most_common(3)]

# Label each cluster with its top words joined by '-', as a proxy for industry
industry_names = {
    cluster_id: "-".join(get_most_common_words(cluster_id))
    for cluster_id in range(optimal_clusters)
}

df_sd6["Industry"] = df_sd6["Cluster"].map(industry_names)

* Reduces job title embeddings to 2D using t-SNE for visualization
* Colours each job title by its assigned cluster

In [None]:
# Step 6: Visualize t-SNE Representation
# Project the job-title embeddings down to two dimensions
tsne = TSNE(n_components=2, perplexity=15, random_state=42)
X_tsne = tsne.fit_transform(X)

# One point per job title, coloured by its K-Means cluster
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=df_sd6["Cluster"], cmap='tab10', alpha=0.7)
fig.colorbar(scatter, ax=ax, label="Cluster")
ax.set(
    title="t-SNE Visualization of Job Titles (Coloured by Cluster)",
    xlabel="t-SNE Dimension 1",
    ylabel="t-SNE Dimension 2",
)
plt.show()

* Creates a hierarchical clustering dendrogram for job titles
* Colors clusters and adds a legend dynamically

In [None]:
import matplotlib.patches as mpatches

# Generate Dendrogram (Ward linkage on the job-title embeddings)
linkage_matrix = linkage(X, method='ward')
plt.figure(figsize=(14, 6))

dendro = dendrogram(
    linkage_matrix,
    labels=df_sd6["JobTitle"].values,
    leaf_rotation=45,
    leaf_font_size=8,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=20,
    color_threshold=0.5 * max(linkage_matrix[:, 2])  # colour branches below half the maximum merge distance
)

# Extract the actual colors used. dict.fromkeys de-duplicates while keeping
# the left-to-right leaf order, so "Cluster 1" is always the left-most colour;
# a set would number the clusters in arbitrary order.
cluster_colors = list(dict.fromkeys(dendro['leaves_color_list']))
legend_patches = [mpatches.Patch(color=color, label=f"Cluster {i+1}") for i, color in enumerate(cluster_colors)]

# Add a legend
plt.legend(handles=legend_patches, title="Clusters", loc='upper right')

plt.title("Hierarchical Clustering Dendrogram for Job Titles (With Correct Legend)")
plt.xlabel("Job Title Clusters")
plt.ylabel("Cluster Distance")
plt.show()

print()
df_sd6['Industry'].unique()

## Step 4.4: Heatmap for new categorical features relationships of Dataset

In [None]:
def cramers_v(confusion_matrix):
    """Calculate Cramér's V statistic for association between two categorical variables.

    Parameters
    ----------
    confusion_matrix : array-like of shape (rows, cols)
        Contingency table of observed counts, e.g. the result of
        ``pd.crosstab``. DataFrames, NumPy arrays and nested lists are accepted.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (k - 1)))`` where ``chi2`` is the chi-square
        statistic, ``n`` the total count and ``k`` the smaller table dimension.
        ``nan`` when the statistic is undefined: the input is not 2-D, or
        fewer than two non-empty rows or columns remain.
    """
    observed = np.asarray(confusion_matrix)
    if observed.ndim != 2:
        return np.nan

    # Categories with no observations carry no information and would make
    # chi2_contingency fail on a zero expected frequency, so drop them first.
    non_empty_rows = observed.sum(axis=1) > 0
    non_empty_cols = observed.sum(axis=0) > 0
    observed = observed[non_empty_rows][:, non_empty_cols]

    k = min(observed.shape)  # Minimum of row/column count
    if k < 2:
        # With a single row or column the denominator n * (k - 1) is zero.
        return np.nan

    chi2 = stats.chi2_contingency(observed)[0]  # Get Chi-square value
    n = observed.sum()  # Total observations
    return np.sqrt(chi2 / (n * (k - 1)))

# Original categorical columns plus the three engineered ones
new_categorical_vars = ['Gender', 'EduLevel', 'JobTitle', 'Industry', 'Generation', 'Seniority']

# Square, label-indexed matrix that will hold Cramér's V for every pair
n_vars = len(new_categorical_vars)
cramers_v_matrix = pd.DataFrame(
    np.zeros((n_vars, n_vars)),
    index=new_categorical_vars,
    columns=new_categorical_vars,
)

for var1 in new_categorical_vars:
    for var2 in new_categorical_vars:
        if var1 == var2:
            # A variable is perfectly associated with itself
            cramers_v_matrix.loc[var1, var2] = 1.0
            continue
        contingency_table = pd.crosstab(df_sd6[var1], df_sd6[var2])
        cramers_v_matrix.loc[var1, var2] = cramers_v(contingency_table)

# Annotated heatmap of the association matrix
plt.figure(figsize=(6, 5))
ax = sns.heatmap(cramers_v_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
ax.set_title("Cramér's V Heatmap for Categorical Variables")
plt.show()

# Step 5: Determine if varying Additional Features Combinations brings improvement to the Baseline Model

In [None]:
import numpy as np
import pandas as pd
import time
import psutil
import tracemalloc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def measure_training_time(model, X_train, y_train):
    """Fit ``model`` and measure wall-clock time, CPU utilisation and peak traced memory.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``
        Fitted in place.
    X_train, y_train :
        Training features and targets, passed straight to ``model.fit``.

    Returns
    -------
    tuple
        ``(training_time, avg_cpu, peak_ram)``: seconds spent in ``fit``,
        system-wide CPU utilisation (%) over that interval, and the peak
        memory traced by ``tracemalloc`` in MB.
    """
    # cpu_percent(interval=None) reports utilisation since the previous call.
    # This call only resets that baseline; its return value covers an
    # unrelated period and is discarded.
    psutil.cpu_percent(interval=None)
    tracemalloc.start()
    try:
        # perf_counter is monotonic, unlike time.time, so the interval cannot
        # be distorted by system clock adjustments.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time
        # Utilisation since the baseline call above, i.e. during fit().
        avg_cpu = psutil.cpu_percent(interval=None)
        _, peak_memory = tracemalloc.get_traced_memory()
    finally:
        # Stop tracing even if fit() raises, so later measurements start clean.
        tracemalloc.stop()

    peak_ram = peak_memory / (1024 * 1024)  # Convert to MB

    return training_time, avg_cpu, peak_ram

def measure_inference_time(model, X_test):
    """Time ``model.predict`` on ``X_test``.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : sized collection of samples
        Passed straight to ``model.predict``; ``len(X_test)`` is the sample count.

    Returns
    -------
    tuple
        ``(total_time, avg_time, y_pred)``: total seconds spent predicting,
        seconds per sample (0.0 when ``X_test`` is empty) and the predictions.
    """
    # perf_counter is monotonic and high-resolution; time.time can jump with
    # system clock adjustments.
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    total_time = time.perf_counter() - start_time

    # Guard the per-sample average against an empty test set.
    n_samples = len(X_test)
    avg_time = total_time / n_samples if n_samples else 0.0
    return total_time, avg_time, y_pred

def measure_memory_usage():
    """Return the current process's resident set size in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / (1024 * 1024)

def evaluate_feature_set(feature_set_name, df, categorical_ordinal, categorical_nominal, numerical_cols):
    """Train Random Forest and XGBoost on one feature combination and collect metrics.

    Parameters
    ----------
    feature_set_name : str
        Label stored in the "Feature Set" column of the result.
    df : pandas.DataFrame
        Frame holding 'Salary', the listed feature columns and the helper
        columns 'JobTitle', 'BirthYear', 'Cleaned_Job_Title',
        'Job_Title_Vector' and 'Cluster' (dropped before modelling).
    categorical_ordinal : list of str
        Columns encoded as integer ranks.
    categorical_nominal : list of str
        Columns that are one-hot encoded.
    numerical_cols : list of str
        Columns that are standardised.

    Returns
    -------
    pandas.DataFrame
        One row per model with timing, resource and accuracy metrics.
    """
    # Rank order (lowest first) for string-valued ordinal features.
    ordinal_levels = {
        "EduLevel": ["HighSchool", "BachelorDegree", "MasterDegree", "PhD"],
        "Seniority": ["Entry", "Junior", "Mid", "Senior", "Executive"],
    }

    def _encode_ordinal(column):
        """Encode one ordinal column as integer ranks that respect its natural order."""
        # Ordered categoricals (e.g. the output of pd.cut) already carry their order.
        if isinstance(column.dtype, pd.CategoricalDtype) and column.cat.ordered:
            return column.cat.codes
        levels = ordinal_levels.get(column.name)
        if levels is not None and set(column.unique()) <= set(levels):
            return column.map({level: rank for rank, level in enumerate(levels)})
        # Unknown column or unexpected labels: fall back to alphabetical label
        # encoding rather than producing NaN ranks.
        return LabelEncoder().fit_transform(column)

    df_sd7 = df.drop(columns=['JobTitle', 'BirthYear', 'Cleaned_Job_Title', 'Job_Title_Vector', 'Cluster'])

    X = df_sd7.drop(columns=['Salary'])
    y = df_sd7['Salary']

    # LabelEncoder alone would number classes alphabetically (e.g. 'HighSchool'
    # above 'BachelorDegree'), so ordinal columns go through _encode_ordinal.
    for col in categorical_ordinal:
        X[col] = _encode_ordinal(X[col])

    ohe = OneHotEncoder(sparse_output=False)
    X_encoded = ohe.fit_transform(X[categorical_nominal])

    # NOTE(review): the scaler is fitted before the split, so test rows
    # influence the scaling - consider fitting on the training rows only.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X[numerical_cols])

    # Column order: scaled numerics, ordinal ranks, one-hot indicators.
    X_processed = np.hstack((X_scaled, X[categorical_ordinal].values, X_encoded))

    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

    models = [
        ("Random Forest", RandomForestRegressor(random_state=42)),
        ("XGBoost", XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)),
    ]

    results = []
    for model_name, model in models:
        train_time, avg_cpu, peak_ram = measure_training_time(model, X_train, y_train)
        inf_time, avg_inf_time, y_pred = measure_inference_time(model, X_test)
        memory_usage = measure_memory_usage()
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results.append({
            "Feature Set": feature_set_name,
            "Model": model_name,
            "Training Time (s)": train_time,
            "Average CPU Usage (%)": avg_cpu,
            "Peak RAM Usage (MB)": peak_ram,
            "Total Inference Time (s)": inf_time,
            "Average Inference Time per Sample (s)": avg_inf_time,
            "MAE": mae,
            "MSE": mse,
            "R² Score": r2,
            "Memory Usage (MB)": memory_usage
        })

    return pd.DataFrame(results)

# Each entry: (label, ordinal columns, nominal columns, numerical columns)
feature_combinations = [
    ("Baseline (No Additional Features)", ["EduLevel"], ["Gender"], ["Age", "YrsExp"]),
    ("Generation", ["EduLevel", "Generation"], ["Gender"], ["Age", "YrsExp"]),
    ("Seniority", ["EduLevel", "Seniority"], ["Gender"], ["Age", "YrsExp"]),
    ("Generation + Seniority", ["EduLevel", "Generation", "Seniority"], ["Gender"], ["Age", "YrsExp"]),
    ("Industry", ["EduLevel"], ["Gender", "Industry"], ["Age", "YrsExp"]),
    ("Generation + Industry", ["EduLevel", "Generation"], ["Gender", "Industry"], ["Age", "YrsExp"]),
    ("Seniority + Industry", ["EduLevel", "Seniority"], ["Gender", "Industry"], ["Age", "YrsExp"]),
    ("All Features", ["EduLevel", "Generation", "Seniority"], ["Gender", "Industry"], ["Age", "YrsExp"])
]

# Evaluate every combination, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration and
# starts from an empty frame, which newer pandas versions warn about.
per_combination_results = [
    evaluate_feature_set(feature_name, df_sd6, cat_ord, cat_nom, num_cols)
    for feature_name, cat_ord, cat_nom, num_cols in feature_combinations
]
results_df = pd.concat(per_combination_results)

# Full table first, then one table per model
display(results_df)
print()
display(results_df[results_df['Model']=='Random Forest'])
print()
display(results_df[results_df['Model']=='XGBoost'])


# Step 6: Machine Learning Algorithm Implementation with Additional Features (Run Step 4 Before this)

1. We will encode the columns of data that are relevant to our problem statement, including our new Additional Features
2. Next, we will split the Dataset
3. Lastly, we will run the two best Machine Learning algorithms identified in Step 3 to see whether the models improve


## Step 6.1: Encode and Scale all Three Additional Features ('Generation'+'Seniority'+'Industry')

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Remove the raw job title (high cardinality) and the other columns that are
# not used as model inputs.
df_sd8 = df_sd6.drop(columns=['JobTitle', 'BirthYear', 'Cleaned_Job_Title', 'Job_Title_Vector', 'Cluster'])

# Target vector and feature frame
y = df_sd8['Salary']
X = df_sd8.drop(columns=['Salary'])

# Column groups, named after the preprocessing each group receives
numerical_cols = ['Age', 'YrsExp']
categorical_ordinal = ['EduLevel', 'Generation', 'Seniority']
categorical_nominal = ['Gender', 'Industry']

# Integer-code every ordinal column in place.
# NOTE: LabelEncoder numbers the classes in sorted (alphabetical) order, not in
# their natural rank order.
for col in categorical_ordinal:
    le = LabelEncoder().fit(X[col])
    X[col] = le.transform(X[col])

# One-hot encode the nominal columns. No category is dropped, so each column
# expands into one indicator per distinct value.
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X[categorical_nominal])
X_encoded = ohe.transform(X[categorical_nominal])
encoded_feature_names = ohe.get_feature_names_out(categorical_nominal)

# Standardise the numerical columns (zero mean, unit variance)
scaler = StandardScaler()
scaler.fit(X[numerical_cols])
X_scaled = scaler.transform(X[numerical_cols])

# Final feature matrix, column order: scaled numerics | ordinal codes | one-hot indicators
X_processed = np.concatenate(
    [X_scaled, X[categorical_ordinal].to_numpy(), X_encoded],
    axis=1,
)

# Same matrix as a labelled DataFrame (used later for feature names)
X_final = pd.DataFrame(
    X_processed,
    columns=[*numerical_cols, *categorical_ordinal, *encoded_feature_names],
)

## Step 6.2: Machine Learning Algorithms

Splitting of Training Data (Must Run before Algorithm)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train-test split (80% training, 20% testing); random_state fixes the shuffle so the split is repeatable.
# The split is taken on X_processed (a NumPy array), so X_train / X_test carry no column names;
# y keeps its pandas index.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

Algorithm 3: RandomForestRegressor (Train)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline Random Forest: library-default hyperparameters, seeded for repeatability
rfr_model = RandomForestRegressor(random_state=42)
rfr_model.fit(X_train, y_train)

# Score the held-out test set
y_pred_rfr = rfr_model.predict(X_test)
mae_rfr, mse_rfr, r2_rfr = (
    metric(y_test, y_pred_rfr)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report
print(
    "Random Forest Regressor Performance:",
    f"Mean Absolute Error (MAE): {mae_rfr:.2f}",
    f"Mean Squared Error (MSE): {mse_rfr:.2f}",
    f"R² Score: {r2_rfr:.4f}",
    sep="\n",
)

Algorithm 4: XGBoost (Train)

In [None]:
from xgboost import XGBRegressor

# Baseline XGBoost regressor: 200 boosting rounds, learning rate 0.05, seeded
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)

# Score the held-out test set
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb, mse_xgb, r2_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report
print(
    "XGBoost Performance",
    f"Mean Absolute Error (MAE): {mae_xgb:.2f}",
    f"Mean Squared Error (MSE): {mse_xgb:.2f}",
    f"R² Score: {r2_xgb:.4f}",
    sep="\n",
)

# Step 7: Fine-Tuning Machine Learning Algorithm (Run Step 6 Before this)

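*   Pick either the exhaustive option (Grid Search) or the faster option (Random Search)
*   Run the chosen option for each algorithm you want to tune; Step 8 reads the single `fine_tune_choice` flag set by whichever search cell was run last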



## Step 7.1: (Choose 1 of 2): Fine-Tuning with Grid Search (Exhaustive)
*   Grid Search tests all possible combinations of specified hyperparameters.



**Algorithm 3**

In [None]:
from sklearn.model_selection import GridSearchCV

# Flag read by Step 8: 1 means "use the grid-search results"
fine_tune_choice = 1

# Candidate values per hyperparameter; grid search evaluates every combination
param_grid_rfr = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# 5-fold cross-validated search scored on (negated) MAE, using all CPU cores
grid_search_rfr = GridSearchCV(
    estimator=rfr_model,
    param_grid=param_grid_rfr,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rfr.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting
print("Best Parameters:", grid_search_rfr.best_params_)
print("Best MAE:", -grid_search_rfr.best_score_)

**Algorithm 4**

In [None]:
from sklearn.model_selection import GridSearchCV

# Flag read by Step 8: 1 means "use the grid-search results"
fine_tune_choice = 1

# Candidate values per hyperparameter; grid search evaluates every combination
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 5-fold cross-validated search scored on (negated) MAE, using all CPU cores
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_xgb.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting
print("Best Parameters:", grid_search_xgb.best_params_)
print("Best MAE:", -grid_search_xgb.best_score_)

## Step 7.1: (Choose 2 of 2): Fine-Tuning with Random Search (Faster)
*   Random Search randomly samples hyperparameters instead of testing all combinations.

 **Algorithm 3**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Flag read by Step 8: 2 means "use the randomized-search results"
fine_tune_choice = 2

# Sampling space: randint(low, high) draws integers from [low, high);
# plain lists are sampled uniformly
param_dist_rfr = dict(
    n_estimators=randint(100, 500),
    max_depth=[10, 20, None],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=['sqrt', 'log2'],
)

# 20 random draws, each scored with 5-fold CV on (negated) MAE; seeded for repeatability
random_search_rfr = RandomizedSearchCV(
    estimator=rfr_model,
    param_distributions=param_dist_rfr,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_rfr.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting
print("Best Parameters:", random_search_rfr.best_params_)
print("Best MAE:", -random_search_rfr.best_score_)


 **Algorithm 4**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Flag read by Step 8: 2 means "use the randomized-search results"
fine_tune_choice = 2

# Sampling space. scipy's uniform(loc, scale) covers [loc, loc + scale], e.g.
# uniform(0.7, 0.3) samples from 0.7 to 1.0; randint(low, high) covers [low, high).
param_dist_xgb = dict(
    n_estimators=randint(100, 500),
    learning_rate=uniform(0.01, 0.3),
    max_depth=randint(3, 10),
    min_child_weight=randint(1, 10),
    subsample=uniform(0.7, 0.3),
    colsample_bytree=uniform(0.7, 0.3),
    gamma=uniform(0, 0.5),
    reg_alpha=uniform(0, 0.5),
    reg_lambda=uniform(0, 0.5),
)

# 20 random draws, each scored with 5-fold CV on (negated) MAE; seeded for repeatability
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist_xgb,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_xgb.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting
print("Best Parameters:", random_search_xgb.best_params_)
print("Best MAE:", -random_search_xgb.best_score_)

# Step 8: Retraining of Both Algorithm Models (Run only after Step 7)

## Step 8.1: Retrain (Algorithm 3: RandomForest) & (Algorithm 4: XGBoost)

**Algorithm 3**: Random Forest Regressor Retrain

In [None]:
# Take the hyperparameters from whichever Step 7 search was run
# (1 = grid search, 2 = randomized search).
if fine_tune_choice == 1:
  best_params = grid_search_rfr.best_params_
elif fine_tune_choice == 2:
  best_params = random_search_rfr.best_params_
else:
  # Without this branch a stale `best_params` left over from another cell
  # (possibly the other model's parameters) would be used silently.
  raise ValueError(
      f"fine_tune_choice must be 1 (grid search) or 2 (random search), got {fine_tune_choice!r}"
  )

# Retrain a fresh Random Forest with the tuned hyperparameters
best_model_rfr = RandomForestRegressor(**best_params, random_state=42)
best_model_rfr.fit(X_train, y_train)

# Predict and evaluate on the held-out test set
y_pred_rfr = best_model_rfr.predict(X_test)

mae_tuned = mean_absolute_error(y_test, y_pred_rfr)
mse_tuned = mean_squared_error(y_test, y_pred_rfr)
r2_tuned = r2_score(y_test, y_pred_rfr)

print(f"Tuned Model MAE: {mae_tuned:.2f}")
print(f"Tuned Model MSE: {mse_tuned:.2f}")
print(f"Tuned Model R² Score: {r2_tuned:.4f}")

**Algorithm 4**: XGBoost

In [None]:
# Take the hyperparameters from whichever Step 7 search was run
# (1 = grid search, 2 = randomized search).
if fine_tune_choice == 1:
  best_params = grid_search_xgb.best_params_
elif fine_tune_choice == 2:
  best_params = random_search_xgb.best_params_
else:
  # Without this branch a stale `best_params` left over from another cell
  # (possibly the Random Forest parameters) would be passed to XGBoost silently.
  raise ValueError(
      f"fine_tune_choice must be 1 (grid search) or 2 (random search), got {fine_tune_choice!r}"
  )

# Retrain a fresh XGBoost regressor with the tuned hyperparameters
best_model_xgb = XGBRegressor(**best_params, random_state=42)
best_model_xgb.fit(X_train, y_train)

# Predict and evaluate on the held-out test set
y_pred_xgb = best_model_xgb.predict(X_test)

mae_tuned = mean_absolute_error(y_test, y_pred_xgb)
mse_tuned = mean_squared_error(y_test, y_pred_xgb)
r2_tuned = r2_score(y_test, y_pred_xgb)

print(f"Tuned Model MAE: {mae_tuned:.2f}")
print(f"Tuned Model MSE: {mse_tuned:.2f}")
print(f"Tuned Model R² Score: {r2_tuned:.4f}")

## Step 8.2: Comparing Both Metrics Before & After Tuning

In [None]:
import os
import time
import psutil
import tracemalloc
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def measure_training_time(model, X_train, y_train):
    """Fit ``model`` and report the time, CPU load and peak Python memory of the fit.

    Args:
        model: Estimator exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.

    Returns:
        Tuple ``(training_time, avg_cpu, peak_ram)``:
        wall-clock seconds spent in ``fit``, system-wide CPU utilisation (%)
        over the duration of the fit, and the peak memory (MB) traced by
        ``tracemalloc`` while fitting.
    """
    # psutil.cpu_percent(interval=None) reports utilisation since the PREVIOUS
    # call, so this first call only resets the baseline; its return value is
    # not a measurement of the fit and is deliberately discarded.
    psutil.cpu_percent(interval=None)

    tracemalloc.start()
    try:
        # perf_counter is a monotonic, high-resolution clock meant for timing
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        # Utilisation accumulated since the baseline call, i.e. during the fit
        avg_cpu = psutil.cpu_percent(interval=None)
        _, peak_memory = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if fit() raises, so tracing overhead does
        # not leak into later cells.
        tracemalloc.stop()

    peak_ram = peak_memory / (1024 * 1024)  # Convert bytes to MB

    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Average CPU Usage: {avg_cpu:.2f}%")
    print(f"Peak RAM Usage: {peak_ram:.2f} MB")

    return training_time, avg_cpu, peak_ram

def measure_inference_time(model, X_test):
    """Time a single ``model.predict(X_test)`` call.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Samples to predict; must support ``len()``.

    Returns:
        Tuple ``(total_time, avg_time, y_pred)``: wall-clock seconds for the
        whole call, seconds per sample (0.0 when ``X_test`` is empty), and the
        predictions returned by the model.
    """
    # perf_counter is a monotonic, high-resolution clock; sub-millisecond
    # predictions can register as 0 with the coarser time.time().
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    total_time = time.perf_counter() - start_time

    n_samples = len(X_test)
    # Guard the per-sample average against an empty input
    avg_time = total_time / n_samples if n_samples else 0.0

    print(f"Total Inference Time: {total_time:.2f} seconds")
    print(f"Average Inference Time per Sample: {avg_time:.6f} seconds")
    return total_time, avg_time, y_pred

def measure_memory_usage():
    """Print and return the current process's resident set size in MB."""
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process().memory_info().rss
    memory_usage = rss_bytes / bytes_per_mb
    print(f"Memory Usage: {memory_usage:.2f} MB")
    return memory_usage

def evaluate_model(y_test, y_pred, model_name):
    """Print and return the regression scores of ``y_pred`` against ``y_test``.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        model_name: Label used in the printed header.

    Returns:
        Tuple ``(mae, mse, r2)``.
    """
    scores = (
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    mae, mse, r2 = scores

    print(f"{model_name} Performance:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R² Score: {r2:.4f}")
    return scores


### 📌 Compare **RandomForestRegressor** Before & After Tuning

# Baseline model: re-fit under measurement, then time inference and score it
print("\n--- Original Random Forest Model Performance ---")
(train_time_orig, avg_cpu_orig, peak_ram_orig) = measure_training_time(rfr_model, X_train, y_train)
(inf_time_orig, avg_inf_time_orig, y_pred_orig) = measure_inference_time(rfr_model, X_test)
(mae_orig, mse_orig, r2_orig) = evaluate_model(y_test, y_pred_orig, "Random Forest (Before Tune)")
memory_usage_orig = measure_memory_usage()

# Tuned model: same measurements
print("\n--- Fine-Tuned Random Forest Model Performance ---")
(train_time_tuned, avg_cpu_tuned, peak_ram_tuned) = measure_training_time(best_model_rfr, X_train, y_train)
(inf_time_tuned, avg_inf_time_tuned, y_pred_tuned) = measure_inference_time(best_model_rfr, X_test)
(mae_tuned, mse_tuned, r2_tuned) = evaluate_model(y_test, y_pred_tuned, "Random Forest (After Tune)")
memory_usage_tuned = measure_memory_usage()

# Side-by-side table: one row per metric, one column per model variant
rf_comparison = {
    "Metric": [
        "Training Time (s)", "Avg CPU Usage (%)", "Peak RAM Usage (MB)",
        "Total Inference Time (s)", "Avg Inference Time per Sample (s)",
        "MAE", "MSE", "R² Score", "Memory Usage (MB)",
    ],
}
rf_comparison["RandomForest_Before"] = [
    train_time_orig, avg_cpu_orig, peak_ram_orig,
    inf_time_orig, avg_inf_time_orig,
    mae_orig, mse_orig, r2_orig, memory_usage_orig,
]
rf_comparison["RandomForest_After"] = [
    train_time_tuned, avg_cpu_tuned, peak_ram_tuned,
    inf_time_tuned, avg_inf_time_tuned,
    mae_tuned, mse_tuned, r2_tuned, memory_usage_tuned,
]

rf_comparison_df = pd.DataFrame(rf_comparison)
display(rf_comparison_df)


### 📌 Compare **XGBoost** Before & After Tuning

# Baseline model: re-fit under measurement, then time inference and score it
print("\n--- Original XGBoost Model Performance ---")
(train_time_orig_xgb, avg_cpu_orig_xgb, peak_ram_orig_xgb) = measure_training_time(xgb_model, X_train, y_train)
(inf_time_orig_xgb, avg_inf_time_orig_xgb, y_pred_orig_xgb) = measure_inference_time(xgb_model, X_test)
(mae_orig_xgb, mse_orig_xgb, r2_orig_xgb) = evaluate_model(y_test, y_pred_orig_xgb, "XGBoost (Before Tune)")
memory_usage_orig_xgb = measure_memory_usage()

# Tuned model: same measurements
print("\n--- Fine-Tuned XGBoost Model Performance ---")
(train_time_tuned_xgb, avg_cpu_tuned_xgb, peak_ram_tuned_xgb) = measure_training_time(best_model_xgb, X_train, y_train)
(inf_time_tuned_xgb, avg_inf_time_tuned_xgb, y_pred_tuned_xgb) = measure_inference_time(best_model_xgb, X_test)
(mae_tuned_xgb, mse_tuned_xgb, r2_tuned_xgb) = evaluate_model(y_test, y_pred_tuned_xgb, "XGBoost (After Tune)")
memory_usage_tuned_xgb = measure_memory_usage()

# Side-by-side table: one row per metric, one column per model variant
xgb_comparison = {
    "Metric": [
        "Training Time (s)", "Avg CPU Usage (%)", "Peak RAM Usage (MB)",
        "Total Inference Time (s)", "Avg Inference Time per Sample (s)",
        "MAE", "MSE", "R² Score", "Memory Usage (MB)",
    ],
}
xgb_comparison["XGBoost_Before"] = [
    train_time_orig_xgb, avg_cpu_orig_xgb, peak_ram_orig_xgb,
    inf_time_orig_xgb, avg_inf_time_orig_xgb,
    mae_orig_xgb, mse_orig_xgb, r2_orig_xgb, memory_usage_orig_xgb,
]
xgb_comparison["XGBoost_After"] = [
    train_time_tuned_xgb, avg_cpu_tuned_xgb, peak_ram_tuned_xgb,
    inf_time_tuned_xgb, avg_inf_time_tuned_xgb,
    mae_tuned_xgb, mse_tuned_xgb, r2_tuned_xgb, memory_usage_tuned_xgb,
]

xgb_comparison_df = pd.DataFrame(xgb_comparison)
display(xgb_comparison_df)

## Step 8.3: Comparing Both Feature Importance Before & After Tuning

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Feature importances for Random Forest models
feature_importances_rfr_before = rfr_model.feature_importances_
feature_importances_rfr_after = best_model_rfr.feature_importances_
# Feature importances for XGBoost models
feature_importances_xgb_before = xgb_model.feature_importances_
feature_importances_xgb_after = best_model_xgb.feature_importances_

# Column labels come from X_final, whose column order matches the X_processed
# matrix the models were trained on.
feature_names = X_final.columns
feature_labels = np.array(feature_names)

# Sorting indices (most important feature first)
sorted_idx_rfr_before = np.argsort(feature_importances_rfr_before)[::-1]
sorted_idx_rfr_after = np.argsort(feature_importances_rfr_after)[::-1]
sorted_idx_xgb_before = np.argsort(feature_importances_xgb_before)[::-1]
sorted_idx_xgb_after = np.argsort(feature_importances_xgb_after)[::-1]

# One panel per (model, tuning state): Random Forest on the top row, XGBoost below
fig, axes = plt.subplots(2, 2, figsize=(15, 10))  # 2 rows, 2 columns

importance_panels = [
    (axes[0, 0], feature_importances_rfr_before, sorted_idx_rfr_before,
     "Feature Importance in Random Forest (Before Tune)"),
    (axes[0, 1], feature_importances_rfr_after, sorted_idx_rfr_after,
     "Feature Importance in Random Forest (After Tune)"),
    (axes[1, 0], feature_importances_xgb_before, sorted_idx_xgb_before,
     "Feature Importance in XGBRegressor (Before Tune)"),
    (axes[1, 1], feature_importances_xgb_after, sorted_idx_xgb_after,
     "Feature Importance in XGBRegressor (After Tune)"),
]

# Draw every panel with the same recipe instead of four copy-pasted blocks
for ax, importances, order, title in importance_panels:
    ax.bar(range(len(importances)), importances[order], tick_label=feature_labels[order])
    ax.set_xticklabels(feature_labels[order], rotation=90)  # vertical labels so long names stay readable
    ax.set_xlabel("Feature")
    ax.set_ylabel("Importance")
    ax.set_title(title)

# Adjust layout and display
plt.tight_layout()
plt.show()


# Step 9: Exploring Ensembling of Two Best Tuned Algorithms Models (Run only after Step 8)

* We will explore whether Ensembling is better than our Best Tuned Model
* Conduct Performance Evaluation.



## Step 9.1: Three Ensemble Methods

(Ensemble 1): Adaptive Dynamic Weighted Averaging (Performance-Based Weights)
* This code averages predictions from two models (Random Forest and XGBoost) using weighted averaging.

* The weights are determined based on the R² scores of each model, meaning that models with higher performance (higher R² score) are given more weight.

* Weights are normalised so that they sum to 1.

Example: If XGBoost has an R² of 0.85 and Random Forest has an R² of 0.75, their weights would be:

    w_rfr = 0.75 / (0.75 + 0.85) = 0.47
    w_xgb = 0.85 / (0.75 + 0.85) = 0.53

In [None]:
# Raw weights: each model's R² on the test set (better fit → larger weight)
w_rfr, w_xgb = r2_score(y_test, y_pred_rfr), r2_score(y_test, y_pred_xgb)

# Rescale so the two weights add up to 1
total_weight = w_rfr + w_xgb
w_rfr = w_rfr / total_weight
w_xgb = w_xgb / total_weight

# Blend the two prediction vectors with those weights
y_pred_weighted = (w_rfr * y_pred_rfr) + (w_xgb * y_pred_xgb)

# Score the blended predictions
mae, mse, r2 = (
    metric(y_test, y_pred_weighted)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(
    "Ensemble Model Performance (Weighted Averaging):",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

(Ensemble 2): Static Weighted Averaging (Fixed Weights)

In [None]:
# Fixed blend weights (hand-picked; they should sum to 1).
# NOTE: this re-binds w_rfr / w_xgb, replacing the R²-based weights computed
# in the adaptive-weighting cell above.
w_rfr = 0.4  # Weight for Random Forest
w_xgb = 0.6  # Weight for XGBoost

# Weighted ensemble of the two prediction vectors
y_pred_ensemble = (w_rfr * y_pred_rfr) + (w_xgb * y_pred_xgb)

# Evaluate on the test set
mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
r2_ensemble = r2_score(y_test, y_pred_ensemble)

print("Weighted Ensemble Model:")
print(f"Mean Absolute Error (MAE): {mae_ensemble:.2f}")
print(f"Mean Squared Error (MSE): {mse_ensemble:.2f}")  # label previously printed a doubled colon
print(f"R² Score: {r2_ensemble:.4f}")

(Ensemble 3): Stacking Ensemble (Complex)

* The predictions from Random Forest and XGBoost (base models) are used as features for a meta-model (Linear Regression).

* The meta-model learns how to combine base model predictions to make a final prediction.




In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor

# Take each base learner's hyperparameters from ITS OWN tuned model (Step 8).
# The shared `best_params` variable cannot be used here: it holds whichever
# model was retrained last, so filtering it by key name would give one model
# the other's `n_estimators` / `max_depth` and silently drop any tuned
# parameter that is not in the key list (e.g. gamma, reg_alpha, reg_lambda).
rf_best_params = best_model_rfr.get_params()
xgb_best_params = best_model_xgb.get_params()

# Define Base Models as fresh, unfitted estimators.
# get_params() already carries random_state=42 from Step 8, so it is not passed again.
base_models = [
    ('rf', RandomForestRegressor(**rf_best_params)),
    ('xgb', XGBRegressor(**xgb_best_params))
]

# Meta Model (Final Predictor): learns how to combine the base predictions
meta_model = LinearRegression()

# Stacking Regressor: the meta model is trained on 5-fold cross-validated
# predictions of the base models
stacking_reg = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)
stacking_reg.fit(X_train, y_train)

# Predict on Test Data
y_pred_stack = stacking_reg.predict(X_test)

# Evaluate Performance
mae_stacked = mean_absolute_error(y_test, y_pred_stack)
mse_stacked = mean_squared_error(y_test, y_pred_stack)
r2_stacked = r2_score(y_test, y_pred_stack)

print("Stacking Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_stacked:.2f}")
print(f"Mean Squared Error (MSE): {mse_stacked:.2f}")
print(f"R² Score: {r2_stacked:.4f}")

## 9.2: Compare Metrics for the Three Ensembling Methods against the Tuned Individual Models

In [None]:
import time
import psutil
import tracemalloc
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def measure_execution_time(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and measure its time, CPU load and memory.

    Args:
        func: Callable to profile.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Tuple ``(execution_time, avg_cpu, peak_ram, memory_usage, result)``:
        wall-clock seconds, system-wide CPU utilisation (%) over the call,
        peak memory (MB) traced by ``tracemalloc`` during the call, the
        process's resident set size (MB) after the call, and ``func``'s return
        value.
    """
    # psutil.cpu_percent(interval=None) reports utilisation since the PREVIOUS
    # call, so this first call only resets the baseline; its value is discarded.
    psutil.cpu_percent(interval=None)

    tracemalloc.start()
    try:
        # perf_counter is a monotonic, high-resolution clock meant for timing
        start_time = time.perf_counter()
        result = func(*args, **kwargs)  # Run the function
        execution_time = time.perf_counter() - start_time

        # Utilisation accumulated since the baseline call, i.e. during func
        avg_cpu = psutil.cpu_percent(interval=None)
        _, peak_memory = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if func raises
        tracemalloc.stop()

    peak_ram = peak_memory / (1024 * 1024)  # Convert bytes to MB

    # Measure memory usage after execution
    process = psutil.Process()
    memory_usage = process.memory_info().rss / (1024 * 1024)  # Convert bytes to MB

    return execution_time, avg_cpu, peak_ram, memory_usage, result

def _benchmark_predictions(predict_fn):
    """Profile ``predict_fn`` and score its predictions against ``y_test``.

    ``predict_fn`` is called twice: once under ``measure_execution_time``
    (execution time, CPU, RAM) and once more for a plain wall-clock inference
    timing.

    Returns:
        ``(metrics, y_pred)``: the row to store in ``comparison_results`` and
        the predictions from the second call.
    """
    exec_time, avg_cpu, peak_ram, memory_usage, _ = measure_execution_time(predict_fn)

    # Measure inference time
    inf_time_start = time.perf_counter()
    y_pred = predict_fn()
    total_inference_time = time.perf_counter() - inf_time_start

    metrics = {
        "Execution Time (s)": exec_time,
        "Average CPU Usage (%)": avg_cpu,
        "Peak RAM Usage (MB)": peak_ram,
        "Total Inference Time (s)": total_inference_time,
        "Average Inference Time per Sample (s)": total_inference_time / len(y_test),
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R² Score": r2_score(y_test, y_pred),
        "Memory Usage (MB)": memory_usage,
    }
    return metrics, y_pred


def _scores(metrics):
    """Return the ``(MAE, MSE, R²)`` triple stored in a metrics row."""
    return metrics["MAE"], metrics["MSE"], metrics["R² Score"]


# Tracking Metrics for the Ensemble Methods and Tuned Models
comparison_results = {}

# --- Tuned Random Forest ---
comparison_results["Tuned Random Forest"], y_pred_rfr = _benchmark_predictions(
    lambda: best_model_rfr.predict(X_test)
)
mae_rfr, mse_rfr, r2_rfr = _scores(comparison_results["Tuned Random Forest"])

# --- Tuned XGBoost ---
comparison_results["Tuned XGBoost"], y_pred_xgb = _benchmark_predictions(
    lambda: best_model_xgb.predict(X_test)
)
mae_xgb, mse_xgb, r2_xgb = _scores(comparison_results["Tuned XGBoost"])

# --- Adaptive Dynamic Weighted Averaging ---
# Recompute the R²-based weights here. The global w_rfr / w_xgb cannot be
# trusted: the static-weighting cell re-binds them to 0.4 / 0.6, which made
# this row a duplicate of "Static Weighted".
adaptive_total = r2_rfr + r2_xgb
w_rfr_adaptive = r2_rfr / adaptive_total
w_xgb_adaptive = r2_xgb / adaptive_total
comparison_results["Adaptive Weighted"], y_pred_weighted = _benchmark_predictions(
    lambda: (w_rfr_adaptive * y_pred_rfr) + (w_xgb_adaptive * y_pred_xgb)
)
mae_weighted, mse_weighted, r2_weighted = _scores(comparison_results["Adaptive Weighted"])

# --- Static Weighted Averaging ---
comparison_results["Static Weighted"], y_pred_ensemble = _benchmark_predictions(
    lambda: (0.4 * y_pred_rfr) + (0.6 * y_pred_xgb)
)
mae_ensemble, mse_ensemble, r2_ensemble = _scores(comparison_results["Static Weighted"])

# --- Stacking Ensemble ---
comparison_results["Stacking Ensemble"], y_pred_stack = _benchmark_predictions(
    lambda: stacking_reg.predict(X_test)
)
mae_stacked, mse_stacked, r2_stacked = _scores(comparison_results["Stacking Ensemble"])

# Convert to DataFrame for better visualization (one row per model / ensemble)
comparison_results_df = pd.DataFrame.from_dict(comparison_results, orient="index")

# Display results
display(comparison_results_df)


# Step 10: Save & Deploy

* This is if we decide Stack Ensembling Model is the best

## Step 10.1: Save the Model

In [None]:
import joblib

# Save the model: serialise the fitted stacking ensemble to 'stacking_regressor.pkl'.
# The path is relative, so the file is written to the current working directory.
joblib.dump(stacking_reg, 'stacking_regressor.pkl')

## Step 10.2: Load the Trained Model

In [None]:
# Load the trained model back from disk (re-binds `stacking_reg`).
# NOTE: joblib.load unpickles arbitrary Python objects — only load files that
# you created yourself or otherwise trust.
stacking_reg = joblib.load('stacking_regressor.pkl')

## Step 10.3: Prepare New Data for Prediction

In [None]:
import pandas as pd

# Pre-prepared single-row input, used when the user picks option 2 below
# (or enters an invalid choice). Each value is a one-element list so the dict
# converts directly into a one-row DataFrame.
preset = { "Age": [30],
        "YrsExp": [5],
        "EduLevel": ["BachelorDegree"],
        "Gender": ["Male"],
        "Industry": ["manager-product-operations"],
        "Generation": ["Millennials"],
        "Seniority": ["Junior"]
    }

# Menu of industry labels, keyed by the number the user types at the prompt.
# NOTE(review): these labels presumably must match the 'Industry' values seen
# during training — confirm against the training data.
industry_titles = {
    1: 'manager-product-operations',
    2: 'developer-end-back',
    3: 'senior-engineer-sales',
    4: 'analyst-data-financial',
    5: 'engineer-full-stack',
    6: 'director-of-marketing',
    7: 'marketing-manager-senior',
    8: 'scientist-data-senior',
    9: 'software-engineer-manager'
}

def get_industry_title(number):
    """Look up the industry label for a menu number.

    Returns the label from ``industry_titles``, or the message
    "Job title not found for this number." when the number is not a key.
    """
    return industry_titles.get(number, "Job title not found for this number.")


def derive_generation(age):
    """Map an age to a generation label via the birth year ``2023 - age``.

    Birth-year bands (inclusive upper bounds): up to 1943 SilentGeneration,
    up to 1964 BabyBoomer, up to 1979 GenX, up to 1994 Millennials, later GenZ.
    """
    birth_year = 2023 - age

    # (last birth year of the band, label), oldest band first
    bands = (
        (1943, "SilentGeneration"),
        (1964, "BabyBoomer"),
        (1979, "GenX"),
        (1994, "Millennials"),
    )
    for last_year, label in bands:
        if birth_year <= last_year:
            return label
    return "GenZ"

def derive_seniority(yrs_exp):
    """Map years of experience to a seniority label.

    Bands (inclusive upper bounds): up to 3 Entry, up to 6 Junior, up to 10 Mid,
    up to 15 Senior, above 15 Executive.

    The bands are checked cumulatively so that fractional values lying between
    two integer bands (e.g. 3.5 or 10.5) fall into the next band up instead of
    slipping through every check and being labelled "Executive". Results for
    whole-number inputs are unchanged.
    """
    if yrs_exp <= 3:
        return "Entry"
    elif yrs_exp <= 6:
        return "Junior"
    elif yrs_exp <= 10:
        return "Mid"
    elif yrs_exp <= 15:
        return "Senior"
    else:
        return "Executive"

# Ask the user how they want to proceed
print("\nChoose Data Input Method:")
print("1 - Manually Enter Your Own Data")
print("2 - Load Pre-Prepared Data from Code")

choice = input("Enter 1 or 2: ").strip()

if choice == "1":
    # Manual User Input
    age = int(input("Enter Age: "))
    yrs_exp = int(input("Enter Years of Experience: "))
    # Strip stray whitespace so e.g. "PhD " still matches the expected category
    edu_level = input("Enter Education Level (HighSchool, BachelorDegree, MasterDegree, PhD): ").strip()
    gender = input("Enter Gender (Male/Female): ").strip()
    display(industry_titles)
    industry_input = int(input("Enter a number of your closest industry (1-9): "))

    # Reject an unknown menu number here: get_industry_title() would otherwise
    # hand back its "not found" sentence, which would be stored as the Industry
    # value and only fail later inside the encoder.
    if industry_input not in industry_titles:
        raise ValueError(
            f"Industry number must be one of {sorted(industry_titles)}, got {industry_input}"
        )
    industry = get_industry_title(industry_input)

    # Auto-derive 'Generation' and 'Seniority'
    generation = derive_generation(age)
    seniority = derive_seniority(yrs_exp)

    user_data = {
        "Age": [age],
        "YrsExp": [yrs_exp],
        "EduLevel": [edu_level],
        "Gender": [gender],
        "Industry": [industry],
        "Generation": [generation],
        "Seniority": [seniority]
    }

    print("\n User Data Entered Successfully!\n")

elif choice == "2":
    # Load Pre-Prepared Data
    user_data = preset

    print("\n Loaded Pre-Prepared Data from Code!\n")

else:
    print("Invalid choice. Defaulting to pre-prepared data.")
    user_data = preset

# Convert user input to a one-row DataFrame (builds a new frame; `preset` is not modified)
df_new = pd.DataFrame(user_data)

# Display the selected data
print("\n Final Input Data:")
print(df_new)

Preprocess New Data (Apply Same Encoding and Scaling)

In [None]:
# Preprocess the user-supplied row(s) in df_new into the feature layout of
# X_final: ordinal codes for EduLevel/Generation/Seniority, one-hot columns for
# Gender and Industry, standardized Age/YrsExp, then column alignment.
# NOTE(review): this cell overwrites columns of df_new in place, so re-running
# it without first re-running the input cell feeds already-encoded values back
# into the encoders instead of the original string labels.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Rebuild label encoders by hand: nothing is loaded or fitted here; classes_ is
# assigned directly so each label is encoded by the hand-written order below.
# NOTE(review): assumes this order reproduces the codes used during training,
# and that the inputs are strings — TODO confirm against the training cell.
le_education = LabelEncoder()
le_education.classes_ = np.array(['HighSchool', 'BachelorDegree', 'MasterDegree', 'PhD'])
df_new['EduLevel'] = le_education.transform(df_new['EduLevel'])

le_generation = LabelEncoder()
le_generation.classes_ = np.array(['SilentGeneration', 'BabyBoomer', 'GenX', 'Millennials', 'GenZ'])
df_new['Generation'] = le_generation.transform(df_new['Generation'])

le_seniority = LabelEncoder()
le_seniority.classes_ = np.array(['Entry', 'Junior', 'Mid', 'Senior', 'Executive'])
df_new['Seniority'] = le_seniority.transform(df_new['Seniority'])

# OneHotEncode Gender
# Categories are listed explicitly, so fitting on the new data does not change
# the output columns; drop='first' drops 'Female', leaving only Gender_Male.
ohe_gender = OneHotEncoder(categories=[['Female', 'Male']], drop='first', sparse_output=False)
gender_encoded = ohe_gender.fit_transform(df_new[['Gender']])  # Categories are fixed above, not learned from df_new
gender_columns = ohe_gender.get_feature_names_out(['Gender'])
df_gender_encoded = pd.DataFrame(gender_encoded, columns=gender_columns)

# OneHotEncode Industry
# Explicit category list again; with drop='first' the first entry
# ('manager-product-operations') gets no column of its own.
# NOTE(review): the list and its order are assumed to match the training
# encoder — TODO confirm.
ohe_industry = OneHotEncoder(categories=[[
    'manager-product-operations', 'developer-end-back',
    'senior-engineer-sales', 'analyst-data-financial',
    'engineer-full-stack', 'director-of-marketing',
    'marketing-manager-senior', 'scientist-data-senior',
    'software-engineer-manager'
]], drop='first', sparse_output=False)

industry_encoded = ohe_industry.fit_transform(df_new[['Industry']])  # Categories are fixed above, not learned from df_new
industry_columns = ohe_industry.get_feature_names_out(['Industry'])
df_industry_encoded = pd.DataFrame(industry_encoded, columns=industry_columns)

# **Ensure all expected columns are present (to match training features)**
df_gender_encoded = df_gender_encoded.reindex(columns=['Gender_Male'], fill_value=0)  # Only 'Male' exists due to drop='first'
# industry_columns are the names this frame was just built with, so this
# reindex leaves the frame unchanged.
df_industry_encoded = df_industry_encoded.reindex(columns=industry_columns, fill_value=0)

# Scale numerical features
# NOTE(review): this fits a brand-new StandardScaler on df_new itself rather
# than reusing the scaler fitted on the training data. With a single input row
# (the manual-entry path builds exactly one) each column's mean equals its only
# value, so the standardized Age and YrsExp both come out as 0 and the entered
# numbers never reach the model. Use the training scaler's transform() here;
# its name is not visible from this cell. This line also rebinds the global
# name `scaler`, which would discard a training scaler stored under that name.
scaler = StandardScaler()
df_new[['Age', 'YrsExp']] = scaler.fit_transform(df_new[['Age', 'YrsExp']])

# **Combine processed data**
# Column-wise concat; the pieces are aligned on their row index.
df_processed = pd.concat([
    df_new[['Age', 'YrsExp', 'EduLevel', 'Generation', 'Seniority']],
    df_gender_encoded, df_industry_encoded
], axis=1)

# **Ensure final feature order matches training data**
# reindex also drops any column not in X_final and adds any X_final column
# that is missing here, filled with 0, so a name mismatch is silently masked.
df_processed = df_processed.reindex(columns=X_final.columns, fill_value=0)

Check for missing or extra features between the user input and the features the saved model was trained on

In [None]:
# Get expected feature names from training data
expected_features = X_final.columns.tolist()

# Get actual feature names from the new input
actual_features = df_processed.columns.tolist()

# Find missing / extra features, reported in their original column order.
# (Converting a set difference to a list gives an arbitrary order that can
# differ between runs; sets are used here only for fast membership tests.)
expected_lookup = set(expected_features)
actual_lookup = set(actual_features)
missing_features = [col for col in expected_features if col not in actual_lookup]
extra_features = [col for col in actual_features if col not in expected_lookup]

# NOTE(review): the preprocessing cell already reindexes df_processed to
# X_final.columns, so both lists are expected to be empty at this point; to
# detect real mismatches this comparison would need to run before that reindex.
print(f"Missing feature(s): {missing_features}")
print(f"Extra feature(s): {extra_features}")


## Step 10.4: Make Predictions

*  Once preprocessing is complete, pass the transformed data to your trained model



In [None]:
# Run the trained stacking model on the processed input, passed as a NumPy array
predicted_salary = stacking_reg.predict(df_processed.to_numpy())

# Only the first predicted value is reported below
new_prediction = predicted_salary[0]

# Absolute distance between every actual test-set salary and the new prediction
errors = np.abs(y_test - new_prediction)

# Position of the nearest actual salary, then the salary itself
closest_index = np.argmin(errors)
closest_actual_salary = y_test.iloc[closest_index]

# Gap to that nearest actual salary, as a percentage of it
percentage_error = abs(new_prediction - closest_actual_salary) / closest_actual_salary * 100

# Print results, one line each
report_lines = [
    f"Predicted Salary: {new_prediction:,.2f}",
    f"Closest Actual Salary: {closest_actual_salary:,.2f}",
    f"Percentage Error: {percentage_error:.2f}%",
]
print("\n".join(report_lines))


## Step 10.5: Visualisation

ScatterPlot to Demonstrate Accuracy

*   Blue dots → Predictions from the test set
*   Red dashed line → Ideal prediction line (y = x)
*   Red 'X' marker → The newly predicted salary



In [None]:
# Model predictions for the held-out test features
y_pred_test = stacking_reg.predict(X_test)

# Actual vs. predicted scatter, drawn through an explicit Axes object
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(y_test, y_pred_test, alpha=0.4, label="Model Predictions")

# Diagonal reference line spanning the range of actual salaries
salary_span = [y_test.min(), y_test.max()]
scatter_ax.plot(salary_span, salary_span, 'r--', label="Perfect Prediction (y = x)")

# The new prediction has no known actual value, so it is placed on the
# diagonal using the predicted value for both coordinates
scatter_ax.scatter(
    predicted_salary[0], predicted_salary[0],
    color='red', s=150, marker='x', label="New Prediction",
)

# Labels, title, legend and grid
scatter_ax.set_xlabel("Actual Salary")
scatter_ax.set_ylabel("Predicted Salary")
scatter_ax.set_title("Predicted vs. Actual Salary")
scatter_ax.legend()
scatter_ax.grid(True)

# Show plot
plt.show()

2D histogram (hexbin plot) to show density:

In [None]:
# Density view of the same actual-vs-predicted test data as the scatter plot:
# each hexagon is shaded by how many test points fall inside it.
plt.figure()  # start a fresh figure so this cell never draws onto an earlier one
plt.hexbin(y_test, y_pred_test, gridsize=50, cmap='Blues', alpha=0.8)
plt.colorbar(label='Density of Points')

# Axis labels and title so the figure stands on its own (same axes as the scatter plot)
plt.xlabel("Actual Salary")
plt.ylabel("Predicted Salary")
plt.title("Density of Predicted vs. Actual Salary")

# Render explicitly so the cell output is the figure, not the Colorbar repr
plt.show()

In [None]:
# Histogram of `errors` with the new prediction's own gap marked on it.
# NOTE(review): `errors` is computed in the prediction cell as
# |y_test - predicted_salary[0]|, i.e. the distance of every actual test salary
# from the single new prediction, not per-sample model errors; confirm the
# title and labels below describe what is intended.
error_fig, error_ax = plt.subplots(figsize=(8, 5))
error_ax.hist(errors, bins=30, color='skyblue', alpha=0.7)

# Gap between the new prediction and its closest actual test salary
new_prediction_gap = np.abs(predicted_salary[0] - closest_actual_salary)
error_ax.axvline(
    x=new_prediction_gap,
    color='red', linestyle='dashed', linewidth=2,
    label="New Prediction Error",
)

error_ax.set_xlabel("Absolute Error")
error_ax.set_ylabel("Frequency")
error_ax.set_title("Distribution of Prediction Errors")
error_ax.legend()
plt.show()