In [1]:
# Mount Google Drive at /content/drive; the dataset and saved-model paths used by
# the later cells and the generated Streamlit pages all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import matplotlib.pyplot as plt # Import matplotlib here for SHAP plots

In [2]:
%pip install streamlit



In [3]:
%pip install lime



In [4]:
import os

# Sanity check: show what is available in the shared Datasets folder on Drive.
datasets_path = '/content/drive/MyDrive/Datasets/'
if not os.path.exists(datasets_path):
    print(f"Directory not found: {datasets_path}")
else:
    print(f"Contents of {datasets_path}:")
    # Print each entry as a full path, in the order os.listdir reports them.
    for entry_name in os.listdir(datasets_path):
        print(os.path.join(datasets_path, entry_name))

Contents of /content/drive/MyDrive/Datasets/:
/content/drive/MyDrive/Datasets/merged_all_features_final.csv
/content/drive/MyDrive/Datasets/.ipynb_checkpoints
/content/drive/MyDrive/Datasets/X_train_processed_ensemble.csv
/content/drive/MyDrive/Datasets/y_train_imputed.csv
/content/drive/MyDrive/Datasets/y_train_processed_ensemble.csv


In [5]:
import os

# Create the 'pages' directory that the later %%writefile cells write the
# individual Streamlit page scripts into.
pages_dir = "pages"
# exist_ok=True makes this a no-op when the directory is already present,
# so the cell can be re-run safely.
os.makedirs(pages_dir, exist_ok=True)
# Note: this message is printed whether the directory was newly created or already existed.
print(f"Created directory: {pages_dir}")

Created directory: pages


In [6]:
%%writefile app.py
# -*- coding: utf-8 -*-
import streamlit as st
import os

# Page-level settings for the landing page (browser tab title, icon, wide layout).
st.set_page_config(
    page_title="Global Pay Insight",
    page_icon="🌍",
    layout="wide"
)

# Heading shown in the sidebar.
st.sidebar.title("Navigation")

# Landing-page title and introduction describing the three pages of the app.
st.title("🌍 Global Pay Insight")
st.markdown("""
Welcome to Global Pay Insight! This application helps you predict your potential salary based on a comprehensive dataset and explore insights into global pay trends and potential disparities.

Use the navigation on the left to:
- **Predict Your Salary**: Get a personalized salary estimate and understand the key factors influencing it.
- **Explore Pay Gaps & Trends**: Visualize salary averages across different demographics and professional categories.
- **About Our Model**: Learn about the machine learning model powering this application, its data sources, and limitations.
"""
)

# Disclaimer: predictions are informational only.
st.info("Please note: This application is for informational purposes based on available data and models. Actual salaries can vary widely due to many factors not included here.")

# Ensuring required directories exist if needed by page scripts (though pages should handle their own needs)
# os.makedirs('/content/drive/MyDrive/saved_models', exist_ok=True)
# os.makedirs('/content/drive/MyDrive/Datasets', exist_ok=True)

Overwriting app.py


In [50]:
%%writefile pages/1_Predict_Your_Salary.py
# -*- coding: utf-8 -*-
import streamlit as st
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import re


# Page-level settings for the salary prediction page.
st.set_page_config(page_title="Predict Your Salary", page_icon="💰", layout="wide")

# --- Configuration and Data Loading ---
# Absolute Google Drive locations; they only resolve after Drive is mounted at /content/drive.
# Serialized ensemble model used for the salary prediction.
MODEL_PATH = '/content/drive/MyDrive/saved_models/voting_regressor_ensemble.joblib'
# Serialized fairness adjustment factors (optional: the page only warns if they cannot be loaded).
FACTORS_PATH = '/content/drive/MyDrive/saved_models/adjustment_factors.joblib'
# Processed training features; its columns define the feature list the model input must match.
TRAIN_DATA_PATH = '/content/drive/MyDrive/Datasets/X_train_processed_ensemble.csv'

@st.cache_resource
def load_model(path):
    """Load the serialized model stored at `path` with joblib.

    Returns the deserialized object, or None after reporting the failure
    with st.error when loading raises. joblib files are pickle-based, so
    `path` should only ever point at a trusted file.
    """
    try:
        return joblib.load(path)
    except Exception as exc:
        st.error(f"Error loading model from {path}: {exc}")
    return None

@st.cache_resource
def load_adjustment_factors(path):
    """Load the fairness adjustment factors stored at `path` with joblib.

    Returns the deserialized object, or None after showing an st.warning
    when loading raises (the factors are treated as optional by the page).
    joblib files are pickle-based, so `path` should only point at a trusted file.
    """
    try:
        return joblib.load(path)
    except Exception as exc:
        st.warning(f"Could not load fairness adjustment factors from {path}: {exc}")
    return None

@st.cache_resource
def load_processed_training_data(path):
    """Read the processed training features CSV at `path` into a DataFrame.

    A leftover 'Unnamed: 0' index column is dropped when present.
    Returns None (after reporting via st.error) when the file is missing
    or cannot be read.
    """
    # Guard: nothing to read when the file is not there.
    if not os.path.exists(path):
        st.error(f"Processed training data not found at {path}")
        return None

    try:
        frame = pd.read_csv(path)
        if 'Unnamed: 0' in frame.columns:
            frame = frame.drop(columns=['Unnamed: 0'])
    except Exception as exc:
        st.error(f"Error loading processed training data from {path}: {exc}")
        return None
    return frame

# Load the model, the optional fairness factors and the processed training features.
voting_regressor = load_model(MODEL_PATH)
adjustment_factors_by_sensitive_attribute = load_adjustment_factors(FACTORS_PATH)
X_train_processed_ensemble = load_processed_training_data(TRAIN_DATA_PATH)

# The model's feature list is every column of the processed training CSV, in file order.
if X_train_processed_ensemble is None:
    model_features = []
else:
    model_features = X_train_processed_ensemble.columns.tolist()

# Without a model and a non-empty feature list the page cannot do anything useful.
components_ready = voting_regressor is not None and bool(model_features)
if not components_ready:
    st.error("Essential components could not be loaded. Please check file paths and ensure the model and data are correctly formatted.")
    st.stop()

# Candidate PCA NLP feature names, pca_nlp_0 through pca_nlp_14, in display order.
pca_nlp_importance_order = [f'pca_nlp_{i}' for i in range(15)]

# Keep only the candidates that really are model features (order preserved).
pca_nlp_features_ordered = [name for name in pca_nlp_importance_order if name in model_features]

# Generic, human-friendly labels; assigned positionally to the PCA NLP features above.
generic_nlp_labels = [
    "Job Complexity Factor 1",
    "Role Scope Dimension",
    "Skill Specialization Score",
    "Duties Component (Main)",
    "Tasks Component (Secondary)",
    "Text Pattern 1",
    "Job Description Feature (PCA-7)",
    "Text Pattern 2",
    "Job Description Feature (PCA-9)",
    "Text Pattern 3",
    "Job Description Feature (PCA-11)",
    "Job Complexity Factor 2",
    "Text Pattern 4",
    "Skill Specialization Score 2",
    "Duties Component (Secondary)",
]
# Pair each available PCA NLP feature with its label; zip stops at the shorter
# sequence, so features beyond the number of labels simply get no entry.
nlp_label_map = dict(zip(pca_nlp_features_ordered, generic_nlp_labels))


def get_imputer(train_data):
    """Fit a median SimpleImputer on the numeric columns of `train_data`.

    Only columns selected by `select_dtypes(include=np.number)` are used, so
    object and boolean columns never reach the imputer.

    Returns:
        (fitted imputer, Index of the numeric column names it was fitted on)
    """
    numeric_columns = train_data.select_dtypes(include=np.number).columns
    median_imputer = SimpleImputer(strategy='median').fit(train_data[numeric_columns])
    return median_imputer, numeric_columns


# Fitted once at import time; both results are handed to preprocess_user_input at prediction time.
imputer_for_predict_fn, numerical_features_for_imputation = get_imputer(X_train_processed_ensemble)


# Prefixes of the one-hot encoded feature families shown in the UI.
_OHE_DISPLAY_PREFIXES = (
    'Gender_', 'Country_', 'continent_name_',
    'ManageStaff_', 'EducationIsComputerRelated_',
    'EmploymentSector_', 'LookingForAnotherJob_',
    'CareerPlansThisYear_', 'JobTitle_', 'EmploymentStatus_',
)


def clean_feature_name(feature_name):
    """Turn a model feature (column) name into a display label.

    - One-hot encoded features (any prefix in _OHE_DISPLAY_PREFIXES): the whole
      prefix is removed, underscores become spaces and the text is title-cased,
      e.g. 'continent_name_North_America' -> 'North America'.
    - 'pca_nlp_*' features: looked up in the module-level nlp_label_map, falling
      back to the title-cased name.
    - Anything else: underscores become spaces and the text is title-cased.

    Args:
        feature_name: column name as it appears in the model features.

    Returns:
        The display label as a string.
    """
    for prefix in _OHE_DISPLAY_PREFIXES:
        if feature_name.startswith(prefix):
            # Strip the full matched prefix. Splitting on the first underscore
            # is wrong for prefixes that contain an underscore themselves
            # ('continent_name_' would leave a stray 'name_' in the label).
            cleaned = feature_name[len(prefix):].replace('_', ' ').title()
            # Shorten the long survey answer '1__this_is_the_only_company...'.
            cleaned = cleaned.replace('  This Is The Only Company Where I Ve Had This Kind Of Position ', ' (1st Company in this Role)')
            return cleaned

    if feature_name.startswith('pca_nlp_'):
        # Using the generic label mapping for PCA NLP features
        return nlp_label_map.get(feature_name, feature_name.replace('_', ' ').title())

    return feature_name.replace('_', ' ').title()


def get_ohe_options(prefix, model_features):
    """List the display options for one one-hot encoded feature family.

    Every name in `model_features` that starts with `prefix` is converted with
    clean_feature_name; duplicates collapse and the result is sorted so the
    selectbox order is stable. An empty list is returned when no feature
    carries the prefix.
    """
    return sorted({
        clean_feature_name(name)
        for name in model_features
        if name.startswith(prefix)
    })

def _find_ohe_column(prefix, display_value, model_features):
    """Return the first feature under `prefix` whose display label equals `display_value`, else None."""
    for col in model_features:
        if col.startswith(prefix) and clean_feature_name(col) == display_value:
            return col
    return None


def preprocess_user_input(user_input_data, model_features, X_train_df, imputer, numerical_cols_for_imputation):
    """Convert the values collected from the UI into a one-row model input.

    Every model feature starts at 0; numeric inputs are copied over, the selected
    option of each one-hot encoded family is set to 1, PCA NLP slider values are
    copied, and the fitted imputer is applied to the numeric columns (this only
    changes cells that are NaN, e.g. an unmapped company size).

    Args:
        user_input_data: dict of UI values keyed by the friendly names used on the page.
        model_features: ordered list of column names the model expects.
        X_train_df: unused; kept so existing callers keep working.
        imputer: imputer already fitted on `numerical_cols_for_imputation`.
        numerical_cols_for_imputation: the columns the imputer was fitted on.

    Returns:
        A single-row DataFrame whose columns are exactly `model_features`, in order.
    """
    processed_input = {feature: 0 for feature in model_features}

    # Numerical features (direct mapping)
    years = user_input_data.get('YearsOfExperience')
    if years is not None:
        if 'YearsOfExperience' in model_features:
            processed_input['YearsOfExperience'] = years
        # Squared term, only when the model was trained with it.
        if 'YearsOfExperience_sq' in model_features:
            processed_input['YearsOfExperience_sq'] = years ** 2

    # 'CompanySize': friendly bucket label -> representative numeric value.
    company_size_mapping = {'1-5': 3, '6-99': 52.5, '100-249': 174.5, '250-499': 374.5, '500-999': 749.5, '1000 or more': 1500}
    company_size_label = user_input_data.get('CompanySize_friendly')
    if company_size_label is not None and 'CompanySize' in model_features:
        # An unknown label becomes NaN and is filled by the imputer below.
        processed_input['CompanySize'] = company_size_mapping.get(company_size_label, np.nan)

    # 'TeamSize' (direct numerical input)
    team_size = user_input_data.get('TeamSize')
    if team_size is not None and 'TeamSize' in model_features:
        processed_input['TeamSize'] = team_size

    # 'HasCertifications' is a single 0/1 column rather than a one-hot family.
    has_certifications = user_input_data.get('HasCertifications')
    if has_certifications is not None and 'HasCertifications' in model_features:
        processed_input['HasCertifications'] = 1 if has_certifications == 'Yes' else 0

    # Friendly input key -> prefix of the one-hot encoded columns in the model features.
    ohe_mappings = {
        'Gender': 'Gender_',
        'Country': 'Country_',
        'Continent': 'continent_name_',
        'Manages Staff': 'ManageStaff_',
        'Is Education Computer Related?': 'EducationIsComputerRelated_',
        'Employment Sector': 'EmploymentSector_',
        'Are you Looking for Another Job?': 'LookingForAnotherJob_',
        'Your Career Plans This Year': 'CareerPlansThisYear_',
        'Your Job Title': 'JobTitle_',
        'Employment Status': 'EmploymentStatus_'
    }

    for friendly_key, prefix in ohe_mappings.items():
        user_value = user_input_data.get(friendly_key)
        if user_value is None:
            continue
        # The UI options are produced by clean_feature_name, which is lossy
        # (title-casing, underscores -> spaces), so the column name cannot be
        # rebuilt reliably from the label. Instead, find the column whose
        # cleaned name equals the selection.
        ohe_col = _find_ohe_column(prefix, user_value, model_features)
        if ohe_col is not None:
            processed_input[ohe_col] = 1
        else:
            # Surface the problem instead of silently predicting without this input.
            st.warning(f"Selected value '{user_value}' for '{friendly_key}' does not match any model feature and was ignored.")

    # PCA NLP features come straight from the sliders.
    for pca_col in pca_nlp_features_ordered:
        if user_input_data.get(pca_col) is not None:
            processed_input[pca_col] = user_input_data[pca_col]

    # Converting to DataFrame
    user_input_df = pd.DataFrame([processed_input], columns=model_features)

    # Applying imputation only to the numerical columns that were used for fitting
    user_input_df_numerical = user_input_df[numerical_cols_for_imputation]
    user_input_df_imputed_numerical = pd.DataFrame(
        imputer.transform(user_input_df_numerical),
        columns=numerical_cols_for_imputation,
        index=user_input_df_numerical.index
    )

    # Re-attach the columns the imputer did not see (e.g. boolean one-hot columns).
    non_numerical_cols = user_input_df.columns.difference(numerical_cols_for_imputation)
    user_input_df_final = pd.concat([user_input_df_imputed_numerical, user_input_df[non_numerical_cols]], axis=1)

    # Restore the exact column order the model was trained with.
    return user_input_df_final[model_features]


# --- Streamlit UI ---
st.header("💰 Salary Prediction Tool")
st.markdown("Enter your details for a personalized salary estimate")

# Collects every widget value; handed to preprocess_user_input when predicting.
user_input_data = {}

# Section 1: Identity and Location
# One entry per column: (key in user_input_data, which is also the widget label;
# one-hot prefix; warning shown when the model has no such columns).
identity_fields = [
    ('Gender', 'Gender_', "Gender options not found in model features. Please check data preprocessing."),
    ('Country', 'Country_', "Country options not found in model features. Please check data preprocessing."),
    ('Continent', 'continent_name_', "Continent options not found in model features. Please check data preprocessing."),
]
with st.container():
    st.subheader("🌍 Your Identity")
    for identity_column, (field_key, ohe_prefix, missing_message) in zip(st.columns(3), identity_fields):
        with identity_column:
            choices = get_ohe_options(ohe_prefix, model_features)
            if choices:
                user_input_data[field_key] = st.selectbox(field_key, choices)
            else:
                st.warning(missing_message)
                user_input_data[field_key] = None

# Section 2: Professional Details
with st.container():
    st.subheader("💼 Professional Information")
    col1, col2 = st.columns(2)

    # Columns with a numeric dtype in the training data; only these can supply
    # min/max bounds for the numeric widgets. Computed once for the section.
    numeric_training_columns = X_train_processed_ensemble.select_dtypes(include=np.number).columns

    with col1:
        if 'YearsOfExperience' in numeric_training_columns:
            years_min = X_train_processed_ensemble['YearsOfExperience'].min()
            years_max = X_train_processed_ensemble['YearsOfExperience'].max()
            # An all-NaN column yields NaN bounds; fall back to a generic 0-50 range.
            min_years = float(years_min) if not np.isnan(years_min) else 0.0
            max_years = float(years_max) if not np.isnan(years_max) else 50.0
            user_input_data['YearsOfExperience'] = st.number_input(
                "Years in Role",
                min_value=min_years,
                max_value=max_years,
                # Clamp the default into [min_years, max_years]: number_input
                # rejects a default value that lies outside its bounds.
                value=min(max(5.0, min_years), max_years),
                step=0.1 # Allow for fractional years
            )
        else:
            st.warning("YearsOfExperience column not found or is not numerical in training data. Defaulting to a generic input.")
            user_input_data['YearsOfExperience'] = st.number_input("Years in Role", min_value=0.0, max_value=50.0, value=5.0, step=0.1)

        # Friendly bucket labels offered in the UI; preprocess_user_input maps the
        # chosen label to a numeric value.
        company_size_mapping = {
            '1-5': 3, '6-99': 52.5, '100-249': 174.5,
            '250-499': 374.5, '500-999': 749.5, '1000 or more': 1500
        }
        # Only offer the widget when the model actually has a 'CompanySize' feature.
        if 'CompanySize' in model_features:
            user_input_data['CompanySize_friendly'] = st.selectbox( # Use friendly name for UI
                "Company Size",
                options=list(company_size_mapping.keys())
            )
        else:
            st.warning("CompanySize column not found in model features. Cannot select company size.")
            user_input_data['CompanySize_friendly'] = None

        if 'TeamSize' in numeric_training_columns:
            team_min = X_train_processed_ensemble['TeamSize'].min()
            team_max = X_train_processed_ensemble['TeamSize'].max()
            min_team_size = int(team_min) if not np.isnan(team_min) else 0
            max_team_size = int(team_max) if not np.isnan(team_max) else 1000
            user_input_data['TeamSize'] = st.number_input(
                "Team Size",
                min_value=min_team_size,
                max_value=max_team_size,
                # Same clamping as above so the default never falls below the minimum.
                value=min(max(10, min_team_size), max_team_size),
                step=1
            )
        else:
            st.warning("TeamSize column not found or is not numerical in training data. Cannot select team size.")
            user_input_data['TeamSize'] = None

    with col2:
        managestaff_options = get_ohe_options('ManageStaff_', model_features)
        if managestaff_options:
            user_input_data['Manages Staff'] = st.selectbox("Manages Staff", managestaff_options) # Use friendly name
        else:
            st.warning("Manage Staff options not found in model features. Please check data preprocessing.")
            user_input_data['Manages Staff'] = None

        # 'HasCertifications' is a single model column, so a fixed Yes/No choice is offered.
        if 'HasCertifications' in model_features:
            user_input_data['HasCertifications'] = st.selectbox("Has Certifications", options=['Yes', 'No'])
        else:
            st.warning("HasCertifications column not found in model features. Cannot select certifications.")
            user_input_data['HasCertifications'] = None

def _render_ohe_selectbox(field_key, ohe_prefix, missing_message):
    """Show a selectbox for one one-hot encoded family and record the choice.

    `field_key` is both the widget label and the key written to user_input_data.
    When the model has no columns with `ohe_prefix`, `missing_message` is shown
    as a warning and None is recorded instead.
    """
    choices = get_ohe_options(ohe_prefix, model_features)
    if choices:
        user_input_data[field_key] = st.selectbox(field_key, choices)
    else:
        st.warning(missing_message)
        user_input_data[field_key] = None


# Section 3: Additional Features
with st.container():
    st.subheader("📝 Additional Information")

    _render_ohe_selectbox(
        'Is Education Computer Related?', 'EducationIsComputerRelated_',
        "EducationIsComputerRelated options not found in model features. Please check data preprocessing.",
    )
    _render_ohe_selectbox(
        'Employment Sector', 'EmploymentSector_',
        "Employment Sector options not found in model features. Please check data preprocessing.",
    )
    # 'EmploymentStatus' is assumed to be one-hot encoded as well.
    _render_ohe_selectbox(
        'Employment Status', 'EmploymentStatus_',
        "Employment Status options not found in model features. Please check data preprocessing.",
    )
    _render_ohe_selectbox(
        'Your Job Title', 'JobTitle_',
        "Job Title options not found in model features. Please check data preprocessing.",
    )


# Career Outlook
with st.container():
    st.subheader("🎯 Career Outlook")
    outlook_left, outlook_right = st.columns(2)

    with outlook_left:
        _render_ohe_selectbox(
            'Are you Looking for Another Job?', 'LookingForAnotherJob_',
            "Looking For Another Job options not found in model features. Please check data preprocessing.",
        )

    with outlook_right:
        _render_ohe_selectbox(
            'Your Career Plans This Year', 'CareerPlansThisYear_',
            "Career Plans This Year options not found in model features. Please check data preprocessing.",
        )

# Free-text areas for the job description.
# The model consumes PCA components derived from text, not the raw text itself,
# and this app has no live NLP processing step. The two text areas below are
# therefore context for the user only: their values are not stored in
# user_input_data and do not reach the model. The PCA sliders further down are
# what actually influence the prediction.
with st.container():
    st.subheader("📝 Job Responsibilities & Tasks (For Context - Adjust Sliders Below)")
    st.write("Provide text descriptions for context, but adjust the sliders below to directly influence the NLP-derived factors used in the model.")
    # Explicit keys keep the two otherwise similar text areas distinct.
    st.text_area(
        "Describe Your Job Responsibilities & Tasks",
        help="This text area is for your reference and does not directly feed into the current prediction model in this demo. Adjust the 'Advanced Text-Derived Factors' sliders below.",
        key='other_job_duties_text_area'
    )
    st.text_area(
        "Describe the Kinds of Tasks You Perform",
        help="This text area is for your reference and does not directly feed into the current prediction model in this demo. Adjust the 'Advanced Text-Derived Factors' sliders below.",
        key='kinds_of_tasks_performed_text_area'
    )


# Sliders for Advanced Text-Derived Factors (PCA NLP features)
st.subheader("✨ Advanced Text-Derived Factors")
st.info("These factors are automatically generated from text descriptions of job duties and tasks provided in the original data. While not directly interpretable, they capture nuances that influence salary. Adjust them to see their hypothetical impact.")

col_nlp_count = 5 # Number of columns for sliders
nlp_cols = st.columns(col_nlp_count)
# Numeric training columns; only these can provide a data-driven slider range.
numeric_training_columns = X_train_processed_ensemble.select_dtypes(include=np.number).columns

for i, pca_col in enumerate(pca_nlp_features_ordered):
    if pca_col not in model_features:
        continue

    if pca_col in numeric_training_columns:
        factor_values = X_train_processed_ensemble[pca_col]
        lowest, highest, average = factor_values.min(), factor_values.max(), factor_values.mean()
        # NaN statistics (e.g. an all-NaN column) fall back to a 0..1 range centred at 0.
        min_val = 0.0 if np.isnan(lowest) else float(lowest)
        max_val = 1.0 if np.isnan(highest) else float(highest)
        mean_val = 0.0 if np.isnan(average) else float(average)
    else:
        # Non-numeric column: use the default range and tell the user.
        min_val, max_val, mean_val = 0.0, 1.0, 0.0
        st.warning(f"PCA NLP feature '{pca_col}' not found or is not numerical in training data. Defaulting slider range.")

    # Lay the sliders out round-robin across the columns.
    with nlp_cols[i % col_nlp_count]:
        # Generic label when one is mapped, otherwise a numbered fallback.
        label_to_display = nlp_label_map.get(pca_col, f"Text Factor {i + 1}")
        # The key must be unique per slider.
        user_input_data[pca_col] = st.slider(label_to_display, min_val, max_val, mean_val, key=f'nlp_slider_{pca_col}', help="This slider represents an abstract factor derived from job descriptions and tasks. Adjust it to see its hypothetical impact on salary.")

st.markdown("---")

apply_adjustment = st.checkbox("Apply Fairness Adjustment (based on Gender)", value=True, help="Applying this adjustment may modify the prediction to help mitigate potential bias related to gender, based on patterns observed in the training data.")

# --- Prediction Button ---
if st.button("Predict Salary"):
    with st.spinner("Calculating..."):
        try:
            # Preprocessing runs inside the try block so that a failure while
            # building the model input is reported through the same friendly
            # error path as a failure in predict().
            user_input_df = preprocess_user_input(user_input_data, model_features, X_train_processed_ensemble, imputer_for_predict_fn, numerical_features_for_imputation)

            raw_prediction = voting_regressor.predict(user_input_df)[0]

            st.subheader("Results")
            st.success(f"Predicted Salary: ${raw_prediction:,.2f}")

            # Fairness adjustment: an additive offset keyed by the original
            # (uncleaned) gender column name.
            if apply_adjustment and adjustment_factors_by_sensitive_attribute:
                # The selectbox holds the cleaned display label of the gender.
                selected_gender_cleaned = user_input_data.get('Gender')

                # Map the display label back to the first 'Gender_*' model column
                # whose cleaned name matches it (None when nothing matches).
                original_gender_col = None
                if selected_gender_cleaned:
                    original_gender_col = next(
                        (
                            col for col in model_features
                            if col.startswith('Gender_') and clean_feature_name(col) == selected_gender_cleaned
                        ),
                        None,
                    )

                if original_gender_col and original_gender_col in adjustment_factors_by_sensitive_attribute:
                    adjustment = adjustment_factors_by_sensitive_attribute[original_gender_col]
                    adjusted_prediction = raw_prediction + adjustment
                    st.info(f"Fairness Adjusted Salary: ${adjusted_prediction:,.2f}")
                else:
                    st.warning(f"Adjustment factors for selected gender '{selected_gender_cleaned}' not found. No fairness adjustment applied.")
            elif apply_adjustment and not adjustment_factors_by_sensitive_attribute:
                st.warning("Fairness adjustment factors could not be loaded. No adjustment applied.")

        except Exception as e:
            st.error(f"An error occurred during prediction: {e}")
            st.error("Please check the input values and ensure the model and data are correctly loaded.")

st.markdown("---")
st.caption("Note: This is a predictive estimate based on available data")

Overwriting pages/1_Predict_Your_Salary.py


In [61]:
%%writefile pages/2_Explore_Pay_Gaps_&_Trends.py
# -*- coding: utf-8 -*-
import streamlit as st
import pandas as pd
import plotly.express as px
import os
import re

# Page heading and one-line description.
st.header("Explore Pay Gaps & Trends 📊")
st.markdown("Visualize salary trends and potential pay gaps based on the dataset.")

# Absolute Google Drive locations of the processed training features and the
# matching salary (target) file; they only resolve after Drive is mounted.
TRAIN_DATA_PATH = '/content/drive/MyDrive/Datasets/X_train_processed_ensemble.csv'
SALARY_DATA_PATH = '/content/drive/MyDrive/Datasets/y_train_processed_ensemble.csv'

# Manual refresh: drop everything held by st.cache_data and rerun the script so
# the cached loader below re-reads the CSV files.
if st.button("Clear Cache and Reload Data"):
    st.cache_data.clear()
    st.rerun()

_UNKNOWN_LABEL = 'Unknown'


def _read_csv_without_index(path):
    """Read a CSV, drop a stray 'Unnamed: 0' index column if present, and renumber rows from 0."""
    frame = pd.read_csv(path)
    return frame.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)


def _clean_col_name_for_display(col_name):
    """Clean a feature-name fragment for display in plots.

    Underscores become spaces and the text is title-cased; a handful of known
    run-together field names are then expanded into readable phrases.
    """
    cleaned = col_name.replace('_', ' ').title()
    # Specific replacements for common patterns
    cleaned = cleaned.replace('Pca Nlp ', 'Text Factor ')
    cleaned = cleaned.replace('Continent Name ', 'Continent ')
    cleaned = cleaned.replace('Managestaff ', 'Manages Staff ')
    cleaned = cleaned.replace('Educationiscomputerrelated ', 'Education Computer Related ')
    cleaned = cleaned.replace('Employmentsector ', 'Employment Sector ')
    cleaned = cleaned.replace('Lookingforanotherjob ', 'Looking For Another Job ')
    cleaned = cleaned.replace('Careerplansthisyear ', 'Career Plans This Year ')
    cleaned = cleaned.replace('Jobtitle ', 'Job Title ')
    cleaned = cleaned.replace('1  This Is The Only Company Where I Ve Had This Kind Of Position ', ' (1st Company in this Role)')

    return cleaned.strip()  # Remove leading/trailing spaces


def _decode_one_hot(df, prefix):
    """Collapse the one-hot columns sharing ``prefix`` into one Series of display labels.

    The prefix is matched case-insensitively, so 'Gender_Male' and 'gender_male'
    are both recognised. Rows in which no indicator is positive are labelled
    'Unknown' rather than being attributed to the first column, which is what a
    bare ``idxmax`` would do for an all-zero row.

    Returns None when no column carries the prefix.

    NOTE(review): every column sharing the prefix is treated as an indicator;
    confirm that no unrelated numeric feature starts with the same prefix.
    """
    prefix_lower = prefix.lower()
    indicator_cols = [col for col in df.columns if col.lower().startswith(prefix_lower)]
    if not indicator_cols:
        return None

    # Missing indicator values are treated as "not set".
    indicators = df[indicator_cols].fillna(0)
    labels = indicators.idxmax(axis=1).map(
        lambda name: _clean_col_name_for_display(name[len(prefix):])
    )
    has_active_indicator = indicators.max(axis=1) > 0
    return labels.where(has_active_indicator, _UNKNOWN_LABEL)


def _add_top_n_category(plot_df, df, prefix, column, n=10):
    """Add ``column`` (decoded labels) and ``<column>_Plot`` (top-``n`` labels, rest 'Other').

    When no one-hot columns exist for ``prefix`` only ``<column>_Plot`` is added,
    filled with 'Unknown'. Rows whose category could not be determined are never
    ranked among the top ``n``; they fall into 'Other'.
    """
    plot_col = f'{column}_Plot'
    labels = _decode_one_hot(df, prefix)
    if labels is None:
        plot_df[plot_col] = _UNKNOWN_LABEL
        return

    plot_df[column] = labels
    known_labels = labels[labels != _UNKNOWN_LABEL]
    top_labels = known_labels.value_counts().nlargest(n).index
    plot_df[plot_col] = labels.where(labels.isin(top_labels), 'Other')


@st.cache_data
def load_and_prepare_data(train_path, salary_path):
    """Loads and prepares the data for visualization.

    Reads the processed feature CSV and the salary CSV, joins them row by row,
    and derives one readable categorical column per one-hot encoded attribute.

    Args:
        train_path: Path to the processed training-features CSV.
        salary_path: Path to the salary CSV; it must contain a column whose name
            includes both 'salary' and 'usd' (case-insensitive).

    Returns:
        A DataFrame with 'salaryusd', 'Gender', 'Country_Plot', 'Continent',
        'Years_Experience', 'JobTitle_Plot', 'EmploymentStatus' and
        'EmploymentSector' (plus 'Country' / 'JobTitle' when their one-hot
        columns exist), or None when a file is missing, no salary column is
        found, or any step raises. Failures are reported through ``st.error``.

    NOTE(review): no 'Education' column is produced here, although the page's
    education chart looks for one, so that chart currently always shows its
    warning. TODO: confirm the source column naming and add it.
    """
    if not os.path.exists(train_path):
        st.error(f"Training data file not found at: {train_path}")
        return None
    if not os.path.exists(salary_path):
        st.error(f"Salary data file not found at: {salary_path}")
        return None

    try:
        df_train = _read_csv_without_index(train_path)
        df_salary = _read_csv_without_index(salary_path)

        # The two files are joined purely by row position. If their lengths
        # differ, both are silently truncated to the shorter one.
        min_rows = min(df_train.shape[0], df_salary.shape[0])
        df_train = df_train.iloc[:min_rows].reset_index(drop=True)
        df_salary = df_salary.iloc[:min_rows].reset_index(drop=True)

        df = pd.concat([df_train, df_salary], axis=1)

        # Cleaning column names for consistency: anything outside [A-Za-z0-9_] becomes '_'.
        df.columns = [re.sub(r'[^A-Za-z0-9_]', '_', str(name)) for name in df.columns]
        df = df.loc[:, ~df.columns.duplicated(keep='first')]  # Drop duplicates if any

        # Locate the salary column and give it a fixed name.
        salary_col = next(
            (col for col in df.columns if 'salary' in col.lower() and 'usd' in col.lower()),
            None,
        )
        if salary_col is None:
            st.error("Could not find a column containing 'salary' and 'usd'. Please ensure the salary column exists and is named appropriately.")
            return None

        df = df.rename(columns={salary_col: 'salaryusd'})

        # Dropping rows with missing salary values
        df = df.dropna(subset=['salaryusd'])

        def decoded_or_unknown(prefix):
            """Decoded labels for ``prefix``, or the scalar 'Unknown' when absent."""
            labels = _decode_one_hot(df, prefix)
            return _UNKNOWN_LABEL if labels is None else labels

        plot_df = pd.DataFrame(index=df.index)
        plot_df['salaryusd'] = df['salaryusd']

        plot_df['Gender'] = decoded_or_unknown('gender_')

        # Country (Top N for visualization)
        _add_top_n_category(plot_df, df, 'country_', 'Country')

        plot_df['Continent'] = decoded_or_unknown('continent_name_')

        # Years of Experience (numeric column, looked up case-insensitively)
        years_col = next(
            (col for col in df.columns if col.lower() == 'yearsofexperience'),
            None,
        )
        plot_df['Years_Experience'] = df[years_col] if years_col is not None else 0  # Default if not available

        # Job Title (Top N for visualization)
        _add_top_n_category(plot_df, df, 'jobtitle_', 'JobTitle')

        plot_df['EmploymentStatus'] = decoded_or_unknown('employmentstatus_')
        plot_df['EmploymentSector'] = decoded_or_unknown('employmentsector_')

        return plot_df

    except Exception as e:
        st.error(f"An error occurred during data loading and preparation: {e}")
        return None

def _render_avg_salary_bar(data, column, heading, axis_label, empty_msg, missing_msg,
                           exclude_value=None, keep_category_order=False):
    """Render one "average salary by <column>" section.

    Writes a subheader, then either a bar chart of the mean 'salaryusd' per
    value of ``column`` or a warning explaining why nothing can be drawn.

    Args:
        data: Prepared DataFrame containing 'salaryusd'.
        column: Grouping column.
        heading: Text used for both the subheader and the chart title.
        axis_label: Axis label shown for ``column``.
        empty_msg: Warning shown when the grouped result has no rows.
        missing_msg: Warning shown when ``column`` is absent or ``data`` has no rows.
        exclude_value: Optional category value to leave out (e.g. 'Other').
        keep_category_order: When True and ``column`` is categorical, show every
            declared category in its declared order.
    """
    st.subheader(heading)

    if column not in data.columns or data[column].empty:
        st.warning(missing_msg)
        return

    rows = data if exclude_value is None else data[data[column] != exclude_value]
    averages = rows.groupby(column)['salaryusd'].mean().reset_index()

    # isinstance() on the dtype replaces the deprecated pd.api.types.is_categorical_dtype.
    if keep_category_order and isinstance(data[column].dtype, pd.CategoricalDtype):
        # reindex() may take its axis name from the (unnamed) categories Index;
        # restore it so reset_index() yields a column called `column` again.
        averages = (
            averages.set_index(column)
            .reindex(data[column].cat.categories)
            .rename_axis(column)
            .reset_index()
        )

    if averages.empty:
        st.warning(empty_msg)
        return

    fig = px.bar(averages, x=column, y='salaryusd',
                 labels={column: axis_label, 'salaryusd': 'Average Salary (USD)'},
                 title=heading)
    st.plotly_chart(fig)


df_explore = load_and_prepare_data(TRAIN_DATA_PATH, SALARY_DATA_PATH)

# Nothing can be plotted without data; the loader has already reported the reason.
if df_explore is None:
    st.stop()

# --- Visualizations ---

_render_avg_salary_bar(
    df_explore, 'Gender', "Average Salary by Gender", 'Gender',
    empty_msg="No gender data available for plotting.",
    missing_msg="Gender column not found or is empty after data preparation.",
)

_render_avg_salary_bar(
    df_explore, 'Country_Plot', "Average Salary by Top 10 Countries", 'Country',
    empty_msg="No data available for top countries.",
    missing_msg="Country_Plot column not found or is empty after data preparation.",
    exclude_value='Other',
)

_render_avg_salary_bar(
    df_explore, 'Continent', "Average Salary by Continent", 'Continent',
    empty_msg="No data available for continents.",
    missing_msg="Continent column not found or is empty after data preparation.",
)

st.subheader("Salary vs. Years of Experience")
if 'Years_Experience' in df_explore.columns and not df_explore['Years_Experience'].empty:
    # Only request hover columns that are actually present in the prepared frame.
    hover_columns = [col for col in ('JobTitle_Plot', 'Country_Plot') if col in df_explore.columns]
    fig_experience = px.scatter(df_explore, x='Years_Experience', y='salaryusd',
                                labels={'Years_Experience': 'Years in This Job Type', 'salaryusd': 'Salary (USD)'},
                                title='Salary vs. Years of Experience',
                                hover_data=hover_columns)
    st.plotly_chart(fig_experience)
else:
    st.warning("Years of Experience data not available for visualization.")

_render_avg_salary_bar(
    df_explore, 'Education', "Average Salary by Highest Education Level", 'Highest Education Level',
    empty_msg="No data available for education levels.",
    missing_msg="Education column not found or is empty after data preparation.",
    keep_category_order=True,
)

_render_avg_salary_bar(
    df_explore, 'EmploymentStatus', "Average Salary by Employment Status", 'Employment Status',
    empty_msg="No data available for employment status.",
    missing_msg="EmploymentStatus column not found or is empty after data preparation.",
)

_render_avg_salary_bar(
    df_explore, 'EmploymentSector', "Average Salary by Employment Sector", 'Employment Sector',
    empty_msg="No data available for employment sectors.",
    missing_msg="Employment Sector data not available for visualization.",
)

_render_avg_salary_bar(
    df_explore, 'JobTitle_Plot', "Average Salary by Top 10 Job Titles", 'Job Title',
    empty_msg="No data available for top job titles.",
    missing_msg="JobTitle_Plot column not found or is empty after data preparation.",
    exclude_value='Other',
)


# Closing section: guidance on how to read the charts above.
# A blank line separates the bullet list from the final paragraph; without it
# Markdown treats that paragraph as a continuation of the last bullet.
st.markdown("---")
st.subheader("Interpreting Pay Gaps")
st.write("""
The visualizations above show average salaries across different demographic and professional groups present in the dataset. Apparent differences in average salaries may indicate potential pay gaps.

**Important Considerations:**
- **Correlation vs. Causation**: These charts show correlations in the data, not necessarily direct causal relationships. Many factors influence salary.
- **Intersectionality**: Pay gaps are often intersectional (e.g., the experience of women of color may differ from white women). These simple visualizations may not capture such complexities.
- **Data Limitations**: The dataset's composition and potential biases can influence the observed trends.
- **Controlling for Factors**: A rigorous analysis of pay gaps requires controlling for relevant factors (like experience, education, location, job title) to isolate the effect of sensitive attributes. Our prediction model attempts to do this, and the 'Fairness Adjustment' on the 'Predict Your Salary' page is one approach to mitigate observed disparities.

These visualizations are intended to provide a high-level overview of trends and potential areas of concern within the dataset. For a deeper understanding and individual assessment, consider the prediction and explanation provided on the 'Predict Your Salary' page.
""")

Overwriting pages/2_Explore_Pay_Gaps_&_Trends.py


In [9]:
%%writefile pages/3_About_Our_Model.py
# -*- coding: utf-8 -*-
import streamlit as st

st.header("About Our Model 🤖")
st.markdown("Learn about the methodology, data sources, and limitations of our salary prediction model.")

# Each entry is (section title, Streamlit call used to render the body, body text).
# Sections are rendered top to bottom in this order.
ABOUT_SECTIONS = (
    ("Methodology", st.write, """
Our salary prediction tool is powered by a robust machine learning model. Specifically, we utilize an **ensemble method** called a **Voting Regressor**. This ensemble combines the strengths of multiple individual models to produce a more accurate and stable prediction.

The base models contributing to our Voting Regressor are:
- **Ridge Regression**: A linear model that handles multicollinearity and prevents overfitting by adding a penalty term.
- **LightGBM Regressor**: A gradient boosting framework that uses tree-based learning algorithms. It's known for its speed and efficiency, especially on large datasets, while maintaining high accuracy.

By combining these diverse models, the Voting Regressor leverages their collective intelligence, often leading to better performance than any single model alone.
"""),
    ("Data Sources", st.write, """
The model is trained on a comprehensive dataset compiled from various publicly available sources, primarily:
- **Kaggle Datasets**: We've utilized anonymized salary survey data from Kaggle, which includes a wide range of professional and demographic information.
- **World Bank API**: Economic indicators such as GDP per capita were integrated from the World Bank to enrich the dataset with macroeconomic context.

We continuously strive to update and expand our data sources to improve the model's accuracy and representativeness.
"""),
    ("Natural Language Processing (NLP) Features", st.write, """
To capture the nuanced impact of job responsibilities and daily tasks on salary, our model analyzes your free-form text descriptions provided for 'OtherJobDuties' and 'KindsOfTasksPerformed'. We transform these descriptions into numerical 'text features' using techniques like **TF-IDF** (Term Frequency-Inverse Document Frequency) and **Principal Component Analysis (PCA)**.

- **TF-IDF**: This technique assigns a numerical value to words based on how frequently they appear in a document relative to the entire dataset, helping to identify important terms.
- **PCA**: After converting text to numerical representations with TF-IDF, PCA is applied. PCA is a dimensionality reduction technique that helps us identify the most significant underlying patterns and dimensions within the vast amount of text data. This allows our model to understand aspects like job complexity, leadership focus, or technical specialization, which are crucial for a more accurate salary estimate.

These NLP-derived features, while not directly interpretable as individual words, capture valuable contextual information from your job description.
"""),
    ("Limitations", st.warning, """
It's crucial to understand the limitations of any predictive model, including ours:
- **Estimates, Not Guarantees**: The predictions provided are statistical estimates based on historical data. Actual job offers and salaries can vary widely due to negotiation skills, specific company policies, market demand at a given time, and other unique factors not included here.
- **Data Reliance**: The model's accuracy is dependent on the quality, recency, and representativeness of the training data. Publicly available data may not capture all private or negotiated compensation nuances.
- **Bias Detection and Mitigation**: While we actively work to identify and mitigate biases (e.g., related to gender, country, continent), these are ongoing efforts. The model may still reflect existing societal biases present in the training data. Results should be interpreted as insights, not definitive truths.
- **Statistical Tool**: This model is a statistical tool; individual situations are unique and cannot be fully captured by a generalized model.
"""),
    ("Our Commitment to Fairness", st.write, """
We are committed to building fair and transparent AI systems. Our process includes:
- **Identifying Sensitive Attributes**: We explicitly analyze features like Gender, Country, and Continent for potential biases.
- **Quantifying Bias**: We measure disparities in predictions across different demographic groups using various fairness metrics.
- **Implementing Mitigation Strategies**: We've explored and implemented techniques like post-processing output adjustment (e.g., based on Gender) to reduce observed biases in predictions.
- **Transparency**: We aim to be transparent about our methods and the limitations of our model, encouraging users to interpret results critically.

We believe that understanding and addressing bias is a continuous journey, and we welcome feedback to improve our model's fairness and accuracy.
"""),
    ("Contact & Feedback", st.write, """
We value your feedback! If you have any questions, suggestions, or encounter issues, please feel free to reach out.
"""),
)

for section_title, render_body, section_body in ABOUT_SECTIONS:
    st.subheader(section_title)
    render_body(section_body)

# Contact address, rendered directly beneath the "Contact & Feedback" section.
st.markdown("Email: [support@globalpayinsight.com](mailto:support@globalpayinsight.com)")

Overwriting pages/3_About_Our_Model.py


In [10]:
# Getting the public IP address of the Colab runtime by querying the ifconfig.me service.
# NOTE(review): presumably this IP is what the localtunnel landing page asks for
# before forwarding to the app started in the next cell — confirm.
!curl ifconfig.me

34.168.51.195

In [None]:
# Start the Streamlit app in the background (`&`) and, in the same shell,
# open a localtunnel tunnel to port 8501, the port Streamlit reports serving on.
!streamlit run app.py & npx localtunnel --port 8501

[1G[0K⠙
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.168.51.195:8501[0m
[0m
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0Kyour url is: https://public-squids-wonder.loca.lt
