# Clean Salary Data

## Glassdoor salary cleaner

In [17]:
import json

def round_to_decade(value: float) -> float:
    """Return ``value`` snapped to the nearest multiple of ten, as a float.

    Exact halves follow the built-in ``round`` (ties go to the even
    multiple), so 25 becomes 20.0 while 35 becomes 40.0.
    """
    tens = round(value / 10)
    return float(tens * 10)

# Raw Glassdoor titles mapped to the normalised names used in the cleaned output.
_GLASSDOOR_POSITION_RENAMES = {
    "Back End Developer/ Engineer": "Backend Engineer",
    "Front End Developer / Engineer": "Frontend Engineer",
}


def _yearly_to_monthly(amount):
    """Convert a yearly amount to a monthly one rounded to the closest decade.

    ``None`` (amount unknown) is passed through unchanged; ``0`` is a real
    value and is converted like any other number.
    """
    if amount is None:
        return None
    return round_to_decade(amount / 12)


def _clean_job_entry(entry):
    """Build the cleaned, nested record for one raw Glassdoor entry."""
    # --- Position: null -> default title, known raw titles -> normalised ---
    position = entry.get("position")
    if position is None:
        position = "AI Engineer"
    elif isinstance(position, str):
        position = _GLASSDOOR_POSITION_RENAMES.get(position, position)

    # --- Period detection ---
    # `or ""` also covers a key that is present but null; a plain
    # .get("salary", "") would hand None to the `in` test and raise TypeError.
    salary_text = entry.get("salary") or ""
    is_monthly = "mnd" in salary_text

    min_amount = entry.get("min_amount")
    avg_amount = entry.get("avg_amount")
    max_amount = entry.get("max_amount")

    if not is_monthly:
        # Yearly figures: divide by 12 and round to the closest decade.
        min_amount = _yearly_to_monthly(min_amount)
        avg_amount = _yearly_to_monthly(avg_amount)
        max_amount = _yearly_to_monthly(max_amount)

    return {
        "position": position,
        "seniority": entry.get("seniority"),
        "compensation": {
            "currency": "EUR",
            # Every record leaves this function expressed per month.
            "period": "monthly",
            "min_amount": min_amount,
            "avg_amount": avg_amount,
            "max_amount": max_amount,
        },
        "source_url": entry.get("source_url"),
        "source_site": entry.get("source_site"),
    }


def clean_job_data(
    json_file_path="raw_data/data_dict.json",
    output_file_path="raw_data/data_dict_clean.json"
):
    """
    Clean the raw Glassdoor job records and write them to a new JSON file.

    For every record in the input list this:
    1. Replaces a null/missing 'position' with 'AI Engineer'.
    2. Renames positions:
       - 'Back End Developer/ Engineer' -> 'Backend Engineer'
       - 'Front End Developer / Engineer' -> 'Frontend Engineer'
    3. Drops the old 'salary' text field (it is only used to detect the period).
    4. Restructures the record into:
       {
         "position": ...,
         "seniority": ...,
         "compensation": {
            "currency": "EUR",
            "period": "monthly",
            "min_amount": ...,
            "avg_amount": ...,
            "max_amount": ...
         },
         "source_url": ...,
         "source_site": ...
       }
    5. Converts yearly amounts to monthly by dividing by 12 and rounding each
       amount to the closest decade.

    A record counts as monthly when its 'salary' text contains "mnd"; anything
    else, including a missing or null 'salary', is treated as yearly. Monthly
    amounts are copied through as-is (not rounded). Amounts that are None stay
    None.

    Args:
        json_file_path: Path of the raw JSON file (a list of record dicts).
        output_file_path: Path the cleaned JSON list is written to.

    Returns:
        The list of cleaned record dicts that was written to disk.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    cleaned = [_clean_job_entry(entry) for entry in data]

    # Save updated JSON
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, indent=4, ensure_ascii=False)

    print(f"Cleaned data saved to {output_file_path}")
    return cleaned


In [18]:
# Run the Glassdoor cleaner with its default paths
# (raw_data/data_dict.json -> raw_data/data_dict_clean.json) and keep the
# returned list of cleaned records.
cleaned_data = clean_job_data()

Cleaned data saved to raw_data/data_dict_clean.json


## Payscale salary cleaner

In [13]:
import json

def round_to_decade(value: float) -> float:
    """Round ``value`` to the closest multiple of 10 and return it as a float.

    Ties are resolved by the built-in ``round`` (half goes to the even
    multiple).

    NOTE(review): this duplicates the identically named helper in the
    Glassdoor cell; consider defining it once.
    """
    return float(10 * round(value / 10))

def normalize_payscale_salaries(
    json_file_path="raw_data/payscale_tech_salaries.json",
    output_file_path="raw_data/ps_normalised_salaries.json"
):
    """
    Normalize Payscale salary records and write them to a new JSON file.

    For every record in the input list this:
    1. Renames positions:
       - 'Back End Developer/ Engineer' -> 'Backend Engineer'
       - 'Front End Developer / Engineer' -> 'Frontend Engineer'
    2. Converts yearly compensation into monthly:
       - Divide by 12
       - Round to closest decade
       - Change period to 'monthly'

    Records are updated in place. A record whose 'compensation' is missing,
    null, or not a dict is left untouched apart from the position rename, and
    amounts that are None stay None.

    Args:
        json_file_path: Path of the raw Payscale JSON file (a list of dicts).
        output_file_path: Path the normalized JSON list is written to.

    Returns:
        The list of (mutated) record dicts that was written to disk.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    position_renames = {
        "Back End Developer/ Engineer": "Backend Engineer",
        "Front End Developer / Engineer": "Frontend Engineer",
    }
    amount_keys = ("min_amount", "avg_amount", "max_amount")

    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for entry in data:
        # --- Rename positions ---
        position = entry.get("position")
        if isinstance(position, str) and position in position_renames:
            entry["position"] = position_renames[position]

        # --- Compensation conversion ---
        # The isinstance guard covers "compensation": null, for which
        # .get("compensation", {}) would return None and break on .get().
        comp = entry.get("compensation")
        if isinstance(comp, dict) and comp.get("period") == "yearly":
            for key in amount_keys:
                if comp.get(key) is not None:
                    comp[key] = round_to_decade(comp[key] / 12)
            comp["period"] = "monthly"

    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Normalized data saved to {output_file_path}")
    return data


In [15]:
# Run the Payscale normalizer with its default paths
# (raw_data/payscale_tech_salaries.json -> raw_data/ps_normalised_salaries.json)
# and keep the returned list of records.
ps_cleaned_data = normalize_payscale_salaries()

Normalized data saved to raw_data/ps_normalised_salaries.json
