In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# 1. Load the Dataset
# Read from the raw.githubusercontent.com address: the regular
# github.com/.../blob/... page URL serves an HTML page, not the CSV itself.
# To work from a local copy instead, point DATA_URL at e.g. 'top_jobs.csv'.
DATA_URL = 'https://raw.githubusercontent.com/dpm24800/top-jobs-scraper/main/data/top_jobs.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# 2. BASIC DATA INSPECTION
# Quick structural overview: size, dtypes, missing values and duplicates.
print("Dataset Shape:", df.shape)
print("\nData Types:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nDuplicate Rows:", df.duplicated().sum())

In [None]:
# Normalise the header names to snake_case: trim surrounding whitespace,
# lower-case everything, then turn inner spaces into underscores.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(" ", "_")
df.head()

In [None]:
def encode_experience(value):
    """Convert a free-text experience requirement into a number of years.

    Matching is case-insensitive and ignores surrounding whitespace:

    * contains "not required"      -> 0.0
    * contains "less than"         -> 0.5 (the number in the text is not used)
    * contains "more than" and N   -> N + 0.5 (this also covers
      "more than or equal to N", because this check runs before "equal")
    * contains "equal" and N       -> N
    * anything else                -> None

    Parameters
    ----------
    value : str, number or missing
        Raw cell from the experience column. Non-string, non-missing values
        are interpreted as an already-numeric count of years.

    Returns
    -------
    float or None
        Encoded years, or None for missing / unrecognised input.
    """
    if pd.isna(value):
        return None

    # A numeric cell has no .strip(); treat it as a plain year count
    # instead of raising AttributeError.
    if not isinstance(value, str):
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    text = value.strip().lower()

    # Not Required
    if "not required" in text:
        return 0.0

    # Less than 1 year
    if "less than" in text:
        return 0.5

    # Extract the first number, keeping a decimal part if present so that
    # "1.5 years" is read as 1.5 rather than truncated to 1.
    match = re.search(r"\d+(?:\.\d+)?", text)
    if match:
        years = float(match.group())

        # More than X years -> nudge above X so it sorts after "equal to X"
        if "more than" in text:
            return years + 0.5

        # Equal to X year(s)
        if "equal" in text:
            return years

    return None  # fallback for unexpected values

# Encode every raw experience string and store the result in a new column,
# leaving the original "experience" text untouched.
experience_years = df["experience"].apply(encode_experience)
df["experience_e"] = experience_years
df.head()

In [None]:
def encode_level(value):
    """Map a job-level label onto an ordinal rank.

    The label is trimmed and lower-cased, then searched for the keywords
    below in this order; the first keyword found decides the rank:

        "entry" -> 1, "mid" -> 2, "senior" -> 3, "top" -> 4

    Returns None for missing values and for labels containing none of the
    keywords.
    """
    if pd.isna(value):
        return None

    label = value.strip().lower()

    # Ordered (keyword, rank) pairs; order matters when a label contains
    # more than one keyword.
    keyword_ranks = (
        ("entry", 1),
        ("mid", 2),
        ("senior", 3),
        ("top", 4),
    )
    for keyword, rank in keyword_ranks:
        if keyword in label:
            return rank

    return None

# Derive the ordinal seniority column from the raw "level" text
level_codes = df["level"].apply(encode_level)
df["level_e"] = level_codes
df.head()

In [None]:
# Parse the deadline text into datetimes, then expose its calendar parts
# as separate year / month / day columns.
deadline = pd.to_datetime(df["deadline"])
df["deadline"] = deadline

for part in ("year", "month", "day"):
    df[part] = getattr(deadline.dt, part)
df.head()

In [None]:
def parse_salary(value):
    """Parse a free-text salary into monthly (min, max, average) figures.

    Rules:

    * Missing values, text containing "not disclosed", and text without any
      number yield three NaNs.
    * Every number in the text is extracted; the smallest is the minimum and
      the largest the maximum (a single number is used for both).
    * If the text contains "yearly", both bounds are divided by 12.
    * The average is the midpoint of the two bounds.

    Parameters
    ----------
    value : str, number or missing
        Raw cell from the salary column. Non-string values are converted
        with ``str()`` before parsing.

    Returns
    -------
    pandas.Series
        Three floats in the order [min_salary, max_salary, avg_salary].
    """
    if pd.isna(value):
        return pd.Series([np.nan, np.nan, np.nan])

    # str() so a numeric cell is parsed instead of failing on .lower()
    text = str(value).lower().strip()

    if "not disclosed" in text:
        return pd.Series([np.nan, np.nan, np.nan])

    # Check if yearly
    is_yearly = "yearly" in text

    # Extract all numbers. Each match is a full run of digits with optional
    # comma groups of any width and an optional decimal part, so "50000",
    # "50,000" and lakh-style "1,50,000" are each read as ONE number rather
    # than being split into fragments.
    tokens = re.findall(r"\d+(?:,\d+)*(?:\.\d+)?", text)
    amounts = [float(token.replace(",", "")) for token in tokens]

    if not amounts:
        return pd.Series([np.nan, np.nan, np.nan])

    # min/max also cover the single-number case (both bounds are equal)
    min_salary, max_salary = min(amounts), max(amounts)

    # Convert yearly to monthly
    if is_yearly:
        min_salary /= 12
        max_salary /= 12

    avg_salary = (min_salary + max_salary) / 2

    return pd.Series([min_salary, max_salary, avg_salary])


# Parse each salary string and spread the three resulting figures across
# dedicated min / max / average columns.
salary_columns = ["salary_min", "salary_max", "salary_avg"]
salary_parts = df["salary"].apply(parse_salary)
df[salary_columns] = salary_parts
df.head()

In [None]:
# Build the export frame: drop the raw text/date columns (skipping any that
# are absent), then let the encoded columns take over the original names.
raw_columns = ["experience", "level", "salary", "deadline"]
encoded_names = {"experience_e": "experience", "level_e": "level"}

df_numeric = (
    df
    .drop(columns=raw_columns, errors="ignore")
    .rename(columns=encoded_names)
)

df_numeric.head()

In [None]:
# 6. Export Cleaned Data
# Write the cleaned frame next to the notebook, without the index column.
output_path = 'cleaned_top_jobs.csv'
df_numeric.to_csv(output_path, index=False)
print("cleaned_top_jobs.csv exported successfully.")