In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# 1. Load the Dataset
# Read from the raw.githubusercontent.com address: the github.com/.../blob/...
# page for the same file serves an HTML page rather than the CSV itself.
# For offline work, point DATA_URL at a local copy (e.g. 'top_jobs.csv').
DATA_URL = 'https://raw.githubusercontent.com/dpm24800/top-jobs-scraper/main/data/top_jobs.csv'
df = pd.read_csv(DATA_URL)

In [11]:
# 2. BASIC DATA INSPECTION
print("Dataset Shape:", df.shape)
print("\nData Types:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nDuplicate Rows:", df.duplicated().sum())

Dataset Shape: (197, 6)

Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Post        197 non-null    object
 1   Company     197 non-null    object
 2   Experience  197 non-null    object
 3   Level       197 non-null    object
 4   Salary      197 non-null    object
 5   Deadline    197 non-null    object
dtypes: object(6)
memory usage: 9.4+ KB
None

Missing Values:
Post          0
Company       0
Experience    0
Level         0
Salary        0
Deadline      0
dtype: int64

Duplicate Rows: 0


In [12]:
# Normalise the headers: trim whitespace, lower-case, spaces -> underscores.
df.columns = [
    header.strip().lower().replace(" ", "_")
    for header in df.columns
]
df.head(5)

Unnamed: 0,post,company,experience,level,salary,deadline
0,Front Desk Officer,Autism Care Nepal Society,More than 1 year,Mid Level,Not Disclosed,2026-01-01
1,Territory Sales Officer,Apollo Paints,More than 1 year,Mid Level,Not Disclosed,2026-01-26
2,Project Officer,Sneha's Care,More than 2 years,Mid Level,Not Disclosed,2026-01-10
3,School Teachers,Ace School,More than 2 years,Mid Level,Not Disclosed,2026-01-10
4,Regional Sales Coordinator (Mobile Phone),ZTE Nepal,More than 3 years,Mid Level,Not Disclosed,2026-01-10


In [13]:
def encode_experience(value):
    """Convert a free-text experience requirement into approximate years.

    Matching is case-insensitive and ignores surrounding whitespace.

    Encoding rules, checked in this order:
      * missing value (NaN / None)  -> None
      * contains "not required"     -> 0.0
      * contains "less than"        -> N - 0.5, floored at 0.0
                                       (0.5 when the text holds no number)
      * contains "more than" and N  -> N + 0.5
      * contains "equal" and N      -> N
      * anything else               -> None

    where N is the first number found in the text (decimals allowed).

    Parameters
    ----------
    value : str or missing
        Raw cell such as "More than 2 years". Non-string, non-missing values
        are converted with ``str()`` before matching instead of raising.

    Returns
    -------
    float or None
        Encoded years of experience, or None when the value is missing or
        does not follow one of the recognised patterns.
    """
    if pd.isna(value):
        return None

    text = str(value).strip().lower()

    if "not required" in text:
        return 0.0

    # First number in the text; "1.5" is read as 1.5 rather than truncated to 1.
    match = re.search(r"\d+(?:\.\d+)?", text)
    years = float(match.group()) if match else None

    if "less than" in text:
        if years is None:
            return 0.5
        # Mirror the "more than" rule: half a year below the stated bound, so
        # "less than 1 year" stays 0.5 while "less than 3 years" becomes 2.5.
        return max(years - 0.5, 0.0)

    if years is not None:
        if "more than" in text:
            return years + 0.5
        if "equal" in text:
            return years

    return None  # unrecognised wording

# Encode every experience label element-wise into a numeric column.
df["experience_e"] = df["experience"].map(encode_experience)
df.head(5)

Unnamed: 0,post,company,experience,level,salary,deadline,experience_e
0,Front Desk Officer,Autism Care Nepal Society,More than 1 year,Mid Level,Not Disclosed,2026-01-01,1.5
1,Territory Sales Officer,Apollo Paints,More than 1 year,Mid Level,Not Disclosed,2026-01-26,1.5
2,Project Officer,Sneha's Care,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5
3,School Teachers,Ace School,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5
4,Regional Sales Coordinator (Mobile Phone),ZTE Nepal,More than 3 years,Mid Level,Not Disclosed,2026-01-10,3.5


In [14]:
def encode_level(value):
    """Map a job-level label onto an ordinal rank.

    The label is trimmed and lower-cased, then searched for the keywords
    "entry", "mid", "senior" and "top" in that order; the first keyword
    found decides the rank (1, 2, 3 and 4 respectively).

    Parameters
    ----------
    value : str or missing
        Raw level label such as "Mid Level".

    Returns
    -------
    int or None
        The rank, or None when the value is missing or contains none of
        the keywords.
    """
    if pd.isna(value):
        return None

    label = value.strip().lower()

    # Order matters: it reproduces the precedence of the keyword checks.
    keyword_ranks = (("entry", 1), ("mid", 2), ("senior", 3), ("top", 4))
    return next(
        (rank for keyword, rank in keyword_ranks if keyword in label),
        None,
    )

# Encode every level label element-wise into an ordinal column.
df["level_e"] = df["level"].map(encode_level)
df.head(5)

Unnamed: 0,post,company,experience,level,salary,deadline,experience_e,level_e
0,Front Desk Officer,Autism Care Nepal Society,More than 1 year,Mid Level,Not Disclosed,2026-01-01,1.5,2
1,Territory Sales Officer,Apollo Paints,More than 1 year,Mid Level,Not Disclosed,2026-01-26,1.5,2
2,Project Officer,Sneha's Care,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5,2
3,School Teachers,Ace School,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5,2
4,Regional Sales Coordinator (Mobile Phone),ZTE Nepal,More than 3 years,Mid Level,Not Disclosed,2026-01-10,3.5,2


In [15]:
# Parse the deadline strings into timestamps, then expose the calendar
# components as separate columns (added in year, month, day order).
df["deadline"] = pd.to_datetime(df["deadline"])

for _part in ("year", "month", "day"):
    df[_part] = getattr(df["deadline"].dt, _part)
df.head(5)

Unnamed: 0,post,company,experience,level,salary,deadline,experience_e,level_e,year,month,day
0,Front Desk Officer,Autism Care Nepal Society,More than 1 year,Mid Level,Not Disclosed,2026-01-01,1.5,2,2026,1,1
1,Territory Sales Officer,Apollo Paints,More than 1 year,Mid Level,Not Disclosed,2026-01-26,1.5,2,2026,1,26
2,Project Officer,Sneha's Care,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5,2,2026,1,10
3,School Teachers,Ace School,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5,2,2026,1,10
4,Regional Sales Coordinator (Mobile Phone),ZTE Nepal,More than 3 years,Mid Level,Not Disclosed,2026-01-10,3.5,2,2026,1,10


In [16]:
def parse_salary(value):
    """Parse a free-text salary into monthly minimum, maximum and average.

    Every number in the text is extracted. Digits may be grouped with commas
    in any pattern ("25000", "25,000", "1,50,000") and may carry a decimal
    part ("25,000.50"). The smallest and largest numbers become the minimum
    and maximum; a single number is used for both. When the text contains
    "yearly", both amounts are divided by 12 to express them per month.

    Parameters
    ----------
    value : str or missing
        Raw salary cell, e.g. "Nrs. 20,000 - 30,000 Monthly". Non-string,
        non-missing values are converted with ``str()`` before parsing.

    Returns
    -------
    pandas.Series
        Three floats ``[min_salary, max_salary, avg_salary]``; all NaN when
        the value is missing, contains "not disclosed", or holds no number.
    """
    if pd.isna(value):
        return pd.Series([np.nan, np.nan, np.nan])

    text = str(value).lower().strip()

    if "not disclosed" in text:
        return pd.Series([np.nan, np.nan, np.nan])

    is_yearly = "yearly" in text

    # One token per number: digits, optional comma-separated groups of any
    # width, optional decimal part. A fixed three-digit grouping would split
    # "25000" into 250 and 00, and "1,50,000" into 1 and 50,000.
    tokens = re.findall(r"\d+(?:,\d+)*(?:\.\d+)?", text)
    amounts = [float(token.replace(",", "")) for token in tokens]

    if not amounts:
        return pd.Series([np.nan, np.nan, np.nan])

    # min() and max() of a single-element list are that element, which covers
    # the single-salary case as well.
    min_salary, max_salary = min(amounts), max(amounts)

    # Convert yearly figures to monthly ones.
    if is_yearly:
        min_salary /= 12
        max_salary /= 12

    avg_salary = (min_salary + max_salary) / 2

    return pd.Series([min_salary, max_salary, avg_salary])


# parse_salary returns a three-element Series per row, so apply() expands the
# result into three columns that are assigned to the new names in order.
salary_columns = ["salary_min", "salary_max", "salary_avg"]
df[salary_columns] = df["salary"].apply(parse_salary)
df.head(5)

Unnamed: 0,post,company,experience,level,salary,deadline,experience_e,level_e,year,month,day,salary_min,salary_max,salary_avg
0,Front Desk Officer,Autism Care Nepal Society,More than 1 year,Mid Level,Not Disclosed,2026-01-01,1.5,2,2026,1,1,,,
1,Territory Sales Officer,Apollo Paints,More than 1 year,Mid Level,Not Disclosed,2026-01-26,1.5,2,2026,1,26,,,
2,Project Officer,Sneha's Care,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5,2,2026,1,10,,,
3,School Teachers,Ace School,More than 2 years,Mid Level,Not Disclosed,2026-01-10,2.5,2,2026,1,10,,,
4,Regional Sales Coordinator (Mobile Phone),ZTE Nepal,More than 3 years,Mid Level,Not Disclosed,2026-01-10,3.5,2,2026,1,10,,,


In [17]:
# Drop the raw text columns and let their encoded counterparts take over the
# original names. Only these four columns are removed, so "post" and
# "company" are still carried along as text.
df_numeric = (
    df
    .drop(columns=["experience", "level", "salary", "deadline"], errors="ignore")
    .rename(columns={"experience_e": "experience", "level_e": "level"})
)

df_numeric.head(5)

Unnamed: 0,post,company,experience,level,year,month,day,salary_min,salary_max,salary_avg
0,Front Desk Officer,Autism Care Nepal Society,1.5,2,2026,1,1,,,
1,Territory Sales Officer,Apollo Paints,1.5,2,2026,1,26,,,
2,Project Officer,Sneha's Care,2.5,2,2026,1,10,,,
3,School Teachers,Ace School,2.5,2,2026,1,10,,,
4,Regional Sales Coordinator (Mobile Phone),ZTE Nepal,3.5,2,2026,1,10,,,


In [18]:
# Export Cleaned Data
# index=False keeps the pandas row index out of the written file.
output_path = 'cleaned_top_jobs.csv'
df_numeric.to_csv(output_path, index=False)
print("cleaned_top_jobs.csv exported successfully.")

cleaned_top_jobs.csv exported successfully.
