In [55]:
import pandas as pd
import numpy as np

In [56]:
# Read the source CSV into a DataFrame (the path is relative to the working
# directory the notebook is run from)
file_path = "ESS10SC-subset.csv"
job_sat = pd.read_csv(filepath_or_buffer=file_path)


In [57]:
# Build the working frame by removing the columns listed below.
# `drop` returns a new frame, so `job_sat` itself is left untouched.
clean_job_sat = job_sat.drop(
    labels=[
        "idno", "dweight", "pweight", "pspwght", "name", "essround",
        "edition", "proddate", "edlvhpl", "edlvdse", "prob", "stratum",
        "psu", "yrbrn", "crpdwk", "pdjobyr", "njbspv", "emplno",
    ],
    axis="columns",
)

In [58]:
# Per-column cut-offs for the value filters: the filtering loop keeps a row
# only when its value in the column is strictly below the number given here.
filters = dict(
    stfmjob=66,
    happy=66,
    inprdsc=66,
    health=7,
    hlthhmp=7,
    rlgdgr=66,
    brncntr=3,
    gndr=3,
    agea=666,
    rshpsts=66,
    domicil=7,
    edulvlb=5555,
    eduyrs=66,
    emplrel=6,
    wrkctra=6,
    estsz=6,
    wkdcorga=66,
    wkhtot=666,
    nacer2=666,
    tporgwk=66,
    uemp3m=6,
    hincsrca=66,
    hinctnta=66,
    emprelp=6,
    atncrse=7,
    trdawrk=6,
    jbprtfp=6,
    pfmfdjba=6,
    dcsfwrka=6,
)

In [59]:
# Keep only the rows whose value is strictly below the threshold in every
# filtered column. A comparison against NaN evaluates to False, so rows with a
# missing value in any filtered column are dropped as well.
valid_rows = pd.Series(True, index=clean_job_sat.index)
for col, threshold in filters.items():
    valid_rows &= clean_job_sat[col] < threshold

# Take an explicit copy of the selection: the cells that follow assign columns
# on `clean_job_sat`, and doing that on the result of a boolean selection makes
# pandas emit SettingWithCopyWarning because it cannot tell whether the write
# was meant for the original frame.
clean_job_sat = clean_job_sat.loc[valid_rows].copy()

In [60]:
# Store the 'cntry' column with the pandas category dtype
clean_job_sat = clean_job_sat.astype({"cntry": "category"})

In [61]:
# Show the filtered frame as the cell output
clean_job_sat

Unnamed: 0,cntry,anweight,happy,inprdsc,health,hlthhmp,rlgdgr,brncntr,gndr,agea,...,uemp3m,hincsrca,hinctnta,emprelp,atncrse,stfmjob,trdawrk,jbprtfp,pfmfdjba,dcsfwrka
0,DE,0.843071,8,2,3,2,8,1,1,56,...,2,1,9,1,2,8,4,4,4,2
4,DE,0.620281,9,4,1,3,3,1,1,41,...,2,2,9,2,1,6,2,3,2,3
9,DE,0.531075,8,3,2,3,5,1,2,58,...,2,2,9,2,1,7,3,3,3,2
19,DE,0.874986,8,2,4,1,0,1,1,55,...,2,1,8,2,1,8,4,1,1,2
21,DE,0.914222,7,4,1,3,8,1,2,44,...,2,1,7,1,1,10,3,1,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13054,SE,0.576273,6,4,2,3,7,1,2,45,...,1,1,9,1,1,9,3,4,2,3
13056,SE,0.403964,6,1,3,2,0,1,2,46,...,1,1,7,1,1,8,5,4,2,2
13063,SE,0.506704,7,3,2,3,0,1,1,49,...,1,1,7,1,2,8,3,2,2,2
13067,SE,0.553192,6,3,1,2,6,1,1,21,...,1,3,4,1,2,8,3,1,1,2


### Recoding categorical variables

In [62]:
# Relationship status: swap the numeric codes 1-6 for their text labels and
# store the result as a categorical column. Codes without a label become NaN.
rshpsts_labels = dict(enumerate(
    [
        "Legally married",
        "In a legally registered civil union",
        "Living with my partner - not legally recognised",
        "Living with my partner - legally recognised",
        "Legally separated",
        "Legally divorced/Civil union dissolved",
    ],
    start=1,
))
clean_job_sat["rshpsts"] = (
    clean_job_sat["rshpsts"].map(rshpsts_labels).astype("category")
)

In [63]:
# Domicile: swap the numeric codes 1-5 for their text labels and store the
# result as a categorical column. Codes without a label become NaN.
domicil_labels = dict(enumerate(
    [
        "A big city",
        "Suburbs or outskirts of big city",
        "Town or small city",
        "Country village",
        "Farm or home in countryside",
    ],
    start=1,
))
clean_job_sat["domicil"] = (
    clean_job_sat["domicil"].map(domicil_labels).astype("category")
)

In [64]:
# Work contract: swap the numeric codes 1-3 for their text labels and store
# the result as a categorical column. Codes without a label become NaN.
wrkctra_labels = dict(enumerate(["Unlimited", "Limited", "No contract"], start=1))
clean_job_sat["wrkctra"] = (
    clean_job_sat["wrkctra"].map(wrkctra_labels).astype("category")
)

In [65]:

# Employment relation: swap the numeric codes 1-3 for their text labels and
# store the result as a categorical column. Codes without a label become NaN.
emplrel_labels = dict(enumerate(
    ["Employee", "Self-employed", "Working for own family business"],
    start=1,
))
clean_job_sat["emplrel"] = (
    clean_job_sat["emplrel"].map(emplrel_labels).astype("category")
)

In [66]:
# NACER2 recoding: (label, codes) pairs, checked in the order listed
_NACER2_GROUPS = (
    (
        "Physical Work",
        (1, 2, 3, 5, 6, 7, 8, 9, 41, 42, 43, 49, 50, 51, 52, 53, 80, 81),
    ),
    (
        "Manufacturing",
        tuple(range(10, 34)),
    ),
    (
        "Intellectual Work",
        tuple(range(58, 76)) + tuple(range(85, 89)),
    ),
    (
        "Service & Administration",
        (35, 36, 37, 38, 39, 45, 46, 47, 55, 56, 57, 77, 78, 79, 82, 84)
        + tuple(range(90, 100)),
    ),
)


def recode_nacer2(val):
    """Collapse a single `nacer2` code into a broad group label.

    Parameters
    ----------
    val : scalar
        One value of the `nacer2` column.

    Returns
    -------
    str
        "Physical Work", "Manufacturing", "Intellectual Work" or
        "Service & Administration" when `val` equals one of the codes listed
        for that group; "Missing/Other" for every other value (including NaN).
    """
    for label, codes in _NACER2_GROUPS:
        if val in codes:
            return label
    return "Missing/Other"

# Replace each nacer2 code with its group label and store as a categorical
clean_job_sat["nacer2"] = (
    clean_job_sat["nacer2"].map(recode_nacer2).astype("category")
)

In [67]:
# Education level recoding: (label, codes) pairs, checked in the order listed
_EDULVLB_GROUPS = (
    ("Primary Education", (0, 113)),
    ("Lower Secondary Education", (129, 212, 213, 221, 222, 223)),
    ("Upper Secondary Education", (229, 311, 312, 313, 321, 322, 323)),
    ("Post-Secondary Non-Tertiary Education", (412, 413, 421, 422, 423)),
    ("Tertiary Education", (510, 520, 610, 620, 710, 720, 800)),
    ("Other/Missing", (5555, 7777, 8888, 9999)),
)


def recode_edulvlb(val):
    """Collapse a single `edulvlb` code into a broad education label.

    Parameters
    ----------
    val : scalar
        One value of the `edulvlb` column.

    Returns
    -------
    str
        The label of the group whose code list contains `val`, or "Unknown"
        when `val` matches none of the listed codes (including NaN).
    """
    for label, codes in _EDULVLB_GROUPS:
        if val in codes:
            return label
    return "Unknown"

# Category list for the recoded education column, in the sequence in which the
# categories should be stored
education_levels = [
    "Primary Education",
    "Lower Secondary Education",
    "Upper Secondary Education",
    "Post-Secondary Non-Tertiary Education",
    "Tertiary Education",
    "Other/Missing",
    "Unknown",
]

# Recode every value, then store the column as an unordered categorical that
# uses exactly the categories above
clean_job_sat["edulvlb"] = pd.Categorical(
    clean_job_sat["edulvlb"].map(recode_edulvlb),
    categories=education_levels,
    ordered=False,
)

In [68]:
# Type of organisation worked for: codes 1-5 get a text label; every value
# without an entry in the map is labelled "N/A". Stored as a categorical.
tporgwk_map = dict(enumerate(
    [
        "Central or local government",
        "Other public sector (ex. education and health)",
        "A state owned enterprise",
        "A private firm",
        "Self employed",
    ],
    start=1,
))

clean_job_sat["tporgwk"] = (
    clean_job_sat["tporgwk"]
    .map(tporgwk_map)
    .fillna("N/A")
    .astype("category")
)

In [69]:
# Main source of income: codes 1-8 get a text label; every value without an
# entry in the map is labelled "N/A". Stored as a categorical.
hincsrca_map = dict(enumerate(
    [
        "Wages or salaries",
        "Income from self-employment (excluding farming)",
        "Income from farming",
        "Pensions",
        "Unemployment/redundancy benefit",
        "Any other social benefits or grants",
        "Income from investments, savings etc.",
        "Income from other sources",
    ],
    start=1,
))

clean_job_sat["hincsrca"] = (
    clean_job_sat["hincsrca"]
    .map(hincsrca_map)
    .fillna("N/A")
    .astype("category")
)

In [70]:
# Employment relationship (emprelp): codes 1-3 get a text label; every value
# without an entry in the map is labelled "N/A". Stored as a categorical.
emprelp_map = dict(enumerate(
    ["Employee", "Self-employed", "Working for own family business"],
    start=1,
))

clean_job_sat["emprelp"] = (
    clean_job_sat["emprelp"]
    .map(emprelp_map)
    .fillna("N/A")
    .astype("category")
)


In [71]:
# Health status: build an unordered categorical over the codes 1-5 (any other
# value becomes NaN), then relabel the five categories position by position.
health_labels = ["Very good", "Good", "Fair", "Bad", "Very Bad"]

clean_job_sat["health"] = pd.Categorical(
    clean_job_sat["health"],
    categories=[1, 2, 3, 4, 5],
    ordered=False,
).rename_categories(health_labels)


In [72]:
# Other ordinal variables: four items that share one 1-5 frequency scale
frequency_levels = [1, 2, 3, 4, 5]
frequency_labels = ["Never", "Hardly ever", "Sometimes", "Often", "Always"]

# Same treatment for each item: unordered categorical over the codes 1-5 (any
# other value becomes NaN), relabelled position by position
for col in ["trdawrk", "jbprtfp", "pfmfdjba", "dcsfwrka"]:
    clean_job_sat[col] = pd.Categorical(
        clean_job_sat[col],
        categories=frequency_levels,
        ordered=False,
    ).rename_categories(frequency_labels)


### Recoding binary variables

In [73]:
# Hampered in daily activities by illness/disability/infirmity/mental problem:
# True for codes 1 and 2, False for anything else
clean_job_sat["hlthhmp"] = clean_job_sat["hlthhmp"].isin([1, 2])

# Yes/no items: True when the code is 1, False for any other value
#   brncntr - born in country
#   uemp3m  - unemployed
#   atncrse
for col in ["brncntr", "uemp3m", "atncrse"]:
    clean_job_sat[col] = clean_job_sat[col].eq(1)

# Gender as a 0/1 indicator: 1 (male) -> 0, 2 (female) -> 1
clean_job_sat["gndr"] = clean_job_sat["gndr"].map({1: 0, 2: 1})

# 0/1 flags: True when the value is 1, False for any other value
for col in ["uempla", "uempli", "rtrd", "hswrk"]:
    clean_job_sat[col] = clean_job_sat[col].eq(1)


### Recoding main variable of interest

In [74]:
# Inclusive (low, high) score bands; the band's 1-based position is its group
_STFMJOB_BANDS = ((0, 2), (3, 4), (5, 6), (7, 8), (9, 10))


def categorize_stfmjob(value):
    """Collapse a 0-10 `stfmjob` score into a group number from 1 to 5.

    Parameters
    ----------
    value : scalar
        One value of the `stfmjob` column.

    Returns
    -------
    int or float
        1 for 0-2, 2 for 3-4, 3 for 5-6, 4 for 7-8, 5 for 9-10. `np.nan` is
        returned for missing input and for any value that falls inside none of
        the bands (e.g. the codes 66-99, negatives, or a value between bands).
    """
    if pd.isna(value):
        return np.nan
    for group, (low, high) in enumerate(_STFMJOB_BANDS, start=1):
        if low <= value <= high:
            return group
    return np.nan

# Labels for the five satisfaction groups, lowest to highest
satisfaction_levels = [
    "Very Dissatisfied",
    "Somewhat Dissatisfied",
    "Neutral",
    "Somewhat Satisfied",
    "Very Satisfied",
]

# Group number (1-5) for every raw score, stored as an ordered categorical
clean_job_sat["stfmjob_grouped"] = pd.Categorical(
    clean_job_sat["stfmjob"].apply(categorize_stfmjob),
    categories=[1, 2, 3, 4, 5],
    ordered=True,
)

# Text version of the same grouping, also an ordered categorical
group_to_label = dict(zip([1, 2, 3, 4, 5], satisfaction_levels))
clean_job_sat["stfmjob_named"] = pd.Categorical(
    clean_job_sat["stfmjob_grouped"].map(group_to_label),
    categories=satisfaction_levels,
    ordered=True,
)

In [75]:
# Keep only the rows that received a satisfaction group
has_group = clean_job_sat["stfmjob_grouped"].notna()
clean_job_sat = clean_job_sat.loc[has_group]

### Save to CSV

In [76]:
# Write the cleaned frame to disk without the row index
clean_job_sat.to_csv(path_or_buf="clean_job_sat.csv", index=False)