Here I deal with creating an adequate list for occupations

In [44]:
#Loading packages (hopefully installed, all is correct version and whatnot)

# Data manipulation
import numpy as np
import pandas as pd
import pickle

# Statistical analysis
import scipy.stats as stats

# Language processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy
import textblob as TextBlob
import contractions
import string
from collections import Counter
from nltk.corpus import wordnet as wn
from spacy.matcher import PhraseMatcher

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

nltk.download('punkt')       # Tokeniser
nltk.download('stopwords')   # Stopwords list
nltk.download('wordnet')     # Lemmatiser
nlp = spacy.load('en_core_web_sm')

stop_words = set(stopwords.words('english')) # Initialise stopwords
lemmatizer = WordNetLemmatizer() # Initialise lemmatiser
occupations = pd.read_csv("All_Occupations.csv")

[nltk_data] Downloading package punkt to C:\Users\andra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\andra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
#split "and" into two separate occupations
def split_and_expand(text):
    parts = text.split(" and ")
    if len(parts) != 2:
        return [text.strip()]  # Return the original text if "and" not found
    left_part = parts[0].strip()
    right_part = parts[1].strip()
    
    # Expand left part
    left_expanded = []
    if ',' in left_part:
        left_subparts = [subpart.strip() for subpart in left_part.split(',')]
        for subpart in left_subparts:
            left_expanded.append(f"{subpart} {right_part}".strip())
    else:
        left_expanded.append(left_part)
    
    # Expand right part
    right_expanded = []
    if ',' in right_part:
        right_subparts = [subpart.strip() for subpart in right_part.split(',')]
        for subpart in right_subparts:
            right_expanded.append(f"{left_part} {subpart}".strip())
    else:
        right_expanded.append(right_part)
    return [left_expanded, right_expanded]
# Apply the function to each row → produces lists
occupations["Occupation expanded"] = occupations['Occupation'].apply(split_and_expand)
# Explode lists into separate rows
occupations = occupations.explode("Occupation expanded", ignore_index=True)
occupations = occupations.rename(columns={"Occupation": "Occupation_drop"})
occupations = occupations.rename(columns={"Occupation expanded": "Occupation"})
#drop Occupation_drop
occupations = occupations.drop(columns=['Occupation_drop'])
#elements that are lists should just be strings
occupations['Occupation'] = occupations['Occupation'].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)

#make lower string
occupations['Occupation'] = occupations['Occupation'].str.lower()

#remove duplicates
occupations = occupations.drop_duplicates(subset=['Occupation'], ignore_index=True)

#drop Data-level columns
occupations = occupations.drop(columns=['Data-level'])

print(occupations)
occupations.to_csv("occupations1.csv", index=False)

      Job Zone        Code           Occupation
0          4.0  13-2011.00          accountants
1          4.0  13-2011.00             auditors
2          2.0  27-2011.00               actors
3          4.0  15-2011.00            actuaries
4          5.0  29-1291.00       acupuncturists
...        ...         ...                  ...
1159       2.0  43-9022.00              typists
1160       4.0  27-3043.00              writers
1161       4.0  27-3043.00              authors
1162       4.0  19-1023.00           zoologists
1163       4.0  19-1023.00  wildlife biologists

[1164 rows x 3 columns]


In [46]:
#work with occupations1.csv. find common occurences of words
from collections import Counter
import nltk
stop_words = set(nltk.corpus.stopwords.words('english'))
occupations1 = pd.read_csv("occupations1.csv")
occupation1 = occupations1.dropna(subset=['Occupation']).reset_index(drop=True)
all_words = ' '.join(occupation1['Occupation']).split()
word_counts = Counter(all_words)
common_words = word_counts.most_common(1000)
print(common_words)

#remove stopwords from most common words
common_words = [(word, count) for word, count in common_words if word.lower() not in stop_words]
print(common_words)

#save most common words to csv
common_words_df = pd.DataFrame(common_words, columns=['Word', 'Count'])
common_words_df.to_csv("most_common_words.csv", index=False)

[('and', 143), ('other', 58), ('all', 52), ('technicians', 51), ('workers', 49), ('managers', 48), ('operators', 40), ('specialists', 39), ('engineers', 39), ('machine', 38), ('teachers,', 37), ('operators,', 29), ('of', 26), ('postsecondary', 26), ('equipment', 25), ('technologists', 24), ('education', 22), ('workers,', 22), ('supervisors', 22), ('setters,', 22), ('clerks', 21), ('first-line', 21), ('officers', 19), ('service', 19), ('except', 19), ('installers', 18), ('repairers', 18), ('metal', 18), ('tenders', 17), ('analysts', 17), ('assistants', 16), ('computer', 15), ('medical', 15), ('plastic', 14), ('special', 14), ('sales', 13), ('engineering', 13), ('scientists', 13), ('food', 13), ('tenders,', 13), ('attendants', 12), ('health', 12), ('science', 11), ('therapists', 10), ('school', 10), ('agents', 9), ('inspectors', 9), ('mechanics', 9), ('systems', 9), ('plant', 9), ('social', 9), ('information', 9), ('transportation', 9), ('financial', 9), ('gambling', 9), ('operations', 8

In [47]:
occupations_1 = [
    "technicians","workers","managers","operators","specialists","engineers","teachers","technologists","supervisors","clerks","officers","installers",
    "repairers","analysts","assistants","attendants","scientists","sales","therapists","agents","inspectors","mechanics","designers","assemblers","physicians",
    "directors","counselors","aides","nurses","drivers","planners","repairers","cooks","assistants","scientists","truck drivers","cutters","cleaners","nurse",
    "coordinators","architects","representatives","administrators","secretaries","police","writers","occupational therapists","instructors","trainers",
    "drafters","artists","biologists","investigators","psychologists","administrators","dental assistants","advisors","engineers","executive","firefighters",
    "housekeeping staff","retail workers","funeral directors","lawyers","librarians","marketing specialists","trimmers","ophthalmic technicians","pediatric nurses",
    "railroad workers","teachers","web developers", "auditors", "performers", "athletes", "controllers", "caretakers", "appraisers", "assessors", "collectors", 
    "brickmasons", "carpenters", "pilots", "cashiers", "neuropsychologists", "programmers", "laborers", "clerks", "detectives", "economists", "editors", 
    "epidemiologists", "fabricators", "surveyors", "practitioners", "maids", "midwives", "military", "nursing assistants", "surgeons", "salespersons", "testers", 
    "therapists", "physicians", "pathologists", "pumpers", "recycling workers", "robotics engineers", "representatives", "teachers", "software developers", 
    "speech-language pathologists", "tax preparers", "veterinary technicians", "accountants", "actors", "actuaries", "acupuncturists", "allergists", "immunologists", 
    "anesthesiologists", "breeders", "anthropologists", "archeologists", "archivists", "astronomers", "audiologists", "bailiffs", "bakers", "barbers", "baristas", 
    "bartenders", "bellhops", "biochemists", "biophysicists", "bioengineers", "biostatisticians", "boilermakers", "blockmasons", "announcers", "jockeys", "butchers", 
    "buyers", "cabinetmakers", "cardiologists", "cartographers", "photogrammetrists", "chefs", "cooks", "chemists", "chiropractors", "executives", "choreographers", 
    "clergy", "coaches", "scouts", "divers", "engineers", "architects", "concierges", "coroners", "jailers", "cost estimators", "counselors", "couriers", "messengers", 
    "court reporters", "captioners", "craft workers", "crane operators", "crematory operators", "curators", "customer service representatives", "cytotechnologists", 
    "dancers", "data entry keyers", "demonstrators", "promoters", "dental hygienists", "dermatologists", "designers", "publishers", "diagnostic sonographers", 
    "dietitians", "nutritionists", "dishwashers", "drafters", "driver/sales workers", "drywall installers", "electricians", "electromechanical technicians", 
    "elevator installers", "embalmers", "entertainers", "etchers", "engravers", "excavating operators", "physiologists", "patternmakers", "fallers", "educators", 
    "contractors", "farmworkers", "fashion designers", "fence erectors", "fiberglass laminators", "illustrators", "firefighters", "groundskeepers", "material-moving workers", 
    "specialists", "wardens", "fishers", "foresters", "foundry workers", "coremakers", "fuel cell technicians","fundraisers", "dealers", "geneticists", "geographers", 
    "glaziers", "cosmetologists", "registrars", "practitioners", "heating and air conditioning technicians", "tractor-trailer drivers", "helpers", "historians", 
    "histotechnologists", "hospitalists", "hosts", "hostesses", "ergonomists", "hydrologists", "ecologists", "industrial-organizational psychologists", "weighers", "appraisers", 
    "underwriters", "interior designers", "interpreters", "translators", "janitors", "jewelers", "magistrates", "movers", "landscapers", "lathe operators", "laundry workers", 
    "lawyers", "legislators", "librarians", "locksmiths", "logisticians", "feeders", "offbearers", "machinists", "manicurists", "pedicurists", "mathematicians", "programmers", 
    "directors", "managers", "dosimetrists", "transcriptionists", "merchandise displayers", "metal-refining workers", "microbiologists", "enlisted personnel", "officers", 
    "millwrights", "mechanics", "casting workers", "arrangers", "projectionists", "museum conservators", "composers","musical tuners", "musicians", "singers", "nannies", 
    "neurologists", "journalists", "anesthetists", "obstetricians", "gynecologists", "merchants", "ophthalmologists", "opticians", "optometrists", "orthodontists", 
    "orthotists", "orthoptists", "prosthetists", "pediatricians", "pharmacists", "phlebotomists", "photographers", "physicians", "physicists", "driver", "pipelayers", 
    "plasterers", "podiatrists", "sheriffs", "patrol officers", "carriers", "postmasters", "superintendents", "potters", "pourers", "distributors", "dispatchers", "producers", 
    "proofreaders", "prosthodontists", "psychiatrists", "psychologists", "president", "ceo", "cfo", "cto", "coo", "founder", "entrepreneur","astronaut", "cosmonaut", 
    "marine biologist", "zoologist", "geologist","paleontologist", "archaeologist", "environmental scientist", "climatologist","software architect", "data scientist", 
    "machine learning engineer", "ai researcher","quantitative analyst", "actuary", "urban planner", "civil engineer", "mechanical engineer","electrical engineer", 
    "chemical engineer", "nuclear engineer", "aerospace engineer","pilot", "air traffic controller", "ship captain", "sailor", "diplomat", "ambassador","senator", 
    "congressperson", "judge", "chief justice", "law enforcement officer","detective", "fire chief", "paramedic", "emergency medical technician","composer", 
    "conductor", "choreographer", "artist", "actor", "director", "producer","photographer", "journalist", "editor", "novelist", "screenwriter", "poet","philosopher", 
    "historian", "linguist", "psychologist", "psychiatrist","surgeon", "cardiologist", "oncologist", "neurologist", "radiologist", "dentist","veterinarian", "nurse practitioner", 
    "midwife", "therapist", "social worker","teacher", "professor", "lecturer", "researcher", "scientist"
]



In [48]:
#synonyms for occupations
from nltk.corpus import wordnet as wn

def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word, pos=wn.NOUN):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return synonyms

In [49]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(occ) for occ in occupations]
matcher.add("OCCUPATION", patterns)

# --- Step 3: Process your text ---
text = """
John is a software engineer. Mary works as a nurse and data analyst. 
Alex is an electrician, and Sarah is a pilot. Many scientists also work as teachers.
"""

doc = nlp(text)

# --- Step 4: Match occupations in the text ---
matches = matcher(doc)
found_occupations = set()  # using set to deduplicate
for match_id, start, end in matches:
    found_occupations.add(doc[start:end].text.lower())

# --- Step 5: Extract noun phrases as additional potential occupations ---
# This helps catch titles not in your reference list
for chunk in doc.noun_chunks:
    # Simple heuristic: length <= 3 words, contains a known occupation word
    words = [w.text.lower() for w in chunk]
    if any(w in occupations for w in words):
        found_occupations.add(chunk.text.lower())

# --- Step 6: Output deduplicated occupations ---
print(sorted(found_occupations))

[]


---
# External datasets


## Functions

The datasets that we will see need to be cleaned. There are multiple cleaning procedures that are necessary
1. If there is a comma at the end, and it is followed by 2 or 1 words, remove the comma and the words.
2. If there is a sentence with example1, example2, and example3 manager -> turn it into [example1 manager, example2 manager, example3 manager] and then explode the dataset
3. if there is example1 and example2 operator -> [example1 operator, example2 operator]
4. if there is care-taker of tables and chairs -> [care-taker of tables, care-taker of chairs]
5. Drop everything including "in" from the text
6. If there is something like fitter and turner -> [fitter, turner]
This all needs to be done to each of the datasets.

Other pre-processing steps include
1. making everything lower case
2. removing duplicates

In [50]:
import pandas as pd
import re
from itertools import product

def preprocess_occupation(data):
    # 1. Make everything lowercase
    data['Occupation'] = data['Occupation'].str.lower()

    # 2. Remove duplicates
    data = data.drop_duplicates(subset=['Occupation'], ignore_index=True)
    
    # 3. Remove trailing comma followed by 1 or 2 words
    data['Occupation'] = data['Occupation'].apply(
        lambda x: re.sub(r',\s*\b\w+\b(\s+\b\w+\b)?$', '', x)
    )

    # 4. Handle "example1, example2, and example3 manager" -> [example1 manager, example2 manager, example3 manager]
    def expand_commas(text):
        # Pattern: words separated by commas + "and" + final word(s)
        match = re.match(r'(.+?)\s+(?:and\s+)?(\w+\s*\w*)$', text)
        if match and ',' in text:
            parts = re.split(r',\s*', match.group(1))
            last = match.group(2)
            # Strip spaces and append last word(s)
            expanded = [p.strip() + ' ' + last for p in parts]
            return expanded
        return [text]

    # 5. Handle "example1 and example2 operator" -> [example1 operator, example2 operator]
    def expand_and(text):
        match = re.match(r'(.+?)\s+and\s+(.+?)\s+(\w+)$', text)
        if match:
            return [match.group(1) + ' ' + match.group(3), match.group(2) + ' ' + match.group(3)]
        return [text]

    # 6. Handle "care-taker of tables and chairs" -> [care-taker of tables, care-taker of chairs]
    def expand_of_and(text):
        match = re.match(r'(.+? of) (.+?) and (.+)', text)
        if match:
            return [match.group(1) + ' ' + match.group(2), match.group(1) + ' ' + match.group(3)]
        return [text]

    # 7. Drop everything including "in"
    def drop_in(text):
        return text.split(' in ')[0].strip()

    # 8. Handle "fitter and turner" -> [fitter, turner]
    def split_simple_and(text):
        if re.match(r'^\w+\s+and\s+\w+$', text):
            return [t.strip() for t in text.split(' and ')]
        return [text]

    # Combine all expansion rules
    all_expanded = []
    for occ in data['Occupation']:
        temp = expand_commas(occ)
        temp2 = []
        for t in temp:
            temp2.extend(expand_and(t))
        temp3 = []
        for t in temp2:
            temp3.extend(expand_of_and(t))
        temp4 = []
        for t in temp3:
            t = drop_in(t)
            temp4.extend(split_simple_and(t))
        all_expanded.extend(temp4)

    # Create final DataFrame
    clean_data = pd.DataFrame({'Occupation': list(set(all_expanded))})
    clean_data = clean_data.sort_values('Occupation').reset_index(drop=True)

    return clean_data


## EU data

In [51]:
# EU dataset
eu_occ = pd.read_csv("occupations_en.csv")
eu_occ = eu_occ[['preferredLabel']]
eu_occ.columns = ['Occupation']
print(len(eu_occ))
eu_occ = preprocess_occupation(eu_occ)

print(len(eu_occ))
print(eu_occ)

3039
3081
                          Occupation
0                        3d animator
1                        3d modeller
2             3d printing technician
3         abrasive blasting operator
4     absorbent pad machine operator
...                              ...
3076                    zoo educator
3077                   zoo registrar
3078              zoo section leader
3079                       zookeeper
3080              zoology technician

[3081 rows x 1 columns]


In [52]:
#locate all "and" in eu_occ
print(eu_occ[eu_occ['Occupation'].str.contains(' and ')])
#drop all "and" in eu_occ
eu_occ = eu_occ[~eu_occ['Occupation'].str.contains(' and ')]

                                             Occupation
1360  heating equipment and supplies distribution ma...
2779             telecommunications equipment and parts
2780  telecommunications equipment and parts distrib...
2811            textile semi-finished and raw materials


In [53]:
#save the data
eu_occ.to_csv("eu_occupations_cleaned.csv", index=False)

In [54]:
#most frequent words in EU dataset
eu_occ = pd.read_csv("eu_occupations_cleaned.csv")
from collections import Counter
all_words_eu = ' '.join(eu_occ['Occupation']).split()
word_counts_eu = Counter(all_words_eu)
most_common_words_eu = word_counts_eu.most_common(1000)
print(most_common_words_eu)
np.savetxt("most_common_words_eu.csv", most_common_words_eu, delimiter=",", fmt='%s')
# additional list from most common terms

[('manager', 376), ('operator', 297), ('technician', 208), ('engineer', 152), ('machine', 99), ('officer', 99), ('worker', 89), ('supervisor', 85), ('teacher', 74), ('seller', 57), ('assistant', 53), ('inspector', 53), ('distribution', 48), ('ict', 47), ('engineering', 46), ('equipment', 46), ('production', 45), ('specialised', 44), ('shop', 44), ('maker', 44), ('leather', 39), ('assembler', 38), ('lecturer', 38), ('goods', 37), ('school', 35), ('designer', 35), ('analyst', 34), ('textile', 32), ('instructor', 30), ('quality', 30), ('social', 30), ('consultant', 29), ('coordinator', 29), ('director', 28), ('construction', 27), ('specialist', 26), ('animal', 25), ('footwear', 25), ('plant', 24), ('vehicle', 24), ('secondary', 23), ('developer', 23), ('aircraft', 22), ('attendant', 22), ('aquaculture', 22), ('industrial', 22), ('service', 20), ('products', 20), ('energy', 20), ('metal', 20), ('machinery', 19), ('policy', 19), ('vocational', 19), ('maintenance', 19), ('product', 19), ('sc

In [55]:
occupations_eu = [
    "manager","operator","technician","engineer","officer","worker","supervisor","teacher",
    "seller","assistant","inspector","maker","assembler","lecturer","designer","analyst",
    "instructor","consultant","coordinator","director","specialist","developer","attendant",
    "installer","driver","therapist","artist","administrator","clerk","agent","drafter","tester",
    "planner","chief","cleaner","painter","electrician","advisor","pilot","counsellor","grader",
    "representative","broker","adviser","breeder","mechanic","trader","animator","leader",
    "trainer","fitter","researcher","buyer","dispatcher","guard","technologist","builder",
    "journalist","patternmaker","surveyor","chemist","psychologist","nurse","handler","chef",
    "auditor","operative","repairer","machinist","cutter","finisher","cook","welder","upholsterer",
    "firefighter","receptionist","engraver","polisher","guide","pharmacist","underwriter",
    "physiotherapist","executive","chiropractor","biologist","restorer","assessor","audiologist",
    "butcher","metallurgist","conductor","registrar","diver","lawyer","investigator","curator",
    "radiographer","moulder","labourer","veterinarian","hairdresser","slaughterer","porter",
    "vendor","manufacturer","writer","merchandiser","programmer","detective","setter","secretary",
    "groomer","nutritionist","osteopath","captain","teller","treasurer","bartender","librarian",
    "publisher","carpenter","weaver","maintainer","scaffolder","correctional officer","miller",
    "observer","examiner","commissioner","stewardess","stylist","farmer","appraiser",
    "superintendent","educator","performer","miner","interpreter","judge","mentor","driller",
    "volunteer","modeller","actor","acupuncturist","copywriter","aesthetician","agronomist",
    "behaviourist","hydrotherapist","anthropologist","arboriculturist","archaeologist",
    "archivist","aromatherapist","artisan","assayer","astrologer","astronaut","astronomer", 'police officer'
]


In [56]:
#Kaggle - Occupation Titles
kaggle_occ = pd.read_csv('job_descriptions.csv')
print(len(kaggle_occ))
kaggle_occ = kaggle_occ[['Job Title']].drop_duplicates().reset_index(drop=True)
kaggle_occ.columns = ['Occupation']
print(len(kaggle_occ))


1615940
147


In [57]:
kaggle_occ_cleaned = preprocess_occupation(kaggle_occ)
#look for "and" in kaggle_occ
print(len(kaggle_occ_cleaned))

print(kaggle_occ_cleaned[kaggle_occ_cleaned['Occupation'].str.contains(' and ')])
#drop all "and" in kaggle_occ
kaggle_occ_cleaned = kaggle_occ_cleaned[~kaggle_occ_cleaned['Occupation'].str.contains(' and ')]

#save the data
kaggle_occ_cleaned.to_csv("kaggle_occupations_cleaned.csv", index=False)

147
Empty DataFrame
Columns: [Occupation]
Index: []


In [58]:
#To create categories, count word occurences across all occupation names (ie. "engineer", "manager", "technician", etc.) and use the most common ones as categories.
all_words = ' '.join(kaggle_occ_cleaned['Occupation']).split()
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(1000)
print(most_common_words)

np.savetxt("most_common_words_kaggle.csv", most_common_words, delimiter=",", fmt='%s')

occupations_kaggle = [
    "manager","analyst","engineer","specialist","designer","developer","coordinator","assistant","administrator","planner","nurse","director",
    "architect","representative","consultant","advisor","therapist","practitioner","executive","teacher","writer","scientist","accountant","copywriter",
    "lawyer","hygienist","physician","pediatrician","paralegal","counselor","researcher","veterinarian"
]

[('manager', 21), ('analyst', 16), ('engineer', 14), ('specialist', 10), ('designer', 9), ('developer', 7), ('marketing', 7), ('coordinator', 6), ('assistant', 5), ('network', 5), ('sales', 5), ('account', 4), ('customer', 4), ('data', 4), ('administrator', 4), ('planner', 4), ('nurse', 4), ('financial', 4), ('legal', 4), ('software', 4), ('director', 3), ('architect', 3), ('representative', 3), ('consultant', 3), ('event', 3), ('advisor', 3), ('hr', 3), ('investment', 3), ('it', 3), ('research', 3), ('therapist', 3), ('procurement', 3), ('social', 3), ('systems', 3), ('executive', 2), ('art', 2), ('teacher', 2), ('brand', 2), ('business', 2), ('chemical', 2), ('writer', 2), ('service', 2), ('support', 2), ('scientist', 2), ('database', 2), ('electrical', 2), ('environmental', 2), ('family', 2), ('practitioner', 2), ('front-end', 2), ('landscape', 2), ('market', 2), ('mechanical', 2), ('product', 2), ('project', 2), ('qa', 2), ('seo', 2), ('media', 2), ('supply', 2), ('chain', 2), ('we

In [59]:
# US Bureau of Labor Statistics - Occupational Employment and Wage Statistics (OEWS) - 2024
us_bls_occ = pd.read_excel('all_data_M_2024.xlsx', usecols=['OCC_CODE', 'OCC_TITLE'])
print(len(us_bls_occ))

414437


In [60]:
us_bls_occ_copy = us_bls_occ[['OCC_TITLE']].copy()
us_bls_occ_copy.columns = ['Occupation']
us_bls_occ_copy = preprocess_occupation(us_bls_occ_copy)
print(us_bls_occ_copy)

def split_and_jobs(text):
    # Regex to capture: everything before "and", then everything after
    match = re.match(r'(.+?)\s+and\s+(.+)$', text)
    if match:
        first_part = match.group(1).strip()
        second_part = match.group(2).strip()
        
        # Handle cases where the first part has multiple words (common prefix)
        # Example: "automotive technicians and repairers"
        first_words = first_part.split()
        second_words = second_part.split()
        
        # If second part is single word, attach prefix from first part
        if len(second_words) == 1:
            # keep prefix except last word of first part
            prefix = " ".join(first_words[:-1])
            second_job = (prefix + " " + second_words[0]).strip()
            return [first_part, second_job]
        else:
            return [first_part, second_part]
    
    # If no "and", just return the text as a single-item list
    return [text]

all_rows = []

for occ in us_bls_occ_copy['Occupation']:
    splits = split_and_jobs(occ)
    all_rows.extend(splits)  # add all split occupations

# Create a new dataframe with each occupation in its own row
us_bls_occ_expanded = pd.DataFrame(all_rows, columns=['Occupation'])

# Reset index
us_bls_occ_expanded = us_bls_occ_expanded.reset_index(drop=True)

#locate all ands in us_bls_occ
print(us_bls_occ_expanded[us_bls_occ_expanded['Occupation'].str.contains(' and ')])

#drop all ands in us_bls_occ
us_bls_occ_expanded = us_bls_occ_expanded[~us_bls_occ_expanded['Occupation'].str.contains(' and ')]

#save the data
us_bls_occ_expanded.to_csv("us_bls_occupations_cleaned.csv", index=False)
print(us_bls_occ_expanded)

#most frequent words in US BLS dataset
from collections import Counter
all_words_us = ' '.join(us_bls_occ_expanded['Occupation']).split()
word_counts_us = Counter(all_words_us)
most_common_words_us = word_counts_us.most_common(1000)
print(most_common_words_us)
np.savetxt("most_common_words_us.csv", most_common_words_us, delimiter=",", fmt='%s')

occupations_us = [
    "workers","operators","technicians","teachers","managers","repairers","specialists","clerks",
    "supervisors","installers","setters","engineers","tenders","assistants","scientists",
    "attendants","mechanics","inspectors","drivers","designers","therapists","architects",
    "counselors","drafters","cleaners","instructors","therapists","practitioners",
    "collectors","judges","advisors","tapers","artists","programmers","investigators",
    "representatives","surgeons","cooks","psychologists","planners","cutters","patternmakers",
    "directors","performers","pilots","helpers","salespersons","reporters","promoters",
    "physicians","fabricators","pipefitters","auditors","officials","trainers","appraisers",
    "cosmetologists","carpenters","blockmasons","cashiers","jockeys","laborers","guards","curators",
    "solderers","detectives","dispatchers","distributors","dressmakers","drillers","pumpers",
    "sculptors","fitness trainers","painters","lawyers","nurses","developers","writers","actors",
    "producers","servicers","casters","mediators","archivists","athletes","competitors","porters",
    "bailiffs","barbers","captains","brickmasons","butchers","chemists","executives","adjusters",
    "coaches","foresters","dentists","embalmers","paramedics","entertainers","geoscientists",
    "hydrologists","maids","educators","farmworkers","filers","firefighters","wardens","cooks",
    "hairdressers","registrars","magistrate","librarians","logisticians","superintendents",
    "woodworkers","musicians","journalists","orderlies","paperhangers","plasterers","printing workers",
    "processors","brokers","singers","testers","stonemasons","tailors","guides","umpires","welders",
    "biologists","accountants","actuaries","acupuncturists","adjudicators","anesthesiologists",
    "breeders","anthropologists","auditors","audiologists","authors","tellers","bakers","bartenders",
    "chefs","chiropractors","copilots","coremakers","jailers","couriers","crane operators","dancers",
    "hygienists","dermatologists","publishers","sonographers","dietitians","dishwashers","electricians",
    "elevator operators","engravers","epidemiologists","fallers","contractors","farmers","erectors",
    "firefighters","servers","fundraisers","dealers","glaziers","gynecologists","janitors","jewelers",
    "locksmiths","lifeguards","manicurists","mathematicians","microbiologists","millwrights","molders",
    "morticians","projectionists","mechanics","composers","neurologists","anesthetists","midwives",
    "nutritionists","obstetricians","ophthalmologists","opticians","optometrists","orthodontists",
    "orthopedic surgeons","orthotists","paralegals","pediatricians","pedicurists","aides","handlers",
    "pharmacists","phlebotomists","photogrammetrists","photographers","physicians","physicists",
    "plumbers","podiatrists","carriers","postmasters","pressers","probation officers",
    "procurement officers","proofreaders","assessors","prosthetists","prosthodontists","psychiatrists",
    "telecommunicators","radiologists","conductors","yardmasters","ranchers","receptionists",
    "referees","registered nurses","rehabilitation specialists","restaurant workers","riggers",
    "roofers","roustabouts","sailors","monitors","scouts","securities officers","sewer workers",
    "sewing machine operators"
]



                                 Occupation
0                               abstractors
1                        account collectors
2                               accountants
3                accounting auditing clerks
4                                    actors
...                                     ...
1597  woodworking machine setters operators
1598    woodworking machine setters tenders
1599            word processors and typists
1600                                writers
1601                  zoologists biologists

[1602 rows x 1 columns]
Empty DataFrame
Columns: [Occupation]
Index: []
                               Occupation
0                             abstractors
1                      account collectors
2                             accountants
3              accounting auditing clerks
4                                  actors
...                                   ...
1695  woodworking machine setters tenders
1696                      word processors
1697                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Occupation'] = data['Occupation'].apply(


In [61]:
#US census data
census_loc = 'Alphabetical-Index-of-Occupations-December-2019_Final.xlsx'
occupations = pd.read_excel(census_loc, skiprows=6)
occupations.columns = ['occupation_name', 'industry_restriction', 'occupation_code', 'SOC_code']
#print(len(occupations))
occupations = occupations[['occupation_name']]
occupations.columns = ['Occupation']
occupations = preprocess_occupation(occupations)
#print(len(occupations))


  warn(msg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Occupation'] = data['Occupation'].apply(


In [62]:
def split_and_jobs(text):
    # Regex to capture: everything before "and", then everything after
    match = re.match(r'(.+?)\s+and\s+(.+)$', text)
    if match:
        first_part = match.group(1).strip()
        second_part = match.group(2).strip()
        
        # Handle cases where the first part has multiple words (common prefix)
        # Example: "automotive technicians and repairers"
        first_words = first_part.split()
        second_words = second_part.split()
        
        # If second part is single word, attach prefix from first part
        if len(second_words) == 1:
            # keep prefix except last word of first part
            prefix = " ".join(first_words[:-1])
            second_job = (prefix + " " + second_words[0]).strip()
            return [first_part, second_job]
        else:
            return [first_part, second_part]
    
    # If no "and", just return the text as a single-item list
    return [text]

all_rows = []

for occ in occupations['Occupation']:
    splits = split_and_jobs(occ)
    all_rows.extend(splits)  # add all split occupations

# Create a new dataframe with each occupation in its own row
occupations_expanded = pd.DataFrame(all_rows, columns=['Occupation'])

# Reset index
occupations_expanded = occupations_expanded.reset_index(drop=True)

#locate all ands in occupations_expanded
print(occupations_expanded[occupations_expanded['Occupation'].str.contains(' and ')])

#drop all ands in occupations_expanded
occupations_expanded = occupations_expanded[~occupations_expanded['Occupation'].str.contains(' and ')]
print(occupations_expanded)

#more cleaning: 
# See point 2 above
def filter_complicated_titles(df):

    pattern = r'.+\sSee\s+"[^"]+"'  # any text followed by 'See "..."'
    mask = df['Occupation'].str.contains(pattern, na=False, case=False, regex=True)
    filtered_df = df[~mask].reset_index(drop=True)
    return filtered_df

# See point 4 above
def extract_bracketed(text):

    # Extract bracketed text
    match = re.search(r"\[([^\]]+)\]", text) # Try to match square brackets first
    if not match:
        match = re.search(r"\(([^\)]+)\)", text) # If none, try round parentheses
    
    if match:
        alternative_name = match.group(1)
    else:
        alternative_name = None
    # Remove bracketed text from original
    cleaned_text = re.sub(r"\[.*?\]|\(.*?\)", "", text).strip()
    return cleaned_text, alternative_name

# See point 3 above
def simplify_occupation(text):
    text = str(text).lower()  # lowercase
    # Patterns to cut off extra explanations
    cut_patterns = [
        r"\\.*",            # everything after backslash
        r"see.*",           # everything after 'see'
        r"code by.*",       # everything after 'code by'
        r"specified.*",     # everything after 'specified'
        r"as ns.*",         # everything after 'as ns'
        r"any other.*",     # everything after 'any other'
        r"\/.*"              # everything after forward slash
    ]
    
    for pattern in cut_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces
    return text.strip()

# Apply filtering
occupations_expanded = filter_complicated_titles(occupations_expanded)
# Apply extraction of bracketed text
alt_names = []
for occ in occupations_expanded['Occupation']:
    cleaned, alt = extract_bracketed(occ)
    occupations_expanded.loc[occupations_expanded['Occupation'] == occ, 'Occupation'] = cleaned
    alt_names.append(alt)
occupations_expanded['Alternative_Name'] = alt_names
# Apply simplification
occupations_expanded['Occupation'] = occupations_expanded['Occupation'].apply(simplify_occupation)

# Remove duplicates again
occupations_expanded = occupations_expanded.drop_duplicates(subset=['Occupation'], ignore_index=True)
print(occupations_expanded)

Empty DataFrame
Columns: [Occupation]
Index: []
            Occupation
0            21 dealer
1          3d animator
2         911 operator
3      a mill operator
4           a operator
...                ...
29346     zoo director
29347       zoo keeper
29348          zoogler
29349        zoologist
29350    zumba teacher

[29351 rows x 1 columns]
            Occupation Alternative_Name
0            21 dealer             None
1          3d animator             None
2         911 operator             None
3      a mill operator             None
4           a operator             None
...                ...              ...
28791     zoo director             None
28792       zoo keeper             None
28793          zoogler             None
28794        zoologist             None
28795    zumba teacher             None

[28796 rows x 2 columns]


In [63]:

#I want the alternative names to be added to the main list of occupations
alt_occupations = occupations_expanded['Alternative_Name'].dropna().reset_index(drop=True)
alt_occupations_df = pd.DataFrame(alt_occupations, columns=['Occupation'])
final = pd.concat([occupations_expanded[['Occupation']], alt_occupations_df], ignore_index=True)
#remove NaN
final = final.dropna(subset=['Occupation']).reset_index(drop=True)

#there is an element of the form " "" " where occupation is just quotes - remove it
# Remove empty or whitespace-only entries
final = final[final['Occupation'].str.strip().astype(bool)].reset_index(drop=True)
final = final[final['Occupation'].notna()]
final = final[final['Occupation'].str.strip().astype(bool)]
#save final occupations
final.to_csv("census_occupations_cleaned.csv", index=False)


In [64]:
#most common words in US census dataset
final = pd.read_csv("census_occupations_cleaned.csv")
from collections import Counter
all_words_census = ' '.join(final['Occupation'].astype(str)).split()
word_counts_census = Counter(all_words_census)
most_common_words_census = word_counts_census.most_common(4000)
print(most_common_words_census)
np.savetxt("most_common_words_census.csv", most_common_words_census, delimiter=",", fmt='%s')
#maybe could be used to expand, but too many words I think. Maybe another time.

[('operator', 3254), ('machine', 1520), ('supervisor', 1227), ('worker', 670), ('clerk', 612), ('mechanic', 580), ('teacher', 579), ('maker', 555), ('helper', 551), ('manager', 489), ('technician', 487), ('engineer', 458), ('inspector', 454), ('tender', 408), ('sales', 403), ('cutter', 324), ('installer', 306), ('attendant', 297), ('or', 276), ('hand', 268), ('repairer', 268), ('driver', 268), ('director', 259), ('service', 253), ('tester', 233), ('exc', 232), ('equipment', 229), ('assistant', 228), ('specialist', 228), ('apprentice', 221), ('cleaner', 219), ('assembler', 212), ('car', 208), ('setter', 201), ('press', 199), ('man', 188), ('agent', 180), ('of', 178), ('officer', 173), ('metal', 173), ('analyst', 171), ('aide', 166), ('room', 160), ('control', 159), ('plant', 156), ('grinder', 143), ('health', 138), ('mixer', 136), ('checker', 136), ('builder', 134), ('feeder', 133), ('truck', 132), ('computer', 132), ('maintenance', 131), ('counselor', 128), ('finisher', 127), ('up', 12

In [65]:
#combine the four lists made above into one final list
final_occupations = list(set(occupations_eu + occupations_kaggle + occupations_us + occupations_1))

#drop duplicates
final_occupations = list(set(final_occupations))

#load the cleaned data
occupations1 = pd.read_csv("occupations1.csv")
eu_occ = pd.read_csv("eu_occupations_cleaned.csv")
kaggle_occ = pd.read_csv("kaggle_occupations_cleaned.csv")
us_bls_occ = pd.read_csv("us_bls_occupations_cleaned.csv")
us_census_occ = pd.read_csv("census_occupations_cleaned.csv")

#combine all cleaned data + final_occupations into one dataframe
combined_occupations = pd.DataFrame({'Occupation': final_occupations})
combined_occupations = pd.concat([combined_occupations, eu_occ, kaggle_occ, us_bls_occ, us_census_occ, occupations1], ignore_index=True)
#drop duplicates again
combined_occupations = combined_occupations.drop_duplicates(subset=['Occupation'], ignore_index=True).dropna(subset = ['Occupation']).reset_index(drop=True)
print(len(combined_occupations))

#save the final combined occupations
combined_occupations.to_csv("final_combined_occupations.csv", index=False)


33717


In [74]:
import pandas as pd
import inflect
import ast

df = pd.read_csv("final_combined_occupations.csv")
p = inflect.engine()

def normalise_occupation(row):
    occ = row['Occupation']
    if not isinstance(occ, str) or not occ.strip():
        return row  # skip blanks

    # Ensure Synonyms is a list
    if not isinstance(row['Synonyms'], list):
        row['Synonyms'] = [row['Synonyms']] if pd.notna(row['Synonyms']) else []

    words = occ.split()
    last = words[-1]
    singular_last = p.singular_noun(last) or last
    plural_last = p.plural(singular_last)

    singular_phrase = ' '.join(words[:-1] + [singular_last])
    plural_phrase = ' '.join(words[:-1] + [plural_last])

    row['Occupation'] = singular_phrase.lower().strip()

    plural_phrase = plural_phrase.lower().strip()
    if plural_phrase not in row['Synonyms']:
        row['Synonyms'].append(plural_phrase)

    return row


# Initialize Synonyms column
df['Synonyms'] = df['Occupation'].apply(lambda x: [x.lower().strip()] if isinstance(x, str) else [])
# Apply normalization
df = df.apply(normalise_occupation, axis=1)

# List of occupations to remove
to_remove = [
    'brother', 'sister', 'processor', 'gutter', 'sewer', 'sight', 'lot', 'ga', 'private','nipper', 'gas', 
    'office', 'performance', 'web', 'wash', 'grip', 'gem', 'smasher', 'family', 'star sewer', 'trailer', 
    'wall', 'drywall', 'second', 'soil', 'plant', 'watch', 'bill', 'billing', 'audio', 'spotter', 'helper', 
    'footer', 'infantry','command', 'hiker', 'drive', 'set', 'server network', 'network', 'urban', 'ceiling', 
    'training', 'cut', 'community', 'social', 'drawing', 'tie', 'worker', 'hand', 'travel', 'computer', 'bridge', 'fire',
    'closer', 'subway', 'camera', 'financial', 'chair', 'ordinary', 'monitor', 'marriage', 'medium', 'potato', 
    'video', 'carry', 'fish', 'loan', 'guide', 'philosophy', 'prior', 'clock', 'heavy', 'fabric', 'reader', 'planner', 'keeper',
    'rover', 'rubber', 'counter', 'river', 'medical', 'killer', 'farm', 'porter', 'compensation', 'tool', 'mate', 'passer',
    'commercial', 'whip', 'rougher', 'entertainment', 'cotton', 'smoker', 'recreation', 'tile', 'extra', 'insurance', 'sale',
    'extra', 'master', 'feed', 'plastic', 'engine', 'refuse', 'page', 'breakfast', 'fishing', 'forest', 'signal', 'general', 'information',
    'comic'
    ]
# Remove rows with occupations in the to_remove list
df = df[~df['Occupation'].isin(to_remove)].reset_index(drop=True)

#drop duplicates based on Occupation
df = df.drop_duplicates(subset=['Occupation'], ignore_index=True)

#add "sales" as a job synonym to "salesperson"
sales_idx = df[df['Occupation'] == 'salesperson'].index
if not sales_idx.empty:
    idx = sales_idx[0]
    if 'sales' not in df.at[idx, 'Synonyms']:
        df.at[idx, 'Synonyms'].append('sales')
#save the data
df.to_csv("final_combined_occupations.csv", index=False)

In [None]:
#expand using synonyms from wordnet
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word, pos=wn.NOUN):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    #synonims as list
    return list(synonyms)

# take the dataset and make a new column with synonyms
combined_occupations['Synonyms'] = combined_occupations['Occupation'].apply(get_synonyms)

#print(combined_occupations)

#save the data
combined_occupations.to_csv("final_combined_occupations_with_synonyms.csv", index=False)

In [None]:
import pandas as pd
import inflect
import ast

p = inflect.engine()

def normalise_occupation(row):
    occ = row['Occupation']
    if not isinstance(occ, str) or not occ.strip():
        return row  # skip blanks

    words = occ.split()

    # Handle last word (the noun)
    last = words[-1]
    singular_last = p.singular_noun(last) or last
    plural_last = p.plural(singular_last)

    # Construct phrases
    singular_phrase = ' '.join(words[:-1] + [singular_last])
    plural_phrase = ' '.join(words[:-1] + [plural_last])

    # Update occupation → singular
    row['Occupation'] = singular_phrase.lower().strip()

    # Add plural to synonyms if not already there
    if plural_phrase.lower().strip() not in row['Synonyms']:
        row['Synonyms'].append(plural_phrase.lower().strip())

    return row

df = pd.read_csv("final_combined_occupations_with_synonyms.csv")

# Convert Synonyms column from string to list
df['Synonyms'] = df['Synonyms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df = df.apply(normalise_occupation, axis=1)

# Example check
print(df[df['Occupation'].str.contains('judge', case=False)])


                                      Occupation  Job Zone Code  \
77                                         judge       NaN  NaN   
396                                        judge       NaN  NaN   
3254                         supreme court judge       NaN  NaN   
3689   administrative law judges hearing officer       NaN  NaN   
4422                      judges judicial worker       NaN  NaN   
4423                     judges magistrate judge       NaN  NaN   
4424                       judges related worker       NaN  NaN   
4495                            magistrate judge       NaN  NaN   
9651                         circuit court judge       NaN  NaN   
10971                               county judge       NaN  NaN   
11120                             criminal judge       NaN  NaN   
12158                             dog show judge       NaN  NaN   
12734                             election judge       NaN  NaN   
14656                 foul judge sports activity       NaN  Na

In [None]:
#remove duplicates again
df = df.drop_duplicates(subset=['Occupation'], ignore_index=True)

In [None]:
def normalize_synonyms(synonyms):
    normalised = set()
    
    # Flatten if nested
    if isinstance(synonyms, (list, set, tuple)):
        flat_synonyms = []
        for s in synonyms:
            if isinstance(s, (list, set, tuple)):
                flat_synonyms.extend(s)
            else:
                flat_synonyms.append(s)
    else:
        flat_synonyms = [synonyms]
    
    for syn in flat_synonyms:
        if not isinstance(syn, str) or not syn.strip():
            continue
        
        words = syn.strip().split()
        if not words:
            continue

        # Singularize and pluralize only the last word
        last_word = words[-1]
        singular_last = p.singular_noun(last_word) or last_word
        plural_last = p.plural(singular_last)

        singular_phrase = ' '.join(words[:-1] + [singular_last])
        plural_phrase = ' '.join(words[:-1] + [plural_last])

        normalised.add(singular_phrase.lower())
        normalised.add(plural_phrase.lower())

    return sorted(normalised)

# Apply to your DataFrame
df['Synonyms'] = df['Synonyms'].apply(normalize_synonyms)

df.to_csv("final_combined_occupations_with_synonyms.csv", index=False)

In [None]:
#find "judge"
print(df[df['Occupation'].str.contains('judge')])

                                      Occupation  Job Zone Code  \
73                                         judge       NaN  NaN   
3058                         supreme court judge       NaN  NaN   
3486   administrative law judges hearing officer       NaN  NaN   
4128                      judges judicial worker       NaN  NaN   
4129                     judges magistrate judge       NaN  NaN   
4130                       judges related worker       NaN  NaN   
4197                            magistrate judge       NaN  NaN   
9246                         circuit court judge       NaN  NaN   
10539                               county judge       NaN  NaN   
10682                             criminal judge       NaN  NaN   
11710                             dog show judge       NaN  NaN   
12281                             election judge       NaN  NaN   
14166                 foul judge sports activity       NaN  NaN   
17715                               judges clerk       NaN  Na

In [None]:
#remove "brother" from occupation
# List of occupations to remove
to_remove = [
    'brother', 'sister', 'processor', 'gutter', 'sewer', 'sight', 'lot', 'ga', 'private','nipper', 'gas', 
    'office', 'performance', 'web', 'wash', 'grip', 'gem', 'smasher', 'family', 'star sewer', 'trailer', 
    'wall', 'drywall', 'second', 'soil', 'plant', 'watch', 'bill', 'billing', 'audio', 'spotter', 'helper', 'footer', 'infantry'
    'command', 'hiker', 'drive', 'set', 'server network', 'network',
]

# Remove all in one go
df = df[~df['Occupation'].isin(to_remove)].reset_index(drop=True)

#turn synonyms into a list of strings
import ast

#convert "assemblyman or woman" to "assemblyman" and "assemblywoman" in separate rows
df['Occupation'] = df['Occupation'].apply(
    lambda x: ['assemblyman', 'assemblywoman'] if x == 'assemblyman or woman' else [x]
)
df = df.explode('Occupation').reset_index(drop=True)

df.loc[df['Occupation'] == 'assemblyman', 'Synonyms'] = "['assemblyman', 'assemblymen']"
df.loc[df['Occupation'] == 'assemblywoman', 'Synonyms'] = "['assemblywoman', 'assemblywomen']"


# Convert each string to a Python list
df['Synonyms'] = df['Synonyms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

#if something starts with "and" remove "and " and keep the rest
def remove_and_beginning(occupation):
    if occupation.lower().startswith('and '):
        return occupation[4:].strip()
    return occupation
df['Occupation'] = df['Occupation'].apply(remove_and_beginning)

#charwoman has as synonym "woman", "women", "char" and "chars" - remove them
df.loc[df['Occupation'] == 'charwoman', 'Synonyms'] = df.loc[df['Occupation'] == 'charwoman', 'Synonyms'].apply(
    lambda syns: [s for s in syns if s not in ['woman', 'women', 'char', 'chars']]
)

#valet has "man", "men", gentleman", "gentlemen" as synonyms - remove them
df.loc[df['Occupation'] == 'valet', 'Synonyms'] = df.loc[df['Occupation'] == 'valet', 'Synonyms'].apply(
    lambda syns: [s for s in syns if s not in ['man', 'men', 'gentleman', 'gentlemen', 'gentleman\'s gentleman', 'gentleman\'s gentlemans', 'gentlemans']]
)
#correct u s to us in synonyms. it's in the synonyms of "u s postal inspection officer"
# Locate the row by occupation
df[df['Occupation'] == 'u s postal inspection officer'] = 'us postal inspection officer'
print(df[df['Occupation'] == 'us postal inspection officer'])
df.loc[df['Occupation'] == 'us postal inspection officer', 'Synonyms'] = df.loc[df['Occupation'] == 'us postal inspection officer', 'Synonyms'].apply(
    lambda syns: [s.replace('u s', 'us') for s in syns])

                         Occupation                      Job Zone  \
31276  us postal inspection officer  us postal inspection officer   

                               Code  \
31276  us postal inspection officer   

                                                Synonyms  
31276  [u, s,  , p, o, s, t, a, l,  , i, n, s, p, e, ...  


In [None]:
# Convert each string to a Python list
df['Synonyms'] = df['Synonyms'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df.to_csv("final_combined_occupations_with_synonyms.csv", index=False)