In [17]:
import os, zipfile
import pandas as pd

# === Paths (adjust only if needed) ===
# The notebook is run from notebooks/, so one level up is the project
# root (esco-project/). Note this is resolved against the current
# working directory at execution time.
BASE_DIR = os.path.abspath(os.pardir)
DATA_DIR, OUT_DIR = (os.path.join(BASE_DIR, folder) for folder in ("data", "output"))

# Create the input and output folders up front so later steps can rely on them.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

# The downloaded ESCO archive and the folder its CSV files are unpacked into.
ZIP_PATH, EXTRACT_DIR = (
    os.path.join(DATA_DIR, entry)
    for entry in ("ESCO dataset - v1.2.1 - classification - en - csv.zip", "esco_csv")
)

# === 1) Unzip ===
# Fail early with an actionable message when the archive has not been
# downloaded yet. zipfile would raise the same exception type, but with
# only the bare errno text.
if not os.path.exists(ZIP_PATH):
    raise FileNotFoundError(
        f"ESCO archive not found at {ZIP_PATH!r}. "
        "Download the ESCO v1.2.1 classification (en, csv) zip and place it there."
    )

os.makedirs(EXTRACT_DIR, exist_ok=True)

# Extraction overwrites any previously extracted files, so re-running the
# cell simply refreshes the folder contents.
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(EXTRACT_DIR)

print("Unzipped into:", EXTRACT_DIR)

# Preview at most 20 entries, and only print the trailing "..." when the
# listing was actually cut short (it used to be printed unconditionally,
# suggesting hidden files even when everything was shown).
extracted_files = sorted(os.listdir(EXTRACT_DIR))
if len(extracted_files) > 20:
    print("Files:", extracted_files[:20], "...")
else:
    print("Files:", extracted_files)

# === 2) Load the ESCO CSV files ===
# Locations of the three source tables inside the extracted archive.
occ_path, skill_path, rel_path = (
    os.path.join(EXTRACT_DIR, csv_name)
    for csv_name in ("occupations_en.csv", "skills_en.csv", "occupationSkillRelations_en.csv")
)

# Read each table in full; column selection happens in the next step.
occupations = pd.read_csv(occ_path)
skills = pd.read_csv(skill_path)
occ_skill = pd.read_csv(rel_path)

# Report the (rows, columns) of every raw table as a quick sanity check.
for table_label, table in (
    ("occupations:", occupations),
    ("skills:", skills),
    ("occ-skill:", occ_skill),
):
    print(table_label, table.shape)

# === 3) Create simplified outputs (same layout as your existing files) ===
# Each mapping lists the ESCO source columns to keep, in output order,
# together with the name each column gets in the simplified table.
OCCUPATION_COLUMNS = {
    "conceptUri": "occupation_id",
    "preferredLabel": "occupation_label",
}
SKILL_COLUMNS = {
    "conceptUri": "skill_id",
    "preferredLabel": "skill_label",
    "skillType": "skill_type",
}
RELATION_COLUMNS = {
    "occupationUri": "occupation_id",
    "skillUri": "skill_id",
    "relationType": "relation",
}

# occupations.csv -> occupation_id, occupation_label
occupations_out = occupations[list(OCCUPATION_COLUMNS)].rename(columns=OCCUPATION_COLUMNS)

# skills.csv -> skill_id, skill_label, skill_type
skills_out = skills[list(SKILL_COLUMNS)].rename(columns=SKILL_COLUMNS)

# occupation_skill.csv -> occupation_id, skill_id, relation
occ_skill_out = occ_skill[list(RELATION_COLUMNS)].rename(columns=RELATION_COLUMNS)

# === 4) Save ===
# Pair every simplified table with its destination once, so the write step
# and the report below refer to exactly the same paths.
outputs = [
    (os.path.join(OUT_DIR, "occupations.csv"), occupations_out),
    (os.path.join(OUT_DIR, "skills.csv"), skills_out),
    (os.path.join(OUT_DIR, "occupation_skill.csv"), occ_skill_out),
]

# index=False keeps the pandas row index out of the written files.
for out_path, out_frame in outputs:
    out_frame.to_csv(out_path, index=False)

# Summarise what was written: destination path and (rows, columns).
print("✅ Wrote:")
for out_path, out_frame in outputs:
    print(" -", out_path, out_frame.shape)

Unzipped into: /Users/darioonsori/esco-project/data/esco_csv
Files: ['ISCOGroups_en.csv', 'broaderRelationsOccPillar_en.csv', 'broaderRelationsSkillPillar_en.csv', 'conceptSchemes_en.csv', 'dictionary_en.csv', 'digCompSkillsCollection_en.csv', 'digitalSkillsCollection_en.csv', 'greenShareOcc_en.csv', 'greenSkillsCollection_en.csv', 'languageSkillsCollection_en.csv', 'occupationSkillRelations_en.csv', 'occupations_en.csv', 'researchOccupationsCollection_en.csv', 'researchSkillsCollection_en.csv', 'skillGroups_en.csv', 'skillSkillRelations_en.csv', 'skillsHierarchy_en.csv', 'skills_en.csv', 'transversalSkillsCollection_en.csv'] ...
occupations: (3043, 15)
skills: (13960, 13)
occ-skill: (126051, 6)
✅ Wrote:
 - /Users/darioonsori/esco-project/output/occupations.csv (3043, 2)
 - /Users/darioonsori/esco-project/output/skills.csv (13960, 3)
 - /Users/darioonsori/esco-project/output/occupation_skill.csv (126051, 3)


In [18]:
# Sanity check: read the exported occupations file back from disk and
# display its first rows.
exported_occupations_path = os.path.join(OUT_DIR, "occupations.csv")
pd.read_csv(exported_occupations_path).head()

Unnamed: 0,occupation_id,occupation_label
0,http://data.europa.eu/esco/occupation/00030d09...,technical director
1,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator
2,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector
3,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician
4,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager
