In [1]:
# ================================================
# 🎯 DATA EXPLORATION NOTEBOOK - Skills4Cpp
# ================================================
# This notebook helps you understand the data structure and functions
# without loading the full (huge) datasets

from data import Data
from utils import SEP_TOKEN
import re
import pandas as pd
from typing import List, Tuple

# Greet the reader and show which separator token was imported from utils.
for welcome_line in (
    "🚀 Welcome to the Data Exploration Guide!",
    f"📁 Current separator token: '{SEP_TOKEN}'",
    "💡 This notebook will help you understand the data structure step by step",
):
    print(welcome_line)

  from .autonotebook import tqdm as notebook_tqdm


🚀 Welcome to the Data Exploration Guide!
📁 Current separator token: '<SEP>'
💡 This notebook will help you understand the data structure step by step


In [None]:
# ================================================
# 📊 1. UNDERSTANDING THE DATA STRUCTURE
# ================================================
print("📊 1. UNDERSTANDING THE DATA STRUCTURE", "=" * 50, sep="\n")

# A hand-written example of one data pair:
#   sample_doc1 - a career history whose jobs are joined by the literal "<SEP>" marker
#   sample_doc2 - the ESCO occupation paired with that history
sample_doc1 = "role: Software Engineer \n description: Develop software<SEP>role: Data Scientist \n description: Analyze data"
sample_doc2 = "esco role: Software Developer \n description: Creates software applications"

# Print the pair followed by a short explanation; "" entries produce the blank lines.
for line in (
    "🔍 A typical data pair:",
    f"📄 Document 1 (career history): {sample_doc1}",
    f"📄 Document 2 (target ESCO occupation): {sample_doc2}",
    "",
    "🔍 Breaking it down:",
    "   • Document 1: Your career journey (multiple jobs separated by <SEP>)",
    "   • Document 2: The target occupation you might transition to",
    "   • Goal: Learn patterns like 'Software Engineer → Data Scientist → Software Developer'",
):
    print(line)


In [None]:
# ================================================
# 🎭 2. EXPLORING UTILITY FUNCTIONS
# ================================================
print("🎭 2. EXPLORING UTILITY FUNCTIONS", "=" * 50, sep="\n")

# Three hand-written (career history, ESCO target) pairs used to demonstrate the helpers.
sample_pairs = [
    (
        "role: Software Engineer \n description: Develop software<SEP>role: Data Scientist \n description: Analyze data",
        "esco role: Software Developer \n description: Creates software",
    ),
    (
        "role: Teacher \n description: Teach students<SEP>role: Professor \n description: Research and teach",
        "esco role: Lecturer \n description: Delivers lectures",
    ),
    (
        "role: Nurse \n description: Patient care",
        "esco role: Healthcare Assistant \n description: Provides care",
    ),
]

print("🔍 Sample dataset with 3 career examples:")
for i, pair in enumerate(sample_pairs, start=1):
    doc1, doc2 = pair
    # One call per example: heading, input, target, then a trailing blank line.
    print(f"\n{i}. Career Path {i}:", f"   Input:  {doc1}", f"   Target: {doc2}", "", sep="\n")

# Title extraction: capture the text between "role: " and the next newline.
print("📝 DEMONSTRATING TITLE EXTRACTION:")
print("The _extract_titles function pulls out just the job titles:")
role_pattern = re.compile(r"role: (.*?)\n")
esco_role_pattern = re.compile(r"esco role: (.*?)\n")
for i, pair in enumerate(sample_pairs, start=1):
    doc1, doc2 = pair
    roles = role_pattern.findall(doc1)
    # The target document holds a single ESCO role, so take the first match.
    esco_role = esco_role_pattern.findall(doc2)[0]
    print(f"{i}. {' → '.join(roles)} → {esco_role}")

print("\n" + "=" * 50)


In [None]:
# ================================================
# 🎯 3. UNDERSTANDING THE MINUS_LAST FUNCTION
# ================================================
print("🎯 3. UNDERSTANDING THE MINUS_LAST FUNCTION", "=" * 50, sep="\n")

# Explanation of the idea; the "" entry produces the blank line.
for line in (
    "🔍 What does 'minus_last' do?",
    "   • Takes your career history and removes the LAST job",
    "   • Purpose: Create training examples for 'career transition prediction'",
    "   • Example: 'Job1 → Job2 → Job3' becomes 'Job1 → Job2' (predict Job3)",
    "",
):
    print(line)

# A three-job history and its target, used for the demonstration below.
sample_doc1 = "role: Software Engineer \n description: Develop software<SEP>role: Data Scientist \n description: Analyze data<SEP>role: ML Engineer \n description: Build models"
sample_doc2 = "esco role: AI Engineer \n description: Develops AI systems"

print(f"📄 Original career path: {sample_doc1}")
print(f"🎯 Target prediction: {sample_doc2}")
print()

# Split the history on the separator token and list the individual jobs.
segments = sample_doc1.split(SEP_TOKEN)
print("🔍 Breaking into segments:")
for i, segment in enumerate(segments, start=1):
    print(f"   {i}. {segment}")

# Re-join everything except the final segment.
print(f"\n✂️  After minus_last: {SEP_TOKEN.join(segments[:-1])}")
# NOTE(review): the message below names the removed job ("ML Engineer"), while the target
# printed above is sample_doc2 ("AI Engineer") — confirm which one the text should name.
print("🎯 Still predict: ML Engineer")
print("💡 Idea: 'Learn from incomplete career paths to predict the next step'")

print("\n" + "=" * 50)


In [None]:
# ================================================
# 🗂️  4. UNDERSTANDING DATASET TYPES
# ================================================
# Static overview text, printed line by line; "" entries produce the blank lines.
for line in (
    "🗂️  4. UNDERSTANDING DATASET TYPES",
    "=" * 50,
    "🔍 Available Dataset Types:",
    "   1. 'decorte' - Anonymous working histories with ESCO annotations",
    "   2. 'decorte_esco' - Same as decorte but all experiences in ESCO format",
    "   3. 'karrierewege' - Career paths from German job market",
    "   4. 'karrierewege_occ' - Karrierewege with occupational info",
    "   5. 'karrierewege_100k' - Larger Karrierewege dataset",
    "   6. 'karrierewege_cp' - Karrierewege with career path data",
    "",
    "🔍 Dataset Characteristics:",
    "   • decorte: Real career histories with standardized ESCO occupations",
    "   • karrierewege: German career trajectories with language variants",
    "   • _100k: Larger dataset (100k+ examples)",
    "   • _cp: Career path specific processing",
    "   • _free: Uses free text instead of ESCO format",
    "",
    "💡 Key Differences:",
    "   • ESCO format: Standardized job titles with descriptions",
    "   • Free text: Raw job titles and descriptions",
    "   • Size: Some datasets have 100k+ examples (hence the loading time!)",
    "",
    "🧠 Memory Strategy:",
    "   • Load only what you need: Use smaller datasets for exploration",
    "   • Work with subsets: Take first N examples for testing",
    "   • Use language variants: 'en_free' datasets are often smaller",
):
    print(line)


In [None]:
# ================================================
# 🧪 5. WORKING WITH SMALL SUBSETS (Smart Testing)
# ================================================
# Motivation text; "" entries produce the blank lines.
for line in (
    "🧪 5. WORKING WITH SMALL SUBSETS (Smart Testing)",
    "=" * 50,
    "🔍 Why work with subsets?",
    "   • Full datasets are HUGE (100k+ examples)",
    "   • Takes minutes to load, uses lots of memory",
    "   • For development/testing: Small samples are perfect!",
    "",
    "💡 Strategy 1: Create your own small dataset",
):
    print(line)

# A three-pair hand-written dataset for quick experiments.
tiny_pairs = [
    (
        "role: Junior Developer \n description: Entry level coding",
        "esco role: Software Developer \n description: Creates software",
    ),
    (
        "role: Data Analyst \n description: Work with data",
        "esco role: Data Scientist \n description: Advanced data analysis",
    ),
    (
        "role: Teacher \n description: Teach students<SEP>role: Principal \n description: School management",
        "esco role: Education Manager \n description: Manages educational programs",
    ),
]

print("📊 Our tiny test dataset:")
for i, pair in enumerate(tiny_pairs, start=1):
    career, target = pair
    # Career line, target line, then a blank line.
    print(f"{i}. {career}", f"   → {target}", "", sep="\n")

# Remaining advice text.
for line in (
    "💡 Strategy 2: Load just a few examples",
    "   • Stop loading after N examples",
    "   • Use sampling techniques",
    "   • Perfect for understanding structure without waiting!",
    "",
    "🔍 Benefits of small datasets:",
    "   • Fast loading (seconds vs minutes)",
    "   • Easy to inspect and understand",
    "   • Perfect for debugging functions",
    "   • Can manually verify correctness",
    "",
    "🧠 Pro Tip:",
    "   • Start with 5-10 examples to understand patterns",
    "   • Test your functions on this small set first",
    "   • Then gradually increase size for performance testing",
):
    print(line)


In [None]:
# ================================================
# 🎪 6. UNDERSTANDING STAGES & DATA PROCESSING
# ================================================
# Static description of the stages; "" entries produce the blank lines.
for line in (
    "🎪 6. UNDERSTANDING STAGES & DATA PROCESSING",
    "=" * 50,
    "🔍 Available stages in get_data():",
    "   1. 'embedding_finetuning' - Full career paths for representation learning",
    "   2. 'transformation_finetuning' - Career paths with last job removed",
    "   3. 'evaluation' - Same as transformation_finetuning",
    "",
    "🎯 What each stage does:",
    "   • embedding_finetuning: Learn general career patterns",
    "     Input:  'Job1 → Job2 → Job3' → Target: 'Job4'",
    "     Use:   Training embeddings, learning career trajectories",
    "",
    "   • transformation_finetuning: Learn transition patterns",
    "     Input:  'Job1 → Job2' → Target: 'Job3'",
    "     Use:   Predicting next career steps, career counseling",
    "",
    "🔍 ONLY_TITLES parameter:",
    "   • ONLY_TITLES=False: Full descriptions (rich information)",
    "   • ONLY_TITLES=True: Just job titles (simpler, faster)",
    "   • Choose based on your model needs!",
    "",
    "💡 Testing stages with our tiny dataset:",
):
    print(line)

# A three-job history used to show what each stage feeds to the model.
career_path = "role: Developer \n description: Code<SEP>role: Senior Dev \n description: Lead teams<SEP>role: Architect \n description: Design systems"

print(f"Original: {career_path}")
print("Target:   esco role: Tech Lead \n description: Leads technical teams")

# embedding_finetuning keeps every segment; transformation_finetuning drops the last one.
segments = career_path.split(SEP_TOKEN)
print(f"\n📊 embedding_finetuning: {SEP_TOKEN.join(segments)}")
print(f"🎯 transformation_finetuning: {SEP_TOKEN.join(segments[:-1])}")
# NOTE(review): the message below names "Architect" (the last job), while the target
# printed above is "Tech Lead" — confirm which one the text should name.
print("💡 Both predict: Architect")

print("\n" + "=" * 50)


In [None]:
# ================================================
# 🚀 7. PRACTICAL NEXT STEPS
# ================================================
# Static how-to text (the indented snippets are printed, not executed);
# "" entries produce the blank lines.
for line in (
    "🚀 7. PRACTICAL NEXT STEPS",
    "=" * 50,
    "💡 Now that you understand the structure, here's how to proceed:",
    "",
    "🎯 Step 1: Start with a small, manageable dataset",
    "   data = Data('karrierewege_occ', ONLY_TITLES=True)  # Often smaller",
    "   # or",
    "   data = Data('decorte', ONLY_TITLES=True)  # Good starting point",
    "",
    "🎯 Step 2: Get a subset of data for exploration",
    "   train, val, test = data.get_data('embedding_finetuning')",
    "   # Work with first 100 examples",
    "   small_train = train[:100]",
    "   small_val = val[:10]",
    "   small_test = test[:10]",
    "",
    "🎯 Step 3: Explore the data structure",
    "   print('First training example:')",
    "   print(small_train[0])",
    "   print('\\nLabels in dataset:', len(data.labels))",
    "   print('First 5 labels:', data.labels[:5])",
    "",
    "🎯 Step 4: Test different configurations",
    "   # Try different stages",
    "   train_trans, val_trans, test_trans = data.get_data('transformation_finetuning')",
    "   ",
    "   # Try with full descriptions",
    "   data_full = Data('decorte', ONLY_TITLES=False)",
    "   train_full, _, _ = data_full.get_data('embedding_finetuning')",
    "",
    "🎯 Step 5: Scale up gradually",
    "   # Start with 100, then 1000, then full dataset",
    "   # Monitor memory usage and loading time",
    "",
    "🔥 Pro Tips:",
    "   • Use ONLY_TITLES=True for faster loading and simpler models",
    "   • Start with 'karrierewege_occ' - often smaller than others",
    "   • Use subsets during development, full datasets for final training",
    "   • Monitor memory: Large datasets can use 10GB+ RAM",
    "",
    "🎉 You're ready to explore! The key is starting small and scaling up.",
    "💡 Remember: Understanding 100 examples well is better than loading 100k poorly!",
    "\n" + "=" * 50,
    "🏆 CONGRATULATIONS! You now understand:",
    "   ✅ Data structure (career paths → target occupations)",
    "   ✅ Utility functions (minus_last, extract_titles)",
    "   ✅ Dataset types and their differences",
    "   ✅ Processing stages and their purposes",
    "   ✅ Smart strategies for working with large datasets",
    "=" * 50,
):
    print(line)


In [None]:
# ================================================
# 🧪 PRACTICAL EXAMPLE: Working with Small Data
# ================================================
# Section banner: title line followed by a 50-character rule.
print("🧪 PRACTICAL EXAMPLE: Working with Small Data", "=" * 50, sep="\n")

# Create a mock version of the loading functions to avoid the huge datasets
def create_small_test_dataset():
    """Build the five hard-coded example pairs used by the mock data class.

    Returns:
        A new list of ``(career_history, target_occupation)`` string tuples.
        Each career history holds two jobs joined by the literal ``<SEP>`` marker.
    """
    career_histories = (
        "role: Junior Developer \n description: Entry level coding<SEP>role: Software Engineer \n description: Full stack development",
        "role: Data Analyst \n description: Basic data analysis<SEP>role: Business Analyst \n description: Business insights",
        "role: Teacher \n description: Elementary education<SEP>role: Department Head \n description: Curriculum management",
        "role: Nurse \n description: Patient care<SEP>role: Senior Nurse \n description: Team coordination",
        "role: Sales Rep \n description: Direct sales<SEP>role: Sales Manager \n description: Team leadership",
    )
    target_occupations = (
        "esco role: Senior Developer \n description: Leads development projects",
        "esco role: Data Scientist \n description: Advanced analytics and modeling",
        "esco role: Education Manager \n description: Manages educational programs",
        "esco role: Nursing Manager \n description: Manages nursing staff",
        "esco role: Sales Director \n description: Strategic sales planning",
    )
    # Pair the i-th history with the i-th target.
    return list(zip(career_histories, target_occupations))

# Simulate the Data class functionality
class MockData:
    """In-memory stand-in for the project's ``Data`` class, built from the tiny fixture.

    Attributes:
        data_type: Dataset identifier passed by the caller (stored, not otherwise used).
        only_titles: When true, ``get_data`` reduces every document to its role titles.
        train_pairs / val_pairs / test_pairs: Lists of ``(career_history, target)`` tuples;
            validation and test are the first two / first one fixture pairs.
        labels: Sorted list of the distinct target documents across all splits.
    """

    # Stages for which get_data removes the last job from each career history.
    _MINUS_LAST_STAGES = ('transformation_finetuning', 'evaluation')

    def __init__(self, data_type, ONLY_TITLES=False):
        """Populate the three splits from ``create_small_test_dataset``."""
        self.data_type = data_type
        self.only_titles = ONLY_TITLES
        # Build the fixture once; slicing creates independent lists for val/test.
        pairs = create_small_test_dataset()
        self.train_pairs = pairs
        self.val_pairs = pairs[:2]  # Just 2 for validation
        self.test_pairs = pairs[:1]  # Just 1 for testing
        # Sorted so the label order is the same on every run (set order is not).
        self.labels = sorted({pair[1] for pair in self.train_pairs + self.val_pairs + self.test_pairs})

    def get_data(self, stage):
        """Return ``(train, val, test)`` pair lists prepared for ``stage``.

        Args:
            stage: ``'embedding_finetuning'`` returns full histories;
                ``'transformation_finetuning'`` and ``'evaluation'`` return histories
                with the last job removed.

        Returns:
            A 3-tuple of lists of ``(doc1, doc2)`` tuples.

        Raises:
            ValueError: If ``stage`` is not one of the supported names.
        """
        if stage == 'embedding_finetuning':
            drop_last = False
        elif stage in self._MINUS_LAST_STAGES:
            drop_last = True
        else:
            # Previously fell through and returned None, which surfaced as an
            # unhelpful unpacking error at the call site.
            raise ValueError(
                f"Unknown stage {stage!r}; expected 'embedding_finetuning', "
                "'transformation_finetuning' or 'evaluation'"
            )

        splits = [self.train_pairs, self.val_pairs, self.test_pairs]
        # Titles are extracted first, then the last job is dropped.
        if self.only_titles:
            splits = [self._extract_titles(split) for split in splits]
        if drop_last:
            splits = [self._minus_last(split) for split in splits]
        return tuple(splits)

    @staticmethod
    def _minus_last(data_pairs):
        """Remove the last segment from each career history.

        Pairs whose history has only one segment are omitted from the result,
        since nothing would remain after removing it.
        """
        result = []
        for doc1, doc2 in data_pairs:
            segments = doc1.split(SEP_TOKEN)
            if len(segments) > 1:
                result.append((SEP_TOKEN.join(segments[:-1]), doc2))
        return result

    @staticmethod
    def _extract_titles(data_pairs):
        """Reduce each pair to its job titles.

        The history becomes the captured ``role:`` titles joined by ``SEP_TOKEN``;
        the target becomes its first captured ``esco role:`` title. Captures run up
        to the next newline, so any whitespace before it is kept.
        """
        result = []
        for doc1, doc2 in data_pairs:
            roles = re.findall(r"role: (.*?)\n", doc1)
            esco_role = re.findall(r"esco role: (.*?)\n", doc2)[0]
            result.append((SEP_TOKEN.join(roles), esco_role))
        return result

# Try the mock out: build it with full descriptions and report the split sizes.
print("🔍 Creating a small mock dataset (5 examples)...")
mock_data = MockData('decorte', ONLY_TITLES=False)

for line in (
    f"✅ Created dataset with {len(mock_data.train_pairs)} training examples",
    f"📊 Validation set: {len(mock_data.val_pairs)} examples",
    f"🧪 Test set: {len(mock_data.test_pairs)} examples",
    f"🏷️  Unique labels: {len(mock_data.labels)}",
    "",
):
    print(line)

# Full documents, no segment removed.
print("🔍 Let's look at the first example:")
train, val, test = mock_data.get_data('embedding_finetuning')
print(f"Training example 1: {train[0]}", "", sep="\n")

# Same stage, but with documents reduced to their titles.
print("🎯 Now try with ONLY_TITLES=True:")
mock_data_titles = MockData('decorte', ONLY_TITLES=True)
train_titles, val_titles, test_titles = mock_data_titles.get_data('embedding_finetuning')
print(f"Title-only example: {train_titles[0]}", "", sep="\n")

# Stage that drops the last job from every history.
print("✂️  Try transformation_finetuning (removes last job):")
train_trans, _, _ = mock_data.get_data('transformation_finetuning')
print(f"After minus_last: {train_trans[0]}")
print("💡 Notice: 'Software Engineer' was removed, still predict 'Senior Developer'")

for line in (
    "\n🎉 Success! Now you can experiment without loading huge datasets!",
    "💡 Use this approach for quick testing and understanding",
):
    print(line)


In [3]:
# NOTE(review): the recorded run of this cell ended in
#   TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'
# 'karrierewege+' is not among the dataset types listed in section 4 of this notebook —
# confirm the intended identifier against data.py before re-running.
data = Data('karrierewege+', max_rows=1000)
train, val, test = data.get_data('embedding_finetuning')
df_train = pd.DataFrame(train)
df_train.info()
# Print each field of the second training row with its embedded newlines rendered.
# (A bare `df_train.iloc[1]` statement stood here before; mid-cell it displayed nothing.)
print("\n".join(df_train.iloc[1].to_list()))


TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

In [19]:
from data import Data
from utils import SEP_TOKEN
import re
import pandas as pd
from typing import List, Tuple

# Load the DECORTE dataset with max_rows=100 and preview its training split.
# The dataset identifier is case-sensitive, hence the lowercase 'decorte'.
decorte_data = Data('decorte', max_rows=100)
decorte_splits = decorte_data.get_data('embedding_finetuning')
train_decorte, val_decorte, test_decorte = decorte_splits

# One row per training pair; columns 0 and 1 hold the two documents of the pair.
df_decorte_train = pd.DataFrame(data=train_decorte)
df_decorte_train.info()
print(df_decorte_train.head(n=5))


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

100%|██████████| 100/100 [00:00<00:00, 1018.83it/s]
100%|██████████| 100/100 [00:00<00:00, 1409.80it/s]
100%|██████████| 100/100 [00:00<00:00, 1304.38it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       853 non-null    object
 1   1       853 non-null    object
dtypes: object(2)
memory usage: 13.5+ KB
                                                   0  \
0  role: Line Cook \n description: - Prepped food...   
1  role: Line Cook \n description: - Prepared foo...   
2  role: Line Chef \n description: - Prepared foo...   
3  role: Line Cook \n description: - Prepped food...   
4  role: Line Cook \n description: - Prepared foo...   

                                                   1  
0  esco role: cook \n description: Cooks are culi...  
1  esco role: head chef \n description: Head chef...  
2  esco role: head chef \n description: Head chef...  
3  esco role: head chef \n description: Head chef...  
4  esco role: head chef \n description: Head chef...  





In [18]:
from data import Data
from utils import SEP_TOKEN
import re
import pandas as pd
from typing import List, Tuple

# Load the 'karrierewege_plus' dataset with max_rows=100 and preview its training split.
# NOTE(review): the recorded run of this cell ended in
#   TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'
# 'karrierewege_plus' is not among the dataset types listed in section 4 of this
# notebook — confirm the intended identifier against data.py.
kw_data = Data('karrierewege_plus', max_rows=100)
kw_splits = kw_data.get_data('embedding_finetuning')
train_kw, val_kw, test_kw = kw_splits

df_kw_train = pd.DataFrame(data=train_kw)
df_kw_train.info()
print(df_kw_train.head(n=5))


TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

In [5]:
# Copy the DECORTE training frame to the system clipboard.
# NOTE(review): relies on df_decorte_train from the DECORTE loading cell and on a
# clipboard backend being available on the host — may fail on a headless server; confirm.
df_decorte_train.to_clipboard()

In [17]:
import pandas as pd
import os

from pathlib import Path

# Show the kernel's working directory; the data path below is resolved relative to it.
print(os.getcwd())

# Locate the repository's data/ folder from the working directory instead of
# hard-coding a machine-specific absolute home path.
# NOTE(review): assumes the kernel runs in <repo>/notebooks, as in the recorded run — confirm.
DATA_DIR = Path.cwd().parent / "data"

# Read data/talent_clef/TaskA/training/english/taskA_training_en.tsv:
# tab-separated, read without a header row, columns named explicitly.
# NOTE(review): 'jobtitle_1' and 'job_title_2' are spelled inconsistently; left unchanged
# in case other code already uses these column names.
df_talent_clef = pd.read_csv(
    DATA_DIR / "talent_clef" / "TaskA" / "training" / "english" / "taskA_training_en.tsv",
    sep='\t',
    header=None,
    names=['family_id', 'id', 'jobtitle_1', 'job_title_2'],
)
# (A bare `df_talent_clef.head()` stood here before; mid-cell it displayed nothing.)
df_talent_clef.info()

/dss/dsshome1/02/ra95kix2/thesis/skills4cpp/notebooks
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28880 entries, 0 to 28879
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   family_id    28880 non-null  object
 1   id           28880 non-null  object
 2   jobtitle_1   28880 non-null  object
 3   job_title_2  28880 non-null  object
dtypes: object(4)
memory usage: 902.6+ KB


In [10]:
# Preview the first rows of the TalentCLEF frame (rich display as the cell's last expression).
df_talent_clef.head()

Unnamed: 0,family_id,id,jobtitle_1,job_title_2
0,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,air commodore,flight lieutenant
1,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,command and control officer,flight officer
2,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,air commodore,command and control officer
3,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,pilot officer,squadron leader
4,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,royal airforce officer,command and control officer


In [11]:
from pathlib import Path

# Load the English ESCO occupations table from the repository's data/ folder.
# The path is resolved from the working directory instead of a machine-specific
# absolute home path.
# NOTE(review): assumes the kernel runs in <repo>/notebooks — confirm.
df_esco = pd.read_csv(Path.cwd().parent / "data" / "occupations_en.csv")
df_esco.head()

Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,technical and operations director\nhead of tec...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,Technical directors realise the artistic visio...,2654.1.7
1,Occupation,http://data.europa.eu/esco/occupation/000e93a3...,8121,metal drawing machine operator,metal drawing machine technician\nmetal drawin...,,released,2024-01-23T10:09:32.099Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Metal drawing machine operators set up and ope...,8121.4
2,Occupation,http://data.europa.eu/esco/occupation/0019b951...,7543,precision device inspector,inspector of precision instruments\nprecision ...,,released,2024-01-25T15:00:12.188Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,Precision device inspectors make sure precisio...,7543.10.3
3,Occupation,http://data.europa.eu/esco/occupation/0022f466...,3155,air traffic safety technician,air traffic safety electronics hardware specia...,,released,2024-01-29T16:01:13.998Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air traffic safety technicians provide technic...,3155.1
4,Occupation,http://data.europa.eu/esco/occupation/002da35b...,2431,hospitality revenue manager,hospitality revenues manager\nyield manager\nh...,,released,2024-01-11T10:28:45.871Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Hospitality revenue managers maximise revenue ...,2431.9


In [15]:
# Look up the ESCO occupation whose conceptUri equals the `id` (column 1) of the first
# TalentCLEF row. A direct comparison selects the same rows as the former f-string
# query without splicing the URI into an expression string, which would break on a
# value containing a double quote.
df_esco.loc[df_esco["conceptUri"] == df_talent_clef.iloc[0, 1]]

Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code
2884,Occupation,http://data.europa.eu/esco/occupation/f2cc5978...,110,air force officer,royal airforce officer\nflight lieutenant\nfli...,,released,2024-01-18T12:19:47.056Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air force officers specialise in flying or gro...,110.1


In [22]:
# Display the full, untruncated `id` value (column 1) of the second TalentCLEF row.
df_talent_clef.iloc[1,1]

'http://data.europa.eu/esco/occupation/f2cc5978-e45c-4f28-b859-7f89221b0505'

In [23]:
# Look up the ESCO occupation whose conceptUri equals the `id` (column 1) of the second
# TalentCLEF row. A direct comparison selects the same rows as the former f-string
# query without splicing the URI into an expression string, which would break on a
# value containing a double quote.
df_esco.loc[df_esco["conceptUri"] == df_talent_clef.iloc[1, 1]]

Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code
2884,Occupation,http://data.europa.eu/esco/occupation/f2cc5978...,110,air force officer,royal airforce officer\nflight lieutenant\nfli...,,released,2024-01-18T12:19:47.056Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air force officers specialise in flying or gro...,110.1


In [24]:
# Preview the first rows of the TalentCLEF frame again.
# NOTE(review): identical to an earlier preview cell — consider removing one of them.
df_talent_clef.head()

Unnamed: 0,family_id,id,jobtitle_1,job_title_2
0,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,air commodore,flight lieutenant
1,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,command and control officer,flight officer
2,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,air commodore,command and control officer
3,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,pilot officer,squadron leader
4,http://data.europa.eu/esco/isco/C0110,http://data.europa.eu/esco/occupation/f2cc5978...,royal airforce officer,command and control officer
