In [1]:
import pandas as pd
import re
from pathlib import Path

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
path = Path("../1.raw/cv_labeled_final.csv")
df = pd.read_csv(path)

In [None]:
print(df.shape)
df.head(2)

(8264, 6)


Unnamed: 0,cv_id,cv_text,role_label_final,role_raw,source_dataset,resume_len
0,1,Python Developer Python Developer Python Devel...,python_developer,Python_Developer,dataset1_avishek,3467
1,2,R&D Engineer R&D Engineer R&D Engineer - Nokia...,python_developer,Python_Developer,dataset1_avishek,2812


In [None]:
# ============================================
# 1. Detección de seniority por TEXTO
#    (Junior / Mid / Senior en la cabecera)
# ============================================

def detect_seniority_from_text(text: str):
    """
    Intenta detectar Junior / Mid / Senior a partir de palabras clave
    en la CABECERA del CV (primeros ~600 caracteres).
    Devuelve: "Junior", "Mid", "Senior" o None.
    """
    if not isinstance(text, str):
        return None
    
    # Nos centramos en la cabecera del CV
    t = text.lower()[:600]
    
    # Palabras que apuntan fuerte a Junior
    junior_keywords = [
        "junior ", " jr ", " jr.", " entry level",
        "graduate ", " intern ", " internship "
    ]
    
    # Palabras que apuntan a Senior
    senior_keywords = [
        "senior ", " sr ", " sr.",
        " lead ", " principal ", " staff ", " architect "
    ]
    
    # Palabras que apuntan explícitamente a Mid
    mid_keywords = [
        "mid level", "mid-level", "intermediate "
    ]
    
    # Primero miramos si parece Junior
    for kw in junior_keywords:
        if kw in t:
            return "Junior"
    
    # Luego Senior
    for kw in senior_keywords:
        if kw in t:
            return "Senior"
    
    # Luego Mid
    for kw in mid_keywords:
        if kw in t:
            return "Mid"
    
    return None

# Aplicar al dataset
df['seniority_text'] = df['cv_text'].apply(detect_seniority_from_text)
df[['cv_id', 'role_label_final', 'seniority_text']].head(5)


Unnamed: 0,cv_id,role_label_final,seniority_text
0,1,python_developer,
1,2,python_developer,
2,3,python_developer,Senior
3,4,python_developer,Senior
4,5,python_developer,Senior


In [None]:
# ============================================
# 2. Detección de años de experiencia en texto
#    (X years of experience, X years' experience, X years)
# ============================================

# Patrón estricto: "3 years of experience", "5+ years of experience"
pattern_strict = re.compile(
    r"(\d+)\+?\s*(?:years|year|yrs)\s+of\s+experience",
    re.IGNORECASE
)

# Patrón: "8+ years' experience", "10 years experience"
pattern_apostrophe = re.compile(
    r"(\d+)\+?\s*(?:years|year|yrs)'?\s+experience",
    re.IGNORECASE
)

# Patrón laxo: "5 years", "7 yrs" (sin 'experience')
pattern_loose = re.compile(
    r"(\d+)\+?\s*(?:years|year|yrs)",
    re.IGNORECASE
)

def extract_years_of_experience(text: str):
    """
    Intenta extraer años de experiencia del CV.
    Prioridad:
      1) "X years of experience"
      2) "X years' experience"
      3) cualquier "X years" si X >= 5 (para evitar juniors falsos)
    Devuelve float o None.
    """
    if not isinstance(text, str):
        return None
    
    # 1) Patrón estricto
    m = pattern_strict.search(text)
    if m:
        try:
            return float(m.group(1))
        except ValueError:
            return None
    
    # 2) Patrón con apóstrofe / sin "of"
    m = pattern_apostrophe.search(text)
    if m:
        try:
            return float(m.group(1))
        except ValueError:
            return None
    
    # 3) Patrón laxo: "X years" a secas (solo si parece perfil no-junior)
    m = pattern_loose.search(text)
    if m:
        try:
            years = float(m.group(1))
        except ValueError:
            years = None
        
        # Solo nos fiamos si parece claramente perfil rodado
        if years is not None and years >= 5:
            return years
    
    return None

# Aplicar al dataset
df['years_from_text'] = df['cv_text'].apply(extract_years_of_experience)
df[['cv_id', 'role_label_final', 'years_from_text']].head(10)


Unnamed: 0,cv_id,role_label_final,years_from_text
0,1,python_developer,
1,2,python_developer,3.0
2,3,python_developer,6.0
3,4,python_developer,7.0
4,5,python_developer,7.0
5,6,python_developer,6.0
6,7,python_developer,7.0
7,8,python_developer,
8,9,python_developer,4.0
9,10,python_developer,5.0


In [None]:
# ============================================
# 3. Mapear años -> Junior / Mid / Senior
# ============================================

def map_years_to_seniority(years):
    """
    Regla básica:
      0–2 años   -> Junior
      2–5 años   -> Mid
      >5 años    -> Senior
    """
    if years is None or pd.isna(years):
        return None
    
    y = float(years)
    
    if y < 2:
        return "Junior"
    elif y < 5:
        return "Mid"
    else:
        return "Senior"

df['seniority_years'] = df['years_from_text'].apply(map_years_to_seniority)

df[['cv_id', 'role_label_final', 'years_from_text', 'seniority_years']].head(10)


Unnamed: 0,cv_id,role_label_final,years_from_text,seniority_years
0,1,python_developer,,
1,2,python_developer,3.0,Mid
2,3,python_developer,6.0,Senior
3,4,python_developer,7.0,Senior
4,5,python_developer,7.0,Senior
5,6,python_developer,6.0,Senior
6,7,python_developer,7.0,Senior
7,8,python_developer,,
8,9,python_developer,4.0,Mid
9,10,python_developer,5.0,Senior


In [None]:
# ============================================
# 4. Combinar ambas fuentes en seniority_weak
# ============================================

def combine_seniority(row):
    """
    Combinamos las dos fuentes:
      - Si solo hay texto -> usamos seniority_text
      - Si solo hay años  -> usamos seniority_years
      - Si hay ambas y coinciden -> usamos cualquiera
      - Si NO coinciden -> priorizamos años (menos "marketing")
    """
    text_label = row['seniority_text']
    years_label = row['seniority_years']
    
    # Normalizar NaN a None
    if pd.isna(text_label):
        text_label = None
    if pd.isna(years_label):
        years_label = None

    # Caso 1: no tenemos nada
    if text_label is None and years_label is None:
        return None
    
    # Caso 2: solo texto
    if text_label is not None and years_label is None:
        return text_label
    
    # Caso 3: solo años
    if text_label is None and years_label is not None:
        return years_label
    
    # Caso 4: tenemos las dos cosas
    if text_label == years_label:
        return text_label  # da igual cuál devolvamos

    # Conflicto -> priorizamos lo que dicen los años
    return years_label

df['seniority_weak'] = df.apply(combine_seniority, axis=1)

print("Distribución de etiquetas débiles (seniority_weak):")
print(df['seniority_weak'].value_counts(dropna=False))


Distribución de etiquetas débiles (seniority_weak):
seniority_weak
Senior    4594
None      3011
Mid        405
Junior     254
Name: count, dtype: int64


In [None]:
# ============================================
# 5. Inspección rápida (opcional, pero recomendable)
#    Para que tú misma veas que tiene sentido
# ============================================

df_weak = df[~df['seniority_weak'].isna()].copy()

print("Distribución SOLO casos con etiqueta:")
print(df_weak['seniority_weak'].value_counts())

# 1 ejemplo de cada
print("\n===== EJEMPLO JUNIOR =====")
print(
    df_weak[df_weak['seniority_weak'] == "Junior"][
        ['cv_id', 'role_label_final', 'cv_text']
    ].head(1)
)

print("\n===== EJEMPLO MID =====")
print(
    df_weak[df_weak['seniority_weak'] == "Mid"][
        ['cv_id', 'role_label_final', 'cv_text']
    ].head(1)
)

print("\n===== EJEMPLO SENIOR =====")
print(
    df_weak[df_weak['seniority_weak'] == "Senior"][
        ['cv_id', 'role_label_final', 'cv_text']
    ].head(1)
)


Distribución SOLO casos con etiqueta:
seniority_weak
Senior    4594
Mid        405
Junior     254
Name: count, dtype: int64

===== EJEMPLO JUNIOR =====
    cv_id  role_label_final                                            cv_text
74     75  python_developer  Technology Summer Analyst Technology Summer An...

===== EJEMPLO MID =====
   cv_id  role_label_final                                            cv_text
1      2  python_developer  R&D Engineer R&D Engineer R&D Engineer - Nokia...

===== EJEMPLO SENIOR =====
   cv_id  role_label_final                                            cv_text
2      3  python_developer  Sr. Full Stack Developer Sr. Full Stack Develo...


In [None]:
# ============================================
# 6. Guardar dataset enriquecido en 3.processed
# ============================================

output_path = Path("../3.processed/cv_with_seniority_weak.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print("Guardado:", output_path)


Guardado: ..\3.processed\cv_with_seniority_weak.csv
