In [6]:
import pandas as pd
import re
from pathlib import Path

In [7]:
path = Path("../1.raw/cv_labeled_final.csv")
df = pd.read_csv(path)

In [8]:
print(df.shape)
df.head(2)

(8264, 6)


Unnamed: 0,cv_id,cv_text,role_label_final,role_raw,source_dataset,resume_len
0,1,Python Developer Python Developer Python Devel...,python_developer,Python_Developer,dataset1_avishek,3467
1,2,R&D Engineer R&D Engineer R&D Engineer - Nokia...,python_developer,Python_Developer,dataset1_avishek,2812


In [9]:
def detect_seniority_from_text(text: str):
    """
    Intenta detectar Junior / Mid / Senior a partir de palabras clave
    en el propio CV (cabecera, título del puesto, etc.).
    Devuelve: "Junior", "Mid", "Senior" o None.
    """
    if not isinstance(text, str):
        return None
    
    t = text.lower()
    
    # Palabras que apuntan muy fuerte a Junior
    junior_keywords = [
        "junior ", " jr ", " jr.", " entry level", 
        "graduate ", " intern ", " internship "
    ]
    
    # Palabras que apuntan a Senior
    senior_keywords = [
        "senior ", " sr ", " sr.", 
        " lead ", " principal ", " staff ", " architect "
    ]
    
    # Palabras que apuntan explícitamente a Mid
    mid_keywords = [
        "mid level", "mid-level", "intermediate "
    ]
    
    # Primero miramos si parece Junior
    for kw in junior_keywords:
        if kw in t:
            return "Junior"
    
    # Luego Senior
    for kw in senior_keywords:
        if kw in t:
            return "Senior"
    
    # Luego Mid
    for kw in mid_keywords:
        if kw in t:
            return "Mid"
    
    return None

In [10]:
# Patrón tipo "3 years of experience", "5+ years of experience", etc.
years_pattern = re.compile(
    r"(\d+)\+?\s+(?:years|year|yrs)\s+of\s+experience",
    re.IGNORECASE
)

def extract_years_of_experience(text: str):
    """
    Devuelve el número de años de experiencia detectado en el texto,
    o None si no encuentra nada.
    """
    if not isinstance(text, str):
        return None
    
    m = years_pattern.search(text)
    if m:
        try:
            return float(m.group(1))
        except ValueError:
            return None
    return None


def map_years_to_seniority(years):
    """
    Mapea años de experiencia a Junior / Mid / Senior.
    Regla básica:
      0–2 años   -> Junior
      2–5 años   -> Mid
      >5 años    -> Senior
    """
    import math
    
    if years is None:
        return None
    try:
        y = float(years)
    except Exception:
        return None
    
    if math.isnan(y):
        return None
    
    if y < 2:
        return "Junior"
    elif y < 5:
        return "Mid"
    else:
        return "Senior"


In [11]:
# 3. Aplicar reglas al dataset

# Aseguramos que la columna se llama cv_text (en tu CSV es así)
print("Columnas disponibles:", df.columns)

df['seniority_text'] = df['cv_text'].apply(detect_seniority_from_text)
df['years_from_text'] = df['cv_text'].apply(extract_years_of_experience)
df['seniority_years'] = df['years_from_text'].apply(map_years_to_seniority)

df[['cv_id', 'role_label_final', 'seniority_text', 'years_from_text', 'seniority_years']].head(10)


Columnas disponibles: Index(['cv_id', 'cv_text', 'role_label_final', 'role_raw', 'source_dataset',
       'resume_len'],
      dtype='object')


Unnamed: 0,cv_id,role_label_final,seniority_text,years_from_text,seniority_years
0,1,python_developer,,,
1,2,python_developer,,3.0,Mid
2,3,python_developer,Senior,,
3,4,python_developer,Senior,7.0,Senior
4,5,python_developer,Senior,7.0,Senior
5,6,python_developer,Senior,6.0,Senior
6,7,python_developer,Junior,7.0,Senior
7,8,python_developer,,,
8,9,python_developer,Senior,4.0,Mid
9,10,python_developer,,5.0,Senior


In [12]:
def combine_seniority(row):
    """
    Combinamos las dos fuentes:
      - Si hay seniority detectada por palabras clave, usamos esa.
      - Si no, usamos la derivada de años de experiencia.
      - Si no hay nada, devolvemos None.
    """
    if row['seniority_text'] is not None:
        return row['seniority_text']
    return row['seniority_years']

df['seniority_weak'] = df.apply(combine_seniority, axis=1)

print("Distribución de etiquetas débiles (seniority_weak):")
print(df['seniority_weak'].value_counts(dropna=False))


Distribución de etiquetas débiles (seniority_weak):
seniority_weak
Senior    4107
None      2250
Junior    1677
Mid        230
Name: count, dtype: int64


In [14]:
def combine_seniority(row):
    text_label = row['seniority_text']
    years_label = row['seniority_years']
    
    # Normalizar Nones/NaN para trabajar cómodo
    if pd.isna(text_label):
        text_label = None
    if pd.isna(years_label):
        years_label = None

    # Caso 1: no tenemos nada
    if text_label is None and years_label is None:
        return None
    
    # Caso 2: solo texto
    if text_label is not None and years_label is None:
        return text_label
    
    # Caso 3: solo años
    if text_label is None and years_label is not None:
        return years_label
    
    # Caso 4: tenemos las dos cosas
    if text_label == years_label:
        return text_label  # da igual cuál devolvamos
        
    # Conflicto: priorizamos lo que dicen los años
    return years_label


In [15]:
df['seniority_weak'] = df.apply(combine_seniority, axis=1)
print(df['seniority_weak'].value_counts(dropna=False))


seniority_weak
Senior    4265
None      2250
Junior    1366
Mid        383
Name: count, dtype: int64


In [None]:
# 3.processed
output_path = Path("../3.processed/cv_with_seniority_weak.csv")

df.to_csv(output_path, index=False)
print("Guardado:", output_path)

Guardado: ..\3.processed\cv_with_seniority_weak.csv
