In [51]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


In [52]:
# Processed-data folder and the CSV that carries the weak seniority labels
DATA_DIR = Path("../../data/model2_seniority/3.processed")
CSV_PATH = DATA_DIR.joinpath("cv_with_seniority_weak.csv")

# Load the full table; later cells filter it down to the rows usable for training
df = pd.read_csv(CSV_PATH)

In [53]:
# Quick sanity check of the loaded frame: (rows, columns) followed by the first rows
print(df.shape)
df.head()


(8264, 10)


Unnamed: 0,cv_id,cv_text,role_label_final,role_raw,source_dataset,resume_len,seniority_text,years_from_text,seniority_years,seniority_weak
0,1,Python Developer Python Developer Python Devel...,python_developer,Python_Developer,dataset1_avishek,3467,,,,
1,2,R&D Engineer R&D Engineer R&D Engineer - Nokia...,python_developer,Python_Developer,dataset1_avishek,2812,,3.0,Mid,Mid
2,3,Sr. Full Stack Developer Sr. Full Stack Develo...,python_developer,Python_Developer,dataset1_avishek,16606,Senior,6.0,Senior,Senior
3,4,Sr. Full Stack Python Developer Sr. Full Stack...,python_developer,Python_Developer,dataset1_avishek,9253,Senior,7.0,Senior,Senior
4,5,Sr. Python Developer Sr. Python Developer Sr. ...,python_developer,Python_Developer,dataset1_avishek,15659,Senior,7.0,Senior,Senior


In [54]:
# Valid target labels
VALID_LABELS = ["Junior", "Mid", "Senior"]

# Make sure the text column is a plain string with no NaN
df["cv_text"] = df["cv_text"].fillna("").astype(str)

# Keep only rows that have a valid weak seniority label...
mask_labels = df["seniority_weak"].isin(VALID_LABELS)
# ...and some actual text: the fillna("") above turns a missing CV into an empty
# string, which would otherwise enter training as a labelled example with no features.
mask_text = df["cv_text"].str.strip().ne("")
df_train_seniority = df[mask_labels & mask_text].copy()

# Drop duplicates by cv_id (safety net)
if "cv_id" in df_train_seniority.columns:
    df_train_seniority = df_train_seniority.drop_duplicates(subset="cv_id")

print("Tamaño df_train_seniority:", df_train_seniority.shape)
df_train_seniority[["cv_id", "seniority_weak"]].head()


Tamaño df_train_seniority: (5253, 10)


Unnamed: 0,cv_id,seniority_weak
1,2,Mid
2,3,Senior
3,4,Senior
4,5,Senior
5,6,Senior


In [55]:
# Absolute and relative frequency of each weak seniority label in the filtered frame
seniority_col = df_train_seniority["seniority_weak"]
class_counts = seniority_col.value_counts()
class_ratio = seniority_col.value_counts(normalize=True)

# sep="\n" puts each header and its Series on separate lines
print("Recuento por clase:", class_counts, sep="\n")
print("\nProporción por clase:", class_ratio, sep="\n")


Recuento por clase:
seniority_weak
Senior    4594
Mid        405
Junior     254
Name: count, dtype: int64

Proporción por clase:
seniority_weak
Senior    0.874548
Mid       0.077099
Junior    0.048353
Name: proportion, dtype: float64


In [56]:
# Target: weak seniority label; feature: raw CV text (both as arrays)
y = df_train_seniority["seniority_weak"].values
X = df_train_seniority["cv_text"].values

In [57]:
# 80/20 hold-out split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Tamaño X_train:", len(X_train))
print("Tamaño X_test :", len(X_test))


Tamaño X_train: 4202
Tamaño X_test : 1051


In [58]:
# Compare class proportions in both splits to confirm the stratification held
for split_header, split_labels in (
    ("Train class distribution:", y_train),
    ("\nTest class distribution:", y_test),
):
    print(split_header)
    print(pd.Series(split_labels).value_counts(normalize=True))


Train class distribution:
Senior    0.874584
Mid       0.077106
Junior    0.048310
Name: proportion, dtype: float64

Test class distribution:
Senior    0.874405
Mid       0.077069
Junior    0.048525
Name: proportion, dtype: float64


In [59]:
from sklearn.utils import resample

# Put the training split back into a DataFrame so it is easier to slice by class
train_df = pd.DataFrame({"cv_text": X_train, "seniority_weak": y_train})

# One sub-frame per seniority class
df_junior = train_df.loc[train_df["seniority_weak"].eq("Junior")]
df_mid = train_df.loc[train_df["seniority_weak"].eq("Mid")]
df_senior = train_df.loc[train_df["seniority_weak"].eq("Senior")]

# Class sizes before any resampling (Junior, Mid, Senior)
print(len(df_junior), len(df_mid), len(df_senior))


203 324 3675


In [60]:
# Goal: every class ends up with as many examples as Senior
n_target = len(df_senior)


def _upsample_to_target(class_df):
    """Draw `n_target` rows from `class_df` with replacement, using a fixed seed."""
    return resample(class_df, replace=True, n_samples=n_target, random_state=42)


df_junior_up = _upsample_to_target(df_junior)
df_mid_up = _upsample_to_target(df_mid)

# Stack the three classes, shuffle the rows, and rebuild a clean 0..n-1 index
train_df_balanced = (
    pd.concat([df_senior, df_mid_up, df_junior_up], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(train_df_balanced["seniority_weak"].value_counts())


seniority_weak
Mid       3675
Senior    3675
Junior    3675
Name: count, dtype: int64


In [61]:
# Arrays fed to the model: labels and texts of the class-balanced training set
y_train_bal = train_df_balanced["seniority_weak"].values
X_train_bal = train_df_balanced["cv_text"].values


### Model

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Text classifier: TF-IDF features (unigrams + bigrams) feeding a logistic regression.
text_clf_bal = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                lowercase=True,
                stop_words="english",
                ngram_range=(1, 2),
                min_df=5,    # drop terms that appear in fewer than 5 documents
                max_df=0.8,  # drop terms that appear in more than 80% of documents
            ),
        ),
        (
            "clf",
            # `multi_class` is deliberately not passed: "auto" was only restating the
            # default, and the parameter is deprecated since scikit-learn 1.5 (it emits
            # a FutureWarning and is scheduled for removal). The fitted model is the same.
            LogisticRegression(
                max_iter=1000,
                n_jobs=-1,
            ),
        ),
    ]
)




In [63]:
%%time
# Fit the whole pipeline (TF-IDF vocabulary + classifier) on the class-balanced
# training arrays; the %%time cell magic has to stay on the first line of the cell.
text_clf_bal.fit(X_train_bal, y_train_bal)

CPU times: total: 16.6 s
Wall time: 47.1 s


In [64]:
# Score the model on the held-out test split (only the training split was resampled)
y_pred_test_bal = text_clf_bal.predict(X_test)

print("=== Classification report (TEST) - train balanceado ===")
print(classification_report(y_test, y_pred_test_bal, digits=3))

# Fixed label order so the rows/columns of the confusion matrix are predictable
labels = ["Junior", "Mid", "Senior"]
cm_bal = confusion_matrix(y_test, y_pred_test_bal, labels=labels)

# Rows are the true class, columns the predicted class
cm_bal_df = pd.DataFrame(
    cm_bal,
    index=[f"true_{name}" for name in labels],
    columns=[f"pred_{name}" for name in labels],
)

print("=== Matriz de confusión (TEST) - balanceado ===")
print(cm_bal_df)

# Divide each row by its total so the diagonal reads as per-class recall
row_totals = cm_bal_df.sum(axis=1)
cm_bal_norm = cm_bal_df.div(row_totals, axis=0)
print("\n=== Matriz de confusión normalizada (TEST) - balanceado ===")
print(cm_bal_norm.round(3))


=== Classification report (TEST) - train balanceado ===
              precision    recall  f1-score   support

      Junior      0.600     0.706     0.649        51
         Mid      0.362     0.309     0.333        81
      Senior      0.930     0.933     0.931       919

    accuracy                          0.873      1051
   macro avg      0.631     0.649     0.638      1051
weighted avg      0.870     0.873     0.871      1051

=== Matriz de confusión (TEST) - balanceado ===
             pred_Junior  pred_Mid  pred_Senior
true_Junior           36         3           12
true_Mid               3        25           53
true_Senior           21        41          857

=== Matriz de confusión normalizada (TEST) - balanceado ===
             pred_Junior  pred_Mid  pred_Senior
true_Junior        0.706     0.059        0.235
true_Mid           0.037     0.309        0.654
true_Senior        0.023     0.045        0.933


### Guardar modelo

In [73]:
from pathlib import Path
import joblib

# Destination of the serialized pipeline (vectorizer and classifier in one file)
MODEL_DIR = Path("../../models/seniority")
MODEL_PATH = MODEL_DIR / "seniority_from_cv_balanced_v1.pkl"

# Create the target folder (and any missing parents) before writing
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(text_clf_bal, MODEL_PATH)
print(f"Modelo guardado en: {MODEL_PATH}")


Modelo guardado en: ..\..\models\seniority\seniority_from_cv_balanced_v1.pkl
