In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam


2025-12-04 18:51:16.269266: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexmiliandieguez/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def clean_text(text):
    # Elimina URLs
    text = re.sub(r"http\S+", " ", text)
    # Elimina mencions i hashtags
    text = re.sub(r"[@#]\S+", " ", text)
    # Elimina accents i caràcters no ASCII
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # Elimina puntuació
    text = re.sub(rf"[{string.punctuation}]", " ", text)
    # Lowercase
    text = text.lower()
    # Remove stopwords
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)

In [5]:
df = pd.read_csv('/Users/alexmiliandieguez/code/TechCareer/data/model1_cv_role/3.processed/v1_english/cv_labeled_final.csv')

df = df.loc[:, ['cv_text', 'role_label_final']]

df.rename(columns={'cv_text': 'cv', 'role_label_final': 'role'}, inplace=True)
df.head()


Unnamed: 0,cv,role
0,Python Developer Python Developer Python Devel...,python_developer
1,R&D Engineer R&D Engineer R&D Engineer - Nokia...,python_developer
2,Sr. Full Stack Developer Sr. Full Stack Develo...,python_developer
3,Sr. Full Stack Python Developer Sr. Full Stack...,python_developer
4,Sr. Python Developer Sr. Python Developer Sr. ...,python_developer


In [6]:
df['cv_clean'] = df['cv'].apply(clean_text)

In [7]:
X_texts = df['cv_clean'].values
y_labels = df['role'].values

X_train_texts, X_test_texts, y_train_labels, y_test_labels = train_test_split(
    X_texts, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)


In [8]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_df=0.9,
    min_df=3
)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

In [9]:
le = LabelEncoder()
y_train = le.fit_transform(y_train_labels)
y_test = le.transform(y_test_labels)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
num_classes = y_train.shape[1]

In [10]:
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.4))         # Regularització forta

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(num_classes, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0007),
    metrics=['accuracy']
)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
es = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)


In [12]:
history = model.fit(
    X_train.toarray(), y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test.toarray(), y_test),
    callbacks=[es],
    verbose=2
)


Epoch 1/20
207/207 - 31s - 149ms/step - accuracy: 0.6374 - loss: 1.0965 - val_accuracy: 0.8221 - val_loss: 0.4843
Epoch 2/20
207/207 - 28s - 137ms/step - accuracy: 0.8861 - loss: 0.3235 - val_accuracy: 0.8500 - val_loss: 0.4414
Epoch 3/20
207/207 - 27s - 130ms/step - accuracy: 0.9669 - loss: 0.1155 - val_accuracy: 0.8554 - val_loss: 0.4563
Epoch 4/20
207/207 - 27s - 132ms/step - accuracy: 0.9933 - loss: 0.0288 - val_accuracy: 0.8657 - val_loss: 0.5085
Epoch 5/20
207/207 - 28s - 136ms/step - accuracy: 0.9974 - loss: 0.0132 - val_accuracy: 0.8536 - val_loss: 0.6459


In [13]:
loss, acc = model.evaluate(X_test.toarray(), y_test, verbose=0)
print(f"Test accuracy: {acc:.4f}")

Test accuracy: 0.8500
