In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# -------------------------------------------------------------------
# Preprocess
# -------------------------------------------------------------------
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[@#]\S+", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(rf"[{string.punctuation}]", " ", text)
    text = text.lower()
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)

df = pd.read_csv('/Users/alexmiliandieguez/code/TechCareer/data/model1_cv_role/3.processed/v1_english/cv_labeled_final.csv')
df = df[['cv_text', 'role_label_final']]
df.rename(columns={'cv_text': 'cv', 'role_label_final': 'role'}, inplace=True)

df['cv_clean'] = df['cv'].apply(clean_text)

# -------------------------------------------------------------------
# Train/Test split
# -------------------------------------------------------------------
X = df['cv_clean'].values
y = df['role'].values

X_train_texts, X_test_texts, y_train_labels, y_test_labels = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------------------------------------------
# TF-IDF sparse (sense .toarray())
# -------------------------------------------------------------------
vectorizer = TfidfVectorizer(
    max_features=10000,   # optimitzat
    sublinear_tf=True,
    stop_words=None,      # ja fem stopwords al preprocess
    max_df=0.9,
    min_df=3
)

X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

# -------------------------------------------------------------------
# Labels
# -------------------------------------------------------------------
le = LabelEncoder()
y_train = le.fit_transform(y_train_labels)
y_test = le.transform(y_test_labels)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
num_classes = y_train.shape[1]

# -------------------------------------------------------------------
# Model MLP optimitzat
# -------------------------------------------------------------------
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(num_classes, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0007),
    metrics=['accuracy']
)

model.summary()

# -------------------------------------------------------------------
# Early stopping basat en val_accuracy
# -------------------------------------------------------------------
es = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# -------------------------------------------------------------------
# Training
# -------------------------------------------------------------------
history = model.fit(
    X_train.toarray(), y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test.toarray(), y_test),
    callbacks=[es],
    verbose=2
)

# -------------------------------------------------------------------
# Resultats
# -------------------------------------------------------------------
loss, acc = model.evaluate(X_test.toarray(), y_test, verbose=0)
print(f"Test accuracy: {acc:.4f}")


2025-12-04 19:03:06.431716: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexmiliandieguez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
207/207 - 6s - 29ms/step - accuracy: 0.6032 - loss: 1.2987 - val_accuracy: 0.8058 - val_loss: 0.5862
Epoch 2/20
207/207 - 4s - 21ms/step - accuracy: 0.8315 - loss: 0.4946 - val_accuracy: 0.8409 - val_loss: 0.4453
Epoch 3/20
207/207 - 4s - 21ms/step - accuracy: 0.9017 - loss: 0.3050 - val_accuracy: 0.8621 - val_loss: 0.4025
Epoch 4/20
207/207 - 5s - 22ms/step - accuracy: 0.9499 - loss: 0.1789 - val_accuracy: 0.8675 - val_loss: 0.3859
Epoch 5/20
207/207 - 4s - 21ms/step - accuracy: 0.9737 - loss: 0.1038 - val_accuracy: 0.8693 - val_loss: 0.3957
Epoch 6/20
207/207 - 5s - 22ms/step - accuracy: 0.9896 - loss: 0.0583 - val_accuracy: 0.8730 - val_loss: 0.4214
Epoch 7/20
207/207 - 5s - 22ms/step - accuracy: 0.9927 - loss: 0.0380 - val_accuracy: 0.8693 - val_loss: 0.4413
Epoch 8/20
207/207 - 5s - 23ms/step - accuracy: 0.9956 - loss: 0.0256 - val_accuracy: 0.8693 - val_loss: 0.4591
Epoch 9/20
207/207 - 5s - 22ms/step - accuracy: 0.9971 - loss: 0.0180 - val_accuracy: 0.8681 - val_loss: