In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib


In [3]:
# Read the prepared dataset from disk.
df = pd.read_csv("dataset/stackoverflow_prepared.csv")

# "Employed" is the prediction target; every remaining column is a feature.
y = df["Employed"]
X = df.drop(columns=["Employed"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so nothing from the test set is used
# during fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline classifier: logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Score the baseline on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# One metric per output line, four decimal places each.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


Accuracy: 0.9018
Precision: 0.9093
Recall: 0.9084
F1 Score: 0.9089


In [4]:
# Second candidate: a single decision tree trained on the same scaled features.
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree is repeatable between runs.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_dt = dt_model.predict(X_test_scaled)

# Score the tree on the held-out test split with the same four metrics.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

# One metric per output line, four decimal places each.
print(
    f"Decision Tree - Accuracy: {accuracy_dt:.4f}",
    f"Decision Tree - Precision: {precision_dt:.4f}",
    f"Decision Tree - Recall: {recall_dt:.4f}",
    f"Decision Tree - F1 Score: {f1_dt:.4f}",
    sep="\n",
)

Decision Tree - Accuracy: 0.8613
Decision Tree - Precision: 0.8675
Decision Tree - Recall: 0.8766
Decision Tree - F1 Score: 0.8720


In [5]:
# Third candidate: a random forest trained on the same scaled features.
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so the fitted forest is repeatable between runs.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

# Score the forest on the held-out test split with the same four metrics.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# One metric per output line, four decimal places each.
print(
    f"Random Forest - Accuracy: {accuracy_rf:.4f}",
    f"Random Forest - Precision: {precision_rf:.4f}",
    f"Random Forest - Recall: {recall_rf:.4f}",
    f"Random Forest - F1 Score: {f1_rf:.4f}",
    sep="\n",
)


Random Forest - Accuracy: 0.8960
Random Forest - Precision: 0.9058
Random Forest - Recall: 0.9009
Random Forest - F1 Score: 0.9033


In [6]:
# Build, train and evaluate a small feed-forward neural network with TensorFlow/Keras.
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow random number generators so the
# network's results can be reproduced on a fresh kernel, in line with the
# fixed random_state used for the scikit-learn models.
# NOTE: this also reseeds NumPy's global RNG for any later cell that uses it.
tf.keras.utils.set_random_seed(42)

nn_model = tf.keras.Sequential([
    # `tf.keras.Input(shape=...)` replaces `InputLayer(input_shape=...)`;
    # the `input_shape` argument is deprecated in Keras 3.
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    # A single sigmoid unit outputs one probability per row for the binary target.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training rows are held out for per-epoch validation.
# verbose=2 logs one plain line per epoch instead of an animated progress bar,
# which otherwise leaves raw ANSI escape sequences in the saved notebook output.
nn_model.fit(
    X_train_scaled,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=2,
)

# evaluate the NN model on the held-out test split
loss, accuracy_nn = nn_model.evaluate(X_test_scaled, y_test, verbose=0)

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_pred_nn = (nn_model.predict(X_test_scaled, verbose=0) > 0.5).astype("int32")
precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_nn = f1_score(y_test, y_pred_nn)
print(f"Neural Network - Accuracy: {accuracy_nn:.4f}")
print(f"Neural Network - Precision: {precision_nn:.4f}")
print(f"Neural Network - Recall: {recall_nn:.4f}")
print(f"Neural Network - F1 Score: {f1_nn:.4f}")

  if not hasattr(np, "object"):


Epoch 1/20
[1m1459/1459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8929 - loss: 0.2454 - val_accuracy: 0.8983 - val_loss: 0.2250
Epoch 2/20
[1m1459/1459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9018 - loss: 0.2203 - val_accuracy: 0.8985 - val_loss: 0.2216
Epoch 3/20
[1m1459/1459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9032 - loss: 0.2164 - val_accuracy: 0.8990 - val_loss: 0.2201
Epoch 4/20
[1m1459/1459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9059 - loss: 0.2131 - val_accuracy: 0.9028 - val_loss: 0.2180
Epoch 5/20
[1m1459/1459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9055 - loss: 0.2108 - val_accuracy: 0.9015 - val_loss: 0.2183
Epoch 6/20
[1m1459/1459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9059 - loss: 0.2095 - val_accuracy: 0.9003 - val_loss: 0.2178
Epoch 7/20
[1m1

In [12]:
import os

# The neural network is the model selected for export.
best_model = nn_model

# Make sure the output directory exists; an already-present directory is fine.
os.makedirs("app", exist_ok=True)

# Write the Keras model, then the fitted scaler, into the output directory.
# joblib.dump stays as the cell's final expression so its return value
# (the list of written file names) is what the cell displays.
best_model.save("app/best_model_nn.keras")
joblib.dump(scaler, "app/scaler.joblib")


['app/scaler.joblib']