In [33]:
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

In [34]:
# Path templates for the saved .npy arrays. Each contains a {split} placeholder
# that later cells fill with "train", "val" or "test" via str.format.
ground_truth_file = "embeddings/{split}_age.npy"
encoder_embedding_file = "embeddings/{split}_encoder_embeddings.npy"
projected_embedding_file = "embeddings/{split}_project_embeddings.npy"

In [35]:
# Display names for the seven age classes; passed as `target_names` to
# classification_report in the evaluation cells.
# NOTE(review): presumably listed in the same order as the sorted label values
# stored in the *_age.npy files - confirm against the code that wrote them.
age_brackets = [
    "10-19",
    "20-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "more than 70",
]

# Encoder experiment

In [36]:
# Training split for the encoder experiment: features come from the encoder
# embedding file, targets from the ground-truth (age) file.
enc_train_path = encoder_embedding_file.format(split="train")
age_train_path = ground_truth_file.format(split="train")
X_train = np.load(enc_train_path)
y_train = np.load(age_train_path)

In [37]:
# Validation split for the encoder experiment: features come from the encoder
# embedding file, targets from the ground-truth (age) file.
enc_val_path = encoder_embedding_file.format(split="val")
age_val_path = ground_truth_file.format(split="val")
X_val = np.load(enc_val_path)
y_val = np.load(age_val_path)

In [38]:
# Test split for the encoder experiment: features come from the encoder
# embedding file, targets from the ground-truth (age) file.
enc_test_path = encoder_embedding_file.format(split="test")
age_test_path = ground_truth_file.format(split="test")
X_test = np.load(enc_test_path)
y_test = np.load(age_test_path)

In [39]:
# Standardisation for the encoder features. The scaler is fit on the training
# split only and then reused to transform the validation and test splits.
enc_scaler = preprocessing.StandardScaler()
enc_scaler.fit(X_train);

In [40]:
# Logistic-regression age classifier trained on the standardised encoder features.
lr_clf_enc_age = LogisticRegression(max_iter=400, random_state=42)
# X_scaled is read again by the training-report cell that follows.
X_scaled = enc_scaler.transform(X_train)
lr_clf_enc_age.fit(X_scaled, y_train)

In [41]:
# Training-set report for the encoder classifier, computed on the same scaled
# features it was fit on.
y_train_preds = lr_clf_enc_age.predict(X_scaled)
enc_train_report = classification_report(
    y_train, y_train_preds, target_names=age_brackets
)
print(enc_train_report)

              precision    recall  f1-score   support

       10-19       0.79      0.65      0.71      9103
       20-29       0.65      0.76      0.70     25598
       30-39       0.55      0.53      0.54     19250
       40-49       0.53      0.48      0.50     10744
       50-59       0.55      0.54      0.54      6228
       60-69       0.60      0.53      0.57      2779
more than 70       0.71      0.65      0.68       842

    accuracy                           0.62     74544
   macro avg       0.63      0.59      0.61     74544
weighted avg       0.62      0.62      0.61     74544



In [42]:
# Validation-set report for the encoder classifier. The features are transformed
# with the scaler that was fit on the training split.
X_val_scaled = enc_scaler.transform(X_val)
y_val_preds = lr_clf_enc_age.predict(X_val_scaled)
# Pin the label order to the classes the classifier was trained on. Without
# `labels`, the report infers classes from this split alone and raises when
# their count differs from len(age_brackets), e.g. if a sparse bracket is
# absent from the split. With it, each bracket name stays tied to its class.
print(
    classification_report(
        y_val,
        y_val_preds,
        labels=lr_clf_enc_age.classes_,
        target_names=age_brackets,
    )
)

              precision    recall  f1-score   support

       10-19       0.76      0.65      0.70       765
       20-29       0.66      0.73      0.69      2137
       30-39       0.51      0.50      0.50      1442
       40-49       0.48      0.45      0.47       853
       50-59       0.50      0.46      0.48       523
       60-69       0.48      0.50      0.49       204
more than 70       0.55      0.43      0.49        76

    accuracy                           0.59      6000
   macro avg       0.56      0.53      0.55      6000
weighted avg       0.59      0.59      0.59      6000



In [43]:
# Test-set report for the encoder classifier. The features are transformed with
# the scaler that was fit on the training split.
X_test_scaled = enc_scaler.transform(X_test)
y_test_preds = lr_clf_enc_age.predict(X_test_scaled)
# Pin the label order to the classes the classifier was trained on. Without
# `labels`, the report infers classes from this split alone and raises when
# their count differs from len(age_brackets), e.g. if a sparse bracket is
# absent from the split. With it, each bracket name stays tied to its class.
print(
    classification_report(
        y_test,
        y_test_preds,
        labels=lr_clf_enc_age.classes_,
        target_names=age_brackets,
    )
)

              precision    recall  f1-score   support

       10-19       0.75      0.61      0.67       416
       20-29       0.63      0.74      0.68      1163
       30-39       0.52      0.50      0.51       888
       40-49       0.49      0.43      0.46       500
       50-59       0.48      0.46      0.47       273
       60-69       0.50      0.47      0.48       117
more than 70       0.50      0.36      0.42        42

    accuracy                           0.58      3399
   macro avg       0.55      0.51      0.53      3399
weighted avg       0.58      0.58      0.57      3399



# Projection experiment

In [44]:
# Training split for the projection experiment. This rebinds X_train / y_train,
# which held the encoder-experiment arrays up to this point.
proj_train_path = projected_embedding_file.format(split="train")
age_train_path = ground_truth_file.format(split="train")
X_train = np.load(proj_train_path)
y_train = np.load(age_train_path)

In [45]:
# Validation split for the projection experiment. This rebinds X_val / y_val,
# which held the encoder-experiment arrays up to this point.
proj_val_path = projected_embedding_file.format(split="val")
age_val_path = ground_truth_file.format(split="val")
X_val = np.load(proj_val_path)
y_val = np.load(age_val_path)

In [46]:
# Test split for the projection experiment. This rebinds X_test / y_test,
# which held the encoder-experiment arrays up to this point.
proj_test_path = projected_embedding_file.format(split="test")
age_test_path = ground_truth_file.format(split="test")
X_test = np.load(proj_test_path)
y_test = np.load(age_test_path)

In [47]:
# Standardisation for the projected features. The scaler is fit on the training
# split only and then reused to transform the validation and test splits.
proj_scaler = preprocessing.StandardScaler()
proj_scaler.fit(X_train);

In [48]:
# Logistic-regression age classifier trained on the standardised projected features.
lr_clf_proj_age = LogisticRegression(max_iter=400, random_state=42)
# Rebinds X_scaled (previously the scaled encoder features); the training-report
# cell that follows reads it.
X_scaled = proj_scaler.transform(X_train)
lr_clf_proj_age.fit(X_scaled, y_train)

In [49]:
# Training-set report for the projection classifier, computed on the same
# scaled features it was fit on.
y_train_preds = lr_clf_proj_age.predict(X_scaled)
proj_train_report = classification_report(
    y_train, y_train_preds, target_names=age_brackets
)
print(proj_train_report)

              precision    recall  f1-score   support

       10-19       0.78      0.63      0.70      9103
       20-29       0.65      0.75      0.70     25598
       30-39       0.54      0.52      0.53     19250
       40-49       0.52      0.47      0.49     10744
       50-59       0.53      0.52      0.53      6228
       60-69       0.57      0.50      0.53      2779
more than 70       0.69      0.60      0.64       842

    accuracy                           0.61     74544
   macro avg       0.61      0.57      0.59     74544
weighted avg       0.61      0.61      0.60     74544



In [50]:
# Validation-set report for the projection classifier. The features are
# transformed with the scaler that was fit on the training split.
X_val_scaled = proj_scaler.transform(X_val)
y_val_preds = lr_clf_proj_age.predict(X_val_scaled)
# Pin the label order to the classes the classifier was trained on. Without
# `labels`, the report infers classes from this split alone and raises when
# their count differs from len(age_brackets), e.g. if a sparse bracket is
# absent from the split. With it, each bracket name stays tied to its class.
print(
    classification_report(
        y_val,
        y_val_preds,
        labels=lr_clf_proj_age.classes_,
        target_names=age_brackets,
    )
)

              precision    recall  f1-score   support

       10-19       0.78      0.64      0.70       765
       20-29       0.66      0.74      0.70      2137
       30-39       0.50      0.50      0.50      1442
       40-49       0.49      0.45      0.47       853
       50-59       0.51      0.46      0.49       523
       60-69       0.48      0.50      0.49       204
more than 70       0.55      0.42      0.48        76

    accuracy                           0.59      6000
   macro avg       0.57      0.53      0.55      6000
weighted avg       0.59      0.59      0.59      6000



In [51]:
# Test-set report for the projection classifier. The features are transformed
# with the scaler that was fit on the training split.
X_test_scaled = proj_scaler.transform(X_test)
y_test_preds = lr_clf_proj_age.predict(X_test_scaled)
# Pin the label order to the classes the classifier was trained on. Without
# `labels`, the report infers classes from this split alone and raises when
# their count differs from len(age_brackets), e.g. if a sparse bracket is
# absent from the split. With it, each bracket name stays tied to its class.
print(
    classification_report(
        y_test,
        y_test_preds,
        labels=lr_clf_proj_age.classes_,
        target_names=age_brackets,
    )
)

              precision    recall  f1-score   support

       10-19       0.75      0.62      0.68       416
       20-29       0.64      0.74      0.69      1163
       30-39       0.53      0.51      0.52       888
       40-49       0.50      0.45      0.47       500
       50-59       0.50      0.46      0.48       273
       60-69       0.51      0.48      0.49       117
more than 70       0.44      0.33      0.38        42

    accuracy                           0.59      3399
   macro avg       0.55      0.51      0.53      3399
weighted avg       0.58      0.59      0.58      3399



# Save both scalers and models

In [52]:
# Persist the scaler fit on the encoder features. The call is left as the
# cell's final expression, so the list of written paths is shown as output.
joblib.dump(value=enc_scaler, filename="../models/encoder_scaler.joblib")

['../models/encoder_scaler.joblib']

In [53]:
# Persist the scaler fit on the projected features. The call is left as the
# cell's final expression, so the list of written paths is shown as output.
joblib.dump(value=proj_scaler, filename="../models/projected_scaler.joblib")

['../models/projected_scaler.joblib']

In [54]:
# Persist the classifier trained on the encoder features. The call is left as
# the cell's final expression, so the list of written paths is shown as output.
joblib.dump(value=lr_clf_enc_age, filename="../models/lr_clf_enc_age.joblib")

['../models/lr_clf_enc_age.joblib']

In [55]:
# Persist the classifier trained on the projected features. The call is left as
# the cell's final expression, so the list of written paths is shown as output.
joblib.dump(value=lr_clf_proj_age, filename="../models/lr_clf_proj_age.joblib")

['../models/lr_clf_proj_age.joblib']