In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from joblib import dump
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Notebook-wide display defaults: never truncate the column list when a
# DataFrame is rendered, and draw all figures with the seaborn whitegrid style.
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Input/output locations, relative to the notebook's working directory.
# The paths are assembled with os.path.join rather than hard-coded backslashes,
# so they also resolve on Linux/macOS; on Windows the resulting strings are
# identical to the previous literals.
DATA_DIR = 'dataset'
VECTOR_PATH = os.path.join(DATA_DIR, 'x_vectors.pkl')    # pickled DataFrame read with pd.read_pickle
TRAIN_TSV = os.path.join(DATA_DIR, 'train_21.tsv')       # tab-separated list of training rows ('path' column)
TEST_TSV = os.path.join(DATA_DIR, 'test_21.tsv')         # tab-separated list of test rows ('path' column)
MODEL_FOLDER = os.path.join(DATA_DIR, 'models', 'xgboostVector')  # output folder for the saved artifacts

In [None]:
# Read the x-vector table and the two split manifests.
# NOTE: unpickling can execute arbitrary code, so VECTOR_PATH must only ever
# point at a file from a trusted source.
df_xvectors = pd.read_pickle(VECTOR_PATH)
df_train_info = pd.read_csv(TRAIN_TSV, sep='\t')
df_test_info = pd.read_csv(TEST_TSV, sep='\t')

# A row belongs to a split when its 'path' value is listed in that split's TSV.
train_paths = set(df_train_info['path'])
test_paths = set(df_test_info['path'])
in_train = df_xvectors['path'].isin(train_paths)
in_test = df_xvectors['path'].isin(test_paths)

# Work on copies so that columns added later never touch df_xvectors itself.
df_train = df_xvectors.loc[in_train].copy()
df_test = df_xvectors.loc[in_test].copy()

print(f"Training set: {len(df_train)}")
print(f"Test set: {len(df_test)}")

def class_reduce(df):
    """Merge the youngest and the oldest age labels into two coarser groups.

    'teens' and 'twenties' become 'twentiesAndUnder'; 'sixties', 'seventies',
    'eighties' and 'nineties' become '60plus'. Every other value is kept as is.

    Args:
        df: DataFrame with an 'age' column. The column is overwritten in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (labels to merge, replacement label), applied in this order.
    merges = (
        (['teens', 'twenties'], 'twentiesAndUnder'),
        (['sixties', 'seventies', 'eighties', 'nineties'], '60plus'),
    )
    for labels, merged_label in merges:
        df['age'] = df['age'].replace(labels, merged_label)
    return df

# Collapse the youngest/oldest age labels in both splits before encoding.
df_train, df_test = class_reduce(df_train), class_reduce(df_test)

# Targets: the encoder is fitted on the training labels only and then reused
# for the test labels, so both splits share one label -> integer mapping.
le = LabelEncoder()
df_train['age_encoded'] = le.fit_transform(df_train['age'])
df_test['age_encoded'] = le.transform(df_test['age'])
y_train, y_test = df_train['age_encoded'], df_test['age_encoded']

# Features: expand each entry of the 'x_vector' column into one row of a frame.
train_vectors = df_train['x_vector'].tolist()
test_vectors = df_test['x_vector'].tolist()
X_train = pd.DataFrame(train_vectors)
X_test = pd.DataFrame(test_vectors)

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the (scaled) training data only; the test split is left untouched.
print("\nUsing SMOTE")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_scaled,
    y_train,
)

print(f"Final training set after SMOTE: {X_train_resampled.shape}")

In [None]:
# Multi-class gradient-boosted tree classifier trained on the SMOTE-resampled,
# standardised training data.
#
# `use_label_encoder` is deliberately not passed: the parameter is deprecated in
# recent XGBoost releases, where supplying it only produces a warning. The
# targets are already integer codes produced by LabelEncoder, so no label
# encoding inside XGBoost is needed.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',   # per-class probabilities
    eval_metric='mlogloss',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,              # fixed seed for reproducible training
    n_jobs=-1                     # use all available cores
)

print("\nStarting training process:")
xgb_model.fit(X_train_resampled, y_train_resampled)
print("Model has been trained successfully")

In [None]:
# Integer ids of every class known to the encoder, in the same order as
# le.classes_. Passing them explicitly keeps the report rows and the confusion
# matrix axes aligned with le.classes_ even when some class never occurs in
# y_test or y_pred; without `labels=` scikit-learn infers the label set from the
# data, which makes classification_report raise on the target_names length
# mismatch and mislabels the heatmap ticks.
class_ids = np.arange(len(le.classes_))

y_pred = xgb_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Heading typo ("repot") fixed; plain string because there are no placeholders.
print("\nAccuracy report: ")
print(f"Accuracy on the test set: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("\nDetailed class report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=le.classes_,
    zero_division=0,
))

# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion matrix')
plt.ylabel('Actual class')
plt.xlabel('Predicted class')
plt.show()

In [None]:
# Make sure the output folder exists; an already existing folder is fine.
os.makedirs(MODEL_FOLDER, exist_ok=True)

# One file per artifact, all stored inside MODEL_FOLDER.
MODEL_PATH, SCALER_PATH, ENCODER_PATH = (
    os.path.join(MODEL_FOLDER, filename)
    for filename in (
        'final_xgboost_xvector_model.joblib',
        'final_xvector_scaler.joblib',
        'final_age_encoder_5_classes.joblib',
    )
)

print("\nSaving model to disk")
# Persist the classifier together with the fitted scaler and label encoder,
# in that order.
for artifact, destination in (
    (xgb_model, MODEL_PATH),
    (scaler, SCALER_PATH),
    (le, ENCODER_PATH),
):
    dump(artifact, destination)

print(f"Model saved in: {MODEL_PATH}")
print(f"Scaler saved in: {SCALER_PATH}")
print(f"LabelEncoder saved in: {ENCODER_PATH}")