In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from joblib import dump, load
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Notebook-wide presentation settings; the two are independent of each other.
# Plot theme applied to every matplotlib figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# None lifts the column cap so wide frames are displayed without truncation.
pd.options.display.max_columns = None

In [None]:
# All inputs and outputs of this notebook live under a single dataset root.
DATA_DIR = 'dataset'

# Tab-separated train / test tables loaded further down.
TRAIN_TSV = f'{DATA_DIR}/train_21.tsv'
TEST_TSV = f'{DATA_DIR}/test_21.tsv'
# Directory that receives the fitted model and its preprocessing artifacts.
MODEL_FOLDER = f'{DATA_DIR}/models/xgboostAudioParameters'

In [None]:
# --- Load the train / test tables (tab-separated files) ---
df_train = pd.read_csv(TRAIN_TSV, sep='\t')
df_test = pd.read_csv(TEST_TSV, sep='\t')

# Columns used as model inputs; the order here is the column order of X.
TOP_FEATURES = [
    'f0_median', 'f0_mean', 'f0_min', 'f0_max', 'jitter_local',
    'f0_range', 'f0_std', 'hnr', 'shimmer_local', 'formant_1',
    'formant_2', 'formant_4', 'spectral_contrast_mean', 'mfcc_1_mean',
    'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean',
    'mfcc_8_mean', 'mfcc_13_mean', 'spectral_centroid_mean'
]

# Raw age labels that are merged into a coarser bucket.
younger = ['teens', 'twenties']
older = ['sixties', 'seventies', 'eighties', 'nineties']

# One raw-label -> bucket lookup, so train and test are always bucketed by the same rule.
AGE_BUCKETS = {
    **dict.fromkeys(younger, 'twentiesAndUnder'),
    **dict.fromkeys(older, '60plus'),
}


def bucket_age_groups(ages: pd.Series) -> pd.Series:
    """Merge fine-grained age labels into the coarser buckets of ``AGE_BUCKETS``.

    Args:
        ages: Series of raw age labels.

    Returns:
        A new Series in which every label listed in ``AGE_BUCKETS`` is replaced
        by its bucket name; labels not listed there are passed through unchanged.
    """
    return ages.replace(AGE_BUCKETS)


df_train['age'] = bucket_age_groups(df_train['age'])
df_test['age'] = bucket_age_groups(df_test['age'])

# --- Encode the target ---
# The encoder is fitted on the training labels only and reused for the test labels.
# NOTE(review): LabelEncoder.transform raises if the test split contains an age
# label that never occurs in the training split -- confirm both splits share labels.
le = LabelEncoder()
df_train['age_encoded'] = le.fit_transform(df_train['age'])
df_test['age_encoded'] = le.transform(df_test['age'])
y_train = df_train['age_encoded']
y_test = df_test['age_encoded']
X_train = df_train[TOP_FEATURES]
X_test = df_test[TOP_FEATURES]

# --- Scale features ---
# Scaling statistics come from the training split only; the test split is
# transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Oversample the training split ---
# SMOTE is applied to the training data only; the test split is left as loaded.
print("\nUsing SMOTE: ")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"Final training set after SMOTE: {X_train_resampled.shape}")

In [None]:
# Multi-class XGBoost classifier trained on the SMOTE-resampled, scaled features.
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# argument that newer releases report as an unused parameter, and the target is
# already integer-encoded by the LabelEncoder before it reaches the model.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',   # per-class probabilities
    eval_metric='mlogloss',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,              # fixed seed for reproducible runs
    n_jobs=-1                     # use all available cores
)

print("\nStarting model training: ")
xgb_model.fit(X_train_resampled, y_train_resampled)

print("Model has been trained")

In [None]:
# Evaluate on the held-out test split: accuracy, per-class report, confusion matrix.

# Pin the complete set of encoded class ids. Without an explicit `labels=` list,
# scikit-learn infers the classes from y_test / y_pred; if any class is missing
# from both, classification_report rejects `target_names` (length mismatch) and
# the confusion matrix shrinks while the heatmap ticks still list every class.
class_ids = list(range(len(le.classes_)))

y_pred = xgb_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy report: ")
print(f"Accuracy on the test set: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("\nDetailed class report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))

print("\nConfusion matrix")
# Rows are actual classes, columns are predicted classes, both in `class_ids` order,
# which matches the order of le.classes_ used for the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='YlOrRd', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion matrix')
plt.ylabel('Actual class')
plt.xlabel('Predicted class')
plt.show()

In [None]:
# Persist everything needed to reuse the model: the classifier, the fitted scaler,
# the label encoder, and the ordered feature list.
os.makedirs(MODEL_FOLDER, exist_ok=True)

MODEL_PATH = os.path.join(MODEL_FOLDER, 'xgboost_model_features.joblib')
SCALER_PATH = os.path.join(MODEL_FOLDER, 'xgboost_scaler_features.joblib')
# NOTE(review): 'xboost_' (missing 'g') is inconsistent with the sibling file names.
# It is kept as-is because code that loads this artifact may rely on the exact name;
# rename here and in every loader together if desired.
ENCODER_PATH = os.path.join(MODEL_FOLDER, 'xboost_encoder_features.joblib')
FEATURES_PATH = os.path.join(MODEL_FOLDER, 'xgboost_features_list.json')


print("\nSaving model to disk")
dump(xgb_model, MODEL_PATH)
dump(scaler, SCALER_PATH)
dump(le, ENCODER_PATH)
# Explicit encoding keeps the JSON file identical regardless of the platform default.
with open(FEATURES_PATH, 'w', encoding='utf-8') as f:
    json.dump(TOP_FEATURES, f)


print(f"\nModel saved in: {MODEL_PATH}")
print(f"Scaler saved in: {SCALER_PATH}")
print(f"LabelEncoder saved in: {ENCODER_PATH}")
print(f"Features list saved in: {FEATURES_PATH}")