In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Notebook-wide display settings.
# Show every DataFrame column instead of eliding the middle ones.
pd.options.display.max_columns = None
# Apply the whitegrid look to every figure drawn below.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Locations of the tab-separated train/test feature tables.
# Raw strings keep the backslashes literal: in a normal string literal "\a"
# is the BEL control character, so "\audio_..." silently produced a path
# that can never exist.
# NOTE(review): the leading backslash makes these paths relative to the root
# of the current drive on Windows -- confirm that is the intended location.
train_path = r'\dataset\audio_features_metadata_train.tsv'
test_path = r'\dataset\audio_features_metadata_test.tsv'

try:
    df_train = pd.read_csv(train_path, sep='\t')
    df_test = pd.read_csv(test_path, sep='\t')
    print(f"Found {len(df_train)} training rows")
    print(f"Found {len(df_test)} test rows")
except FileNotFoundError as e:
    print(f"File not found: \n{e}")
    # Re-raise so the notebook stops here; otherwise the following steps fail
    # with a misleading NameError on df_train / df_test.
    raise

# Turn the age labels into integer codes. The encoder is fitted on the
# training labels only and then applied unchanged to both splits, so both
# share the same label -> code mapping.
age_var = 'age'
le = LabelEncoder().fit(df_train[age_var])
df_train['age_encoded'] = le.transform(df_train[age_var])
df_test['age_encoded'] = le.transform(df_test[age_var])

# Show which integer code was assigned to each age class.
print("\nAge classes:")
for code, label in enumerate(le.classes_):
    print(f"  {label}: {code}")

# Columns that must not be used as model inputs: identifiers, the raw label,
# and the encoded label itself. 'age_encoded' is the prediction target, so
# leaving it in the feature matrix would leak the answer to the model.
drop_rows = ['client_id', 'path', 'age', 'gender', 'age_encoded']

X_train = df_train.drop(columns=drop_rows)
y_train = df_train['age_encoded']

X_test = df_test.drop(columns=drop_rows)
y_test = df_test['age_encoded']

# Give the test features the same columns, in the same order, as training.
X_test = X_test[X_train.columns]

# Guard against the target slipping back into the inputs.
assert 'age_encoded' not in X_train.columns, "target column leaked into features"

print(f"\nData ready: {len(X_train.columns)}")

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling completed")


In [None]:
# Hyper-parameters for the forest, gathered in one place.
# oob_score=True is needed so that rf_model.oob_score_ exists after fitting.
rf_params = dict(
    n_estimators=100,
    class_weight='balanced',
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)

print("Starting training")
rf_model.fit(X_train_scaled, y_train)
print("Model has been trained")

# Report the out-of-bag score computed during fitting.
print(f"Out-of-Bag (OOB) : {rf_model.oob_score_:.4f}")

In [None]:
# Score the fitted forest on the held-out test split.
y_pred = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Pin the label set to every class the encoder knows. Without this the report
# and the matrix are sized by the classes that happen to appear in
# y_test / y_pred, which no longer lines up with the class names when an age
# class is absent from the test split.
class_ids = list(range(len(le.classes_)))
# Report headers and tick labels must be strings.
class_names = [str(name) for name in le.classes_]

print("\nFull report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))


print("\nConfusion matrix:")
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion matrix')
plt.ylabel('Real age class')
plt.xlabel('Predicted age class')
plt.show()

In [None]:
# Pair every input column with the importance the fitted forest assigned to
# it, ordered from most to least important.
importances = rf_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
)

# The 20 highest-ranked rows feed both the table and the chart below.
top_features = feature_importance_df.head(20)

print("Top 20 most important features for age group prediction:")
display(top_features)

# Horizontal bar chart of the same 20 rows.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=top_features, palette='viridis', ax=ax)
ax.set_title('Top 20 most important features for age group prediction')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()