In [1]:
from sklearn.linear_model import LinearRegression
from utils_io import load_step, save_step
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from utils_io import load_step, save_step
import matplotlib.pyplot as plt
import seaborn as sns

# Load the four saved artifacts (train/test targets, then the scaled
# train/test feature frames) through the project's load_step helper, in the
# same order as before. NOTE(review): "nz" presumably marks a filtered
# "non-zero" subset produced by an earlier notebook — confirm.
y_train, y_test, X_train_scaled, X_test_scaled = (
    load_step(step_name)
    for step_name in (
        "y_train_nz",
        "y_test_nz",
        "X_train_nz_scaled",
        "X_test_nz_scaled",
    )
)

In [2]:
# 1. Collect every feature column whose name carries the 'genre_' prefix
genre_columns = [
    column_name
    for column_name in X_train_scaled.columns
    if column_name.startswith('genre_')
]

# 2. For each genre, average the training target (popularity) over the rows
#    where that genre's indicator column equals 1
genre_means = {
    genre_col: y_train[X_train_scaled[genre_col] == 1].mean()
    for genre_col in genre_columns
}

# 3. Define prediction function
def genre_mean_predict(row, genre_means, genre_columns, global_mean=None):
    """Predict a track's target as the average of its genres' mean targets.

    Parameters
    ----------
    row : mapping-like (e.g. one row of a DataFrame)
        Track features; ``row[genre] == 1`` marks the genre as assigned.
    genre_means : dict
        Maps each genre column name to the mean target value for that genre.
        NaN entries (e.g. a genre with no training rows) are ignored.
    genre_columns : list
        Genre column names to look up in ``row``.
    global_mean : float, optional
        Prediction to return when the track has no assigned genre with a
        usable (non-NaN) mean. Pass the overall training-target mean to get a
        true global-mean fallback. When omitted, the unweighted mean of the
        non-NaN per-genre means is used, matching the previous behaviour.

    Returns
    -------
    float
        Mean of the usable per-genre means for the assigned genres, or the
        fallback value described above.
    """
    # Means of all genres assigned to this track (1 = assigned)
    assigned_means = np.asarray(
        [genre_means[genre] for genre in genre_columns if row[genre] == 1],
        dtype=float,
    )

    # Multi-label: average over assigned genres, skipping NaN means so one
    # unseen genre does not turn the whole prediction into NaN.
    if assigned_means.size and not np.isnan(assigned_means).all():
        return np.nanmean(assigned_means)

    # Fallback: no assigned genre, or none with a usable mean.
    if global_mean is not None:
        return global_mean
    return np.nanmean(np.asarray(list(genre_means.values()), dtype=float))

# 4. Generate predictions for the test set: genre_mean_predict is called once
#    per row, with the genre means and genre column list forwarded via `args`
test_predictions = X_test_scaled.apply(
    genre_mean_predict,
    axis=1,
    args=(genre_means, genre_columns),
)


# 5. Evaluate the baseline against the test targets
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
mae = mean_absolute_error(y_test, test_predictions)
r2 = r2_score(y_test, test_predictions)

# One print call, one line per entry — same text as four separate prints
print(
    "Genre-based Baseline:",
    f"  R²:   {r2:.4f}",
    f"  MAE:  {mae:.2f}",
    f"  RMSE: {rmse:.2f}",
    sep="\n",
)


# We take this as the baseline: R² is approx. 0.08 (about 8% of variance explained), which provides a solid reference point.

Genre-based Baseline:
  R²:   0.0827
  MAE:  14.27
  RMSE: 17.29
