In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

df = pd.read_csv('/workspaces/ticket-heroes/ticket-prophet/data/processed/CLEAN_artists_spotify_lastfm.csv') 

TARGET_VARIABLE = 'avg_secondary_price' 

SELECTED_FEATURES = [
'avg_daily_price_change', 'num_years_active', 'high_demand_rate', 'avg_price_volatility', 'plays_per_listener', 
'artist_followers', 'github_total_tickets', 'high_demand_rate', 'total_tracks', 'github_avg_price', 
 'high_volatility_rate', 'artist_popularity']


X = df[SELECTED_FEATURES].copy()
y = df[TARGET_VARIABLE].copy()

# Clear out any nan/null values
combined_df = pd.concat([X, y], axis=1).dropna()
X = combined_df[SELECTED_FEATURES]
y = combined_df[TARGET_VARIABLE]
print(f"Dataset size after final cleaning: {len(X)} records")

# 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42 
)
print("\nTraining Linear Regression Baseline Model...")

baseline_model = LinearRegression()


baseline_model.fit(X_train, y_train)
print("Training complete.")

# Make predictions on the test set
y_pred = baseline_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n### Baseline Model Performance (Linear Regression) ###")
print(f"Target Variable: {TARGET_VARIABLE}")
print("-" * 50)
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"R-squared (R2) Score: {r2:.4f}")
print("-" * 50)

# Check the coefficients to understand which features drive the price
print("\n### Model Coefficients (Feature Importance) ###")
coefficients = pd.Series(baseline_model.coef_, index=X.columns).sort_values(ascending=False)
print(coefficients)

Dataset size after final cleaning: 6 records

Training Linear Regression Baseline Model...
Training complete.

### Baseline Model Performance (Linear Regression) ###
Target Variable: avg_secondary_price
--------------------------------------------------
Root Mean Squared Error (RMSE): $1110.24
Mean Absolute Error (MAE): $797.10
R-squared (R2) Score: -8495.1203
--------------------------------------------------

### Model Coefficients (Feature Importance) ###
github_avg_price          0.202143
num_years_active          0.011612
avg_daily_price_change    0.005237
artist_followers          0.000020
high_demand_rate          0.000000
high_demand_rate          0.000000
high_demand_rate          0.000000
high_demand_rate          0.000000
avg_price_volatility     -0.000025
total_tracks             -0.000036
high_volatility_rate     -0.000039
plays_per_listener       -0.000339
artist_popularity        -0.000587
github_total_tickets     -0.150092
dtype: float64


In [5]:
print(df[SELECTED_FEATURES].isna().sum().sort_values(ascending=False))



avg_daily_price_change    2486
avg_price_volatility      2486
high_demand_rate          2483
high_demand_rate          2483
high_volatility_rate      2483
num_years_active          2443
github_avg_price          2402
github_total_tickets      2402
plays_per_listener          53
artist_followers             0
total_tracks                 0
artist_popularity            0
dtype: int64
