Metrics:
Primary: Mean Absolute Error (MAE) to measure average prediction error in minutes.

Secondary: Root Mean Squared Error (RMSE) to penalize larger errors, and R² to assess the proportion of variance explained.

Cross-Validation: Evaluate model stability using k-fold cross-validation scores on the training set.

Test Set Performance: Report MAE, RMSE, and R² on the test set to assess generalization.

Residual Analysis: Plot residuals to check for patterns (e.g., systematic over/under-prediction).

Model Comparison: Compare baseline and advanced models to select the best performer.

Iterate: If performance is poor, revisit feature engineering (e.g., add interaction terms) or try ensemble methods.



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Read the engineered dataset from CSV and list its columns.
df = pd.read_csv('engineered_podcast_data.csv')
print("Columns in engineered dataset:", df.columns.to_list())

# 1. Train-Test Split
# Target (y) is the listening time; features (X) are every other column.
y = df.loc[:, 'Listening_Time_minutes']
X = df.drop(columns=['Listening_Time_minutes'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Columns in engineered dataset: ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Listening_Time_minutes', 'Genre_Business', 'Genre_Comedy', 'Genre_Education', 'Genre_Health', 'Genre_Lifestyle', 'Genre_Music', 'Genre_News', 'Genre_Sports', 'Genre_Technology', 'Genre_True Crime', 'Publication_Day_Friday', 'Publication_Day_Monday', 'Publication_Day_Saturday', 'Publication_Day_Sunday', 'Publication_Day_Thursday', 'Publication_Day_Tuesday', 'Publication_Day_Wednesday', 'Episode_Sentiment_Negative', 'Episode_Sentiment_Neutral', 'Episode_Sentiment_Positive', 'Publication_Time_Afternoon', 'Publication_Time_Evening', 'Publication_Time_Morning', 'Publication_Time_Night', 'Podcast_Name_Encoded', 'Title_Length', 'Title_Has_interview', 'Title_Has_exclusive', 'Title_Has_special', 'Title_Has_guest', 'Title_BOW_12', 'Title_BOW_18', 'Title_BOW_19', 'Title_BOW_20', 'Title_BOW_23', 'Title_BOW_24', 'Title_BOW_26', 'Title_BOW_27', 'Title_BOW_28', 'Tit

In [9]:
# 2. Model Selection
# Candidate estimators, keyed by the name used for logging and for the
# saved model file.
models = dict(LinearRegression=LinearRegression())

# Fitted estimator per model name; populated by the training loop.
model_results = dict()

# 3. Train and Tune Models
# Hyperparameter grids keyed by the model name used in `models`.
# A model with no entry here is fitted directly with the parameters it was
# constructed with. Looking the grid up per model (instead of assigning it in
# an if/elif chain) guarantees that a newly added model can never hit an
# unbound `param_grid` or silently reuse the grid of the previous model.
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.1]
    }
}

for name, model in models.items():
    print(f"\nTraining {name}...")

    param_grid = param_grids.get(name)
    if param_grid is None:
        # No grid registered (e.g. LinearRegression): plain fit, no tuning.
        model.fit(X_train, y_train)
        model_results[name] = model
    else:
        # 5-fold grid search on the training set, selecting by MAE.
        # scikit-learn maximizes scores, hence the negated-MAE scorer.
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=5,
            scoring='neg_mean_absolute_error',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        # Store best model
        model_results[name] = grid_search.best_estimator_
        print(f"Best parameters for {name}:", grid_search.best_params_)
        # best_score_ is negative MAE; flip the sign for reporting.
        print(f"Best CV MAE for {name}:", -grid_search.best_score_)

    # Evaluate on training set (in-sample error, for comparison with the
    # test-set error reported later).
    y_train_pred = model_results[name].predict(X_train)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    print(f"{name} Training MAE: {train_mae:.2f}")

# 4. Feature Importance (for tree-based models)
# Only models that were actually trained are reported; the rest are skipped.
for name in ('RandomForest', 'XGBoost'):
    if name not in model_results:
        continue

    model = model_results[name]
    importances = model.feature_importances_
    # Pair each training column with its importance, most important first.
    feature_importance = pd.DataFrame(
        {'Feature': X_train.columns, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"\nTop 5 features for {name}:\n", feature_importance.head())

# 5. Save Models
# Persist every fitted estimator to '<name>_model.pkl' in the working directory.
for name, model in model_results.items():
    pkl_path = f'{name}_model.pkl'
    joblib.dump(model, pkl_path)
    print(f"Saved {name} model as {name}_model.pkl")


Training LinearRegression...
LinearRegression Training MAE: 9.80
Saved LinearRegression model as LinearRegression_model.pkl


In [12]:
# Inspect the feature columns of X (the target 'Listening_Time_minutes' was
# dropped when X was built).
X.columns

Index(['Episode_Length_minutes', 'Host_Popularity_percentage',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre_Business',
       'Genre_Comedy', 'Genre_Education', 'Genre_Health', 'Genre_Lifestyle',
       'Genre_Music', 'Genre_News', 'Genre_Sports', 'Genre_Technology',
       'Genre_True Crime', 'Publication_Day_Friday', 'Publication_Day_Monday',
       'Publication_Day_Saturday', 'Publication_Day_Sunday',
       'Publication_Day_Thursday', 'Publication_Day_Tuesday',
       'Publication_Day_Wednesday', 'Episode_Sentiment_Negative',
       'Episode_Sentiment_Neutral', 'Episode_Sentiment_Positive',
       'Publication_Time_Afternoon', 'Publication_Time_Evening',
       'Publication_Time_Morning', 'Publication_Time_Night',
       'Podcast_Name_Encoded', 'Title_Length', 'Title_Has_interview',
       'Title_Has_exclusive', 'Title_Has_special', 'Title_Has_guest',
       'Title_BOW_12', 'Title_BOW_18', 'Title_BOW_19', 'Title_BOW_20',
       'Title_BOW_23', 'Title_BOW_24', 'Titl

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt

# X_test / y_test are the in-memory hold-out split created by
# train_test_split; the reload from X_test.csv / y_test.csv is disabled.
print("Test set shape:", X_test.shape)

# Names of the saved models to score, and the per-model metrics collected below.
model_names = ['LinearRegression']
results = {}

# Score each saved model on the test set and save its residual plot.
for name in model_names:
    try:
        # Reload the estimator written by the training cell.
        model = joblib.load(f'{name}_model.pkl')
        y_pred = model.predict(X_test)

        # MAE, RMSE (square root of MSE) and R2 on the test set.
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        results[name] = dict(MAE=mae, RMSE=rmse, R2=r2)
        print(f"\n{name} Test Set Performance:")
        print(f"MAE: {mae:.2f} minutes")
        print(f"RMSE: {rmse:.2f} minutes")
        print(f"R2: {r2:.2f}")

        # Residuals against predictions; the dashed line marks zero error.
        residuals = y_test - y_pred
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(y_pred, residuals, alpha=0.5)
        ax.axhline(y=0, color='r', linestyle='--')
        ax.set_xlabel('Predicted Listening Time (minutes)')
        ax.set_ylabel('Residuals (Actual - Predicted)')
        ax.set_title(f'Residual Plot for {name}')
        fig.savefig(f'residual_plot_{name}.png')
        # The figure is written to disk, not displayed; close it to free memory.
        plt.close(fig)
        print(f"Residual plot saved as residual_plot_{name}.png")

    except FileNotFoundError:
        print(f"{name} model not found. Skipping evaluation.")

# Compare models
# One row per model, one column per metric.
print("\nModel Comparison:")
results_df = pd.DataFrame(results).transpose()
print(results_df)

# Pick the model with the lowest MAE, if any model was scored.
if not results:
    print("No models evaluated.")
else:
    best_model = results_df['MAE'].idxmin()
    print(f"\nBest model (lowest MAE): {best_model} with MAE = {results_df.loc[best_model, 'MAE']:.2f}")

Test set shape: (150000, 87)

LinearRegression Test Set Performance:
MAE: 9.75 minutes
RMSE: 13.28 minutes
R2: 0.76
Residual plot saved as residual_plot_LinearRegression.png

Model Comparison:
                       MAE       RMSE        R2
LinearRegression  9.749835  13.276996  0.760435

Best model (lowest MAE): LinearRegression with MAE = 9.75
