Train-Test Split: Split data into 80% training and 20% testing sets, ensuring Listening_Time_minutes is the target variable.

Model Selection:
Start with a baseline model like Linear Regression.

Experiment with tree-based models (Random Forest, XGBoost, LightGBM) for non-linear relationships.

Optionally, test neural networks for complex patterns if the dataset is large enough.

Hyperparameter Tuning:
Use grid search or random search with cross-validation (e.g., 5-fold) to optimize parameters (e.g., tree depth, learning rate).

Feature Importance: For tree-based models, analyze feature importance to identify key drivers of listening time.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load engineered dataset
df = pd.read_csv('engineered_podcast_data.csv')
column_names = df.columns.tolist()
print("Columns in engineered dataset:", column_names)

# 1. Train-Test Split
# The target (y) is the listening time; every remaining column is a feature (X).
y = df['Listening_Time_minutes']
X = df.drop(columns='Listening_Time_minutes')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# 2. Model Selection
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained; the seeded estimators use a fixed random_state.
models = {}
models['LinearRegression'] = LinearRegression()
models['RandomForest'] = RandomForestRegressor(random_state=42)
models['XGBoost'] = XGBRegressor(random_state=42)

# Maps each model name to its fitted estimator once training has run
model_results = dict()

# 3. Train and Tune Models
# Hyperparameter search spaces keyed by model name. A model with no entry here
# (currently only LinearRegression) is fit once with its constructor settings.
# Looking the grid up by name, rather than assigning it in an if/elif chain,
# means a model added to `models` later can never silently reuse the grid left
# over from the previous loop iteration or hit an undefined `param_grid`.
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.1],
    },
}

for name, model in models.items():
    print(f"\nTraining {name}...")

    param_grid = param_grids.get(name)
    if param_grid is None:
        # Nothing to tune: fit directly on the training split
        model.fit(X_train, y_train)
        model_results[name] = model
    else:
        # Exhaustive search over the grid with 5-fold cross-validation,
        # ranking candidates by (negated) mean absolute error
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=5,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)

        # Store best model
        model_results[name] = grid_search.best_estimator_
        print(f"Best parameters for {name}:", grid_search.best_params_)
        # The scorer is negated MAE, so flip the sign to report a plain MAE
        print(f"Best CV MAE for {name}:", -grid_search.best_score_)

    # Evaluate on training set (in-sample error only; the test split is
    # not touched in this loop)
    y_train_pred = model_results[name].predict(X_train)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    print(f"{name} Training MAE: {train_mae:.2f}")

# 4. Feature Importance (for tree-based models)
# Rank the training columns by each tree ensemble's importance scores.
tree_model_names = ('RandomForest', 'XGBoost')
for name in tree_model_names:
    if name not in model_results:
        # No fitted estimator stored under this name; nothing to report
        continue

    model = model_results[name]
    importances = model.feature_importances_
    feature_importance = pd.DataFrame(
        {'Feature': X_train.columns, 'Importance': importances}
    )
    # Highest importance first, so head() shows the leading five rows
    feature_importance = feature_importance.sort_values(
        by='Importance', ascending=False
    )
    print(f"\nTop 5 features for {name}:\n", feature_importance.head())

# 5. Save Models
# Write each fitted estimator to its own file, named after the model.
for name, model in model_results.items():
    model_path = f'{name}_model.pkl'
    joblib.dump(model, model_path)
    print(f"Saved {name} model as {name}_model.pkl")

# Save test set for evaluation
# Both files are written without the index column.
for held_out, csv_path in ((X_test, 'X_test.csv'), (y_test, 'y_test.csv')):
    held_out.to_csv(csv_path, index=False)
print("Test set saved as X_test.csv and y_test.csv")

Columns in engineered dataset: ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Listening_Time_minutes', 'Genre_Business', 'Genre_Comedy', 'Genre_Education', 'Genre_Health', 'Genre_Lifestyle', 'Genre_Music', 'Genre_News', 'Genre_Sports', 'Genre_Technology', 'Genre_True Crime', 'Publication_Day_Friday', 'Publication_Day_Monday', 'Publication_Day_Saturday', 'Publication_Day_Sunday', 'Publication_Day_Thursday', 'Publication_Day_Tuesday', 'Publication_Day_Wednesday', 'Episode_Sentiment_Negative', 'Episode_Sentiment_Neutral', 'Episode_Sentiment_Positive', 'Publication_Time_Afternoon', 'Publication_Time_Evening', 'Publication_Time_Morning', 'Publication_Time_Night', 'Podcast_Name_Encoded', 'Title_Length', 'Title_Has_interview', 'Title_Has_exclusive', 'Title_Has_special', 'Title_Has_guest', 'Title_BOW_12', 'Title_BOW_18', 'Title_BOW_19', 'Title_BOW_20', 'Title_BOW_23', 'Title_BOW_24', 'Title_BOW_26', 'Title_BOW_27', 'Title_BOW_28', 'Tit

