**This notebook is based on [the notebook by @INVISIBLE](https://www.kaggle.com/code/vyacheslavbolotin/ps-s5e4-hyperspace-as-feats).**

**Replace OrdinalEncoder/LabelEncoder with more powerful encoding.**

**OrdinalEncoder sorts categories in lexicographical order by default, which carries no information about the target, so we replace it with an encoding that orders categories by their mean target value. [Discussion here](https://www.kaggle.com/competitions/playground-series-s5e4/discussion/573253#3178652).**

**You can use "compare version" to see the changes made.**

In [None]:
pip install -qq scikit-learn==1.6.1

In [None]:
import os, gc, psutil
import polars as pl, pandas as pd;  pd.set_option('display.max_columns', 100) 

import numpy as np

import matplotlib.pyplot as plt;    plt.style.use("seaborn-v0_8")
import seaborn as sns;              sns.set_palette("husl")

import warnings;                    warnings.simplefilter('ignore')

import lightgbm as lgb

from sklearn.preprocessing   import TargetEncoder
from sklearn.model_selection import KFold

from tqdm import tqdm


def memory_usage():
    """
    Report the resident set size (RSS) of the current process.

    :return: String of the form "Memory Usage: <value> MB" with two decimals
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_megabytes = rss_bytes / (1024 * 1024)  # bytes -> megabytes
    return f"Memory Usage: {rss_megabytes:.2f} MB"

In [None]:
# Load the original podcast dataset plus the competition train/test splits and sample submission.
original_df       = pd.read_csv("/kaggle/input/podcast-listening-time-prediction-dataset/podcast_dataset.csv")
train_df          = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv', index_col='id')
test_df           = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv',  index_col='id')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e4/sample_submission.csv')


# Concatenate the original data with the synthetic competition data and drop exact duplicate rows.
# drop_duplicates() returns a new frame, so its result has to be assigned (the previous bare call
# was a no-op). ignore_index=True keeps a clean 0..n-1 index after the removal.
train_df = (
    pd.concat([train_df, original_df], axis=0, ignore_index=True)
      .drop_duplicates(ignore_index=True)
)


# DataFrame.info() prints its report and returns None, so it is called directly
# instead of being wrapped in display() (which would additionally render "None").
print("\nData Info:");                  train_df.info()
print("\nNumerical Features Summary:"); display(train_df.describe())
print("\nFirst 3 rows of Dataset:");    display(train_df.head(3))

In [None]:
# Visual analytics

def numerical_distrib_analysis(data, numerical_features):
    """
    Show, for every numerical feature, a histogram with a KDE overlay next to a
    boxplot, then print the feature's skewness and number of missing values.

    :param data: Pandas DataFrame containing the dataset
    :param numerical_features: List of numerical column names
    """
    for feature in numerical_features:
        column = data[feature]
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(5, 2))

        # Left panel: shape of the distribution
        sns.histplot(column, kde=True, bins=30, ax=hist_ax)
        hist_ax.set_title(f"Histogram of {feature}")
        hist_ax.set_xlabel(feature)
        hist_ax.set_ylabel("Frequency")

        # Right panel: spread and outliers
        sns.boxplot(x=column, ax=box_ax)
        box_ax.set_title(f"Boxplot of {feature}")

        fig.tight_layout()
        plt.show()

        # Summary numbers printed under the figure
        print(f"\nStatistics for {feature}:")
        print(f"Skewness: {column.skew():.2f}")
        print(f"Missing Values: {column.isnull().sum()}")


def categorical_distrib_analysis(data, categorical_features, top_n=10):
    """
    Draw a bar chart of category frequencies for each categorical feature and
    print its number of unique values and missing values.

    :param data: Pandas DataFrame containing the dataset
    :param categorical_features: List of categorical column names
    :param top_n: Features with more than this many unique values only show
                  their top_n most frequent categories
    """
    for feature in categorical_features:
        column = data[feature]
        _, ax = plt.subplots(figsize=(4, 3))

        unique_count = column.nunique()

        if unique_count <= top_n:
            # Low cardinality: plot every category, most frequent first
            sns.countplot(x=column, order=column.value_counts().index, palette="pastel", ax=ax)
            ax.set_title(f"Distribution of {feature}")
        else:
            # High cardinality: restrict the chart to the top_n most frequent categories
            top_categories = column.value_counts().nlargest(top_n)
            sns.barplot(x=top_categories.index, y=top_categories.values, palette="pastel", ax=ax)
            ax.set_title(f"Top {top_n} Categories of {feature}")

        ax.set_xlabel(feature)
        ax.set_ylabel("Count")
        plt.xticks(rotation=45)
        plt.show()

        # Text summary under the chart
        print(f"Feature: {feature}")
        print(f"Number of Unique Values: {unique_count}")
        print(f"Missing Values: {column.isnull().sum()}\n")


def numerical_correlation_analysis(data, numerical_features, target):
    """
    Scatter-plot every numerical feature against the target, then draw the
    correlation heatmap of all listed numerical features.

    :param data: Pandas DataFrame containing the dataset
    :param numerical_features: List of numerical column names
    :param target: Name of the target variable (skipped in the scatter plots)
    """
    for feature in numerical_features:
        if feature == target:
            continue  # plotting the target against itself is pointless

        _, ax = plt.subplots(figsize=(4, 3))
        sns.scatterplot(x=data[feature], y=data[target], alpha=0.5, ax=ax)
        ax.set_title(f"{feature} vs {target}")
        ax.set_xlabel(feature)
        ax.set_ylabel(target)
        plt.show()

    # Pairwise correlations between the listed numerical columns
    _, heat_ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        data[numerical_features].corr(),
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        ax=heat_ax,
    )
    heat_ax.set_title("Correlation Matrix of Numerical Features")
    plt.show()


def categorical_correlation_analysis(data, categorical_features, target, high_cardinality_threshold=10):
    """
    Draw one boxplot of the target per category for each categorical feature.
    Features with too many unique values are reported and skipped.

    :param data: Pandas DataFrame containing the dataset
    :param categorical_features: List of categorical column names
    :param target: Name of the target variable
    :param high_cardinality_threshold: Max number of unique values allowed for plotting
    """
    for feature in categorical_features:
        n_unique = data[feature].nunique()

        if n_unique > high_cardinality_threshold:
            print(f"Skipping {feature}: too many unique values ({n_unique})\n")
            continue

        # Target distribution within each category
        _, ax = plt.subplots(figsize=(5, 3))
        sns.boxplot(x=data[feature], y=data[target], palette='husl', ax=ax)
        ax.set_title(f"{feature} vs {target}")
        ax.set_xlabel(feature)
        ax.set_ylabel(target)
        plt.xticks(rotation=45)
        plt.show()


# Visual analytics in action ----------------------------------------------------


# Numeric columns; the regression target `Listening_Time_minutes` is listed last.
numerical_features = [
    'Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage',
    'Number_of_Ads', 'Listening_Time_minutes',
]

# Categorical columns.
categorical_features = [
    'Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment',
]

In [None]:
def pfe(df):
    """
    Add the "hyperspace" features to a copy of `df` and return it.

    Every row is mapped to nine numeric coordinates (x1..x9); Euclidean
    distances between groups of those coordinates and distances to the
    column means are then appended as extra features.

    Added columns, in this order:
      x1      Episode_Length_minutes, missing values replaced by -3
      x2      Guest_Popularity_percentage, missing values replaced by -7
      x3      episode number parsed from Episode_Title (text after the first
              8 characters), scaled as number / 100 + 10
      x4      missingness code: -21 (x1 and x2 negative), -3 (only x1),
              -7 (only x2), +7 (neither)
      x5..x9  fixed numeric codes for Genre, Podcast_Name, Publication_Day,
              Publication_Time and Episode_Sentiment
      x10     distance of (x1..x9) to the column means of this frame
      _2_*, _3_*, _4_1   distances between coordinate groups
      _x_11   distance of the ten group distances to their column means

    :param df: Pandas DataFrame with the columns named above. It is not modified.
    :return: New DataFrame with a fresh 0..n-1 index (the input index is
             discarded) holding the original columns plus the 21 new ones.
             Category values missing from the lookup tables become NaN.
    """
    _dict_podc = {
        'Mystery Matters': 1.01, 'News Roundup':    1.08, 'Global News':        1.15,
        'Joke Junction'  : 1.02, 'Daily Digest':    1.09, 'Tech Talks':         1.16,
        'Study Sessions' : 1.03, 'Music Matters':   1.10, 'Sport Spot':         1.17,
        'Digital Digest' : 1.04, 'Sports Central':  1.11, 'Funny Folks':        1.18,
        'Mind & Body'    : 1.05, 'Melody Mix':      1.12, 'Sports Weekly':      1.19,
        'Fitness First'  : 1.06, 'Game Day':        1.13, 'Business Briefs':    1.20,
        'Criminal Minds' : 1.07, 'Gadget Geek':     1.14, 'Tech Trends':        1.21, 
        
        'Innovators'     : 2.01, 'Health Hour':     2.08, 'Detective Diaries':  2.15,
        'Sound Waves'    : 2.02, 'Brain Boost':     2.09, "Athlete's Arena":    2.16, 
        'Wellness Wave'  : 2.03, 'Style Guide':     2.10, 'World Watch':        2.17, 
        'Humor Hub'      : 2.04, 'Money Matters':   2.11, 'Healthy Living':     2.18, 
        'Home & Living'  : 2.05, 'Market Masters':  2.12, 'Finance Focus':      2.19,
        'Learning Lab'   : 2.06, 'Lifestyle Lounge':2.13, 'Crime Chronicles':   2.20,
        'Comedy Corner'  : 2.07, 'Life Lessons':    2.14, 'Educational Nuggets':2.21,   
        
        'Current Affairs': 3.01, 'Laugh Line':      3.02, 'True Crime Stories': 3.03, 
        'Fashion Forward': 3.04, 'Tune Time':       3.05, 'Business Insights':  3.06, "TheEnd":3.07
    }
    _dict_genr = {
        'True Crime':      4.01, 'Comedy':          4.02, 'Education':          4.03, 
        'Technology':      4.04, 'Health':          4.05, 'News':               4.06,
        'Music':           4.07, 'Sports':          4.08, 'Business':           4.09, 'Lifestyle': 4.10
    }
    _dict_week = {
        'Monday':          5.01, 'Tuesday':         5.02, 'Wednesday':          5.03, 
        'Thursday':        5.04, 'Friday':          5.05, 'Saturday':           5.06, 'Sunday': 5.07
    }    
    # NOTE(review): 'Night' is coded as 3 while its siblings follow the 7.0x pattern;
    # this looks like a typo for 7.04. Kept unchanged so existing features/scores do not shift — confirm.
    _dict_time = {
        'Morning':         7.01, 'Afternoon':       7.02, 'Evening':            7.03, 'Night': 3
    }
    _dict_sent = {
        'Negative':        8.01, 'Neutral':         8.02, 'Positive':           8.03
    }

    # Groups of (column_a, column_b) coordinate pairs; each feature is
    # sqrt(sum((a - b)**2)) over its pairs.
    _pair_specs = {
        '_2_1': (('x1', 'x3'), ('x2', 'x4')),
        '_2_2': (('x1', 'x5'), ('x2', 'x6')),
        '_2_3': (('x1', 'x7'), ('x2', 'x8')),
        '_2_4': (('x3', 'x5'), ('x4', 'x6')),
        '_2_5': (('x3', 'x7'), ('x4', 'x8')),
        '_2_6': (('x5', 'x7'), ('x6', 'x8')),
        '_3_1': (('x1', 'x4'), ('x2', 'x5'), ('x3', 'x6')),
        '_3_2': (('x1', 'x7'), ('x2', 'x8'), ('x3', 'x9')),
        '_3_3': (('x4', 'x7'), ('x5', 'x8'), ('x6', 'x9')),
        '_4_1': (('x1', 'x5'), ('x2', 'x6'), ('x3', 'x7'), ('x4', 'x8')),
    }

    def _distance_to_mean(frame, cols):
        # Row-wise Euclidean distance to the per-column means (means skip NaN).
        total = 0.0
        for col in cols:
            total = total + (frame[col].mean() - frame[col]) ** 2
        return np.sqrt(total)

    # Work on a fresh frame with a 0..n-1 index so the caller's DataFrame is left untouched.
    df = df.reset_index(drop=True)

    # Direct assignment instead of `df['x1'].fillna(..., inplace=True)`: the chained
    # in-place form is deprecated and does nothing under pandas Copy-on-Write.
    df['x1'] = df['Episode_Length_minutes'].fillna(-3)
    df['x2'] = df['Guest_Popularity_percentage'].fillna(-7)
    df['x3'] = df['Episode_Title'].str[8:].astype(int) / 100 + 10

    # Vectorised missingness code; the first matching condition wins.
    x1_negative = df['x1'] < 0
    x2_negative = df['x2'] < 0
    df['x4'] = np.select(
        [x1_negative & x2_negative, x1_negative, x2_negative],
        [-21, -3, -7],
        default=7,
    )

    # Fixed numeric codes for the categorical columns; .map gives a float column directly.
    for new_col, source_col, codes in (
        ('x5', 'Genre',             _dict_genr),
        ('x6', 'Podcast_Name',      _dict_podc),
        ('x7', 'Publication_Day',   _dict_week),
        ('x8', 'Publication_Time',  _dict_time),
        ('x9', 'Episode_Sentiment', _dict_sent),
    ):
        df[new_col] = df[source_col].map(codes)

    df['x10'] = _distance_to_mean(df, [f'x{i}' for i in range(1, 10)])

    # Distances between coordinate groups, computed from x1..x9 only.
    for new_col, pairs in _pair_specs.items():
        total = 0.0
        for col_a, col_b in pairs:
            total = total + (df[col_a] - df[col_b]) ** 2
        df[new_col] = np.sqrt(total)

    df['_x_11'] = _distance_to_mean(df, list(_pair_specs))

    return df

In [None]:
print("Preparing managing null values..")

# Add the engineered features first: pfe() encodes the missingness of
# Episode_Length_minutes / Guest_Popularity_percentage in x1, x2 and x4
# before the raw columns are imputed below.
train_df = pfe(train_df)
test_df  = pfe(test_df)

print("Missing Values per Column:"); print(train_df.isnull().sum())
print("Missing Values per Column:"); print(test_df .isnull().sum())

# Impute the raw columns with medians learned from the TRAINING data only, and use the
# same value for train and test (previously the test guest popularity was filled with
# the test median). Plain assignment replaces `df[col].fillna(..., inplace=True)`,
# which is deprecated and does nothing under pandas Copy-on-Write.
# NOTE(review): a missing guest popularity could mean "no guest"; the raw column is
# nevertheless median-imputed here, as before.
for col in ('Episode_Length_minutes', 'Guest_Popularity_percentage'):
    train_median = train_df[col].median()
    train_df[col] = train_df[col].fillna(train_median)
    test_df [col] = test_df [col].fillna(train_median)


# Deleting outliers: keep rows with fewer than 10 ads (rows whose Number_of_Ads is
# missing fail the comparison and are removed too), then drop every row that still
# has a missing value. Chained so that dropna never runs in place on a filtered slice.
train_df = train_df[train_df['Number_of_Ads'] < 10].dropna()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class CategoryMeanTransformer(BaseEstimator, TransformerMixin):
    """
    Ordinal encoder that ranks the categories of each column by their mean target value.

    During fit, every category of a column is assigned an integer 0..k-1
    according to the ascending order of the mean of `y` within that category.
    Missing values form their own group (`dropna=False`).

    :param cat_cols: List of column names to encode. If None, all columns of
                     dtype 'category' found in X at fit time are used.

    Fitted attributes:
      cat_cols_  resolved list of encoded column names
      mappings_  dict: column name -> {category: rank}
    """

    def __init__(self, cat_cols=None):
        # Only store the constructor parameter; fitted state is created in fit().
        self.cat_cols = cat_cols

    def fit(self, X, y):
        """
        Learn the category -> rank mapping for every encoded column.

        :param X: Pandas DataFrame containing the columns to encode
        :param y: Target values, one per row of X (array-like; matched to X by position)
        :return: self
        :raises ValueError: if y does not have one value per row of X
        """
        # Resolve the column list locally so the `cat_cols` parameter is never overwritten.
        if self.cat_cols is None:
            cols = X.select_dtypes(include=['category']).columns.tolist()
        else:
            cols = list(self.cat_cols)

        # Positional target: avoids index alignment surprises when y is a Series
        # whose index differs from X's.
        target = np.asarray(y).reshape(-1)
        if len(target) != len(X):
            raise ValueError(f"X has {len(X)} rows but y has {len(target)} values")

        self.cat_cols_ = cols
        self.mappings_ = {}
        for col in cols:
            df_temp = pd.DataFrame({col: X[col].reset_index(drop=True), 'y': target})
            group_means = df_temp.groupby(col, dropna=False)['y'].mean()
            sorted_categories = group_means.sort_values().index
            self.mappings_[col] = {cat: rank for rank, cat in enumerate(sorted_categories)}
        return self

    def transform(self, X, y=None):
        """
        Replace every encoded column present in X by its learned ranks.

        :param X: Pandas DataFrame; columns without a learned mapping are left untouched
        :param y: Ignored
        :return: Encoded copy of X. Categories not seen during fit become NaN.
        :raises sklearn.exceptions.NotFittedError: if called before fit
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'mappings_')
        X = X.copy()
        for col, mapping in self.mappings_.items():
            if col in X.columns:
                X[col] = X[col].map(mapping)
        return X

# Columns handed to the encoder and the target column.
x_col = [
    'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
    'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
    'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
]
y_col = ['Listening_Time_minutes']

# Learn the category ranks on the training rows, then apply the same mapping to train and test.
# Only the columns in `categorical_features` are rewritten; the rest of x_col passes through.
cmtencoder = CategoryMeanTransformer(cat_cols=categorical_features)
cmtencoder.fit(train_df[x_col], train_df[y_col].to_numpy().reshape(-1,))

train_df[x_col] = cmtencoder.transform(train_df[x_col])
test_df[x_col]  = cmtencoder.transform(test_df[x_col])

In [None]:
gc.collect()

# Episode number = the text after the first 8 characters of Episode_Title, stored as a
# categorical column; the title itself is dropped afterwards.
train_df = (
    train_df
    .assign(Episode_Num=train_df['Episode_Title'].str[8:].astype('category'))
    .drop(columns=['Episode_Title'])
)
test_df = (
    test_df
    .assign(Episode_Num=test_df['Episode_Title'].str[8:].astype('category'))
    .drop(columns=['Episode_Title'])
)

display(train_df['Podcast_Name'].nunique(), train_df.head())

In [None]:
from tqdm import tqdm
from itertools import combinations

# Base columns whose 2-, 3- and 4-way combinations become new categorical features.
columns_to_encode = [
    'Episode_Length_minutes', 
    'Episode_Num', 
    'Host_Popularity_percentage', 
    'Number_of_Ads', 
    'Episode_Sentiment', 
    'Publication_Day', 
    'Publication_Time',
    'Genre',
    'Guest_Popularity_percentage'
]

pair_size = [2, 3, 4]


def _joined_category(str_columns, cols):
    """Join the pre-stringified columns `cols` with '_' and return the result as a category Series."""
    joined = str_columns[cols[0]]
    for col in cols[1:]:
        joined = joined + '_' + str_columns[col]
    return joined.astype('category')


# Stringify each base column once. The previous `astype(str).agg('_'.join, axis=1)`
# re-converted the columns and joined row by row in Python for every combination;
# column-wise string concatenation yields the same values far faster.
train_str = {col: train_df[col].astype(str) for col in columns_to_encode}
test_str  = {col: test_df [col].astype(str) for col in columns_to_encode}


for r in pair_size: 
    combinations_list = list(combinations(columns_to_encode, r))
    batch_size = 20  # combinations processed between garbage collections / progress reports

    print('\n pair_size:', r, '\n')
    
    for i in range(0, len(combinations_list), batch_size):
        
        batch = combinations_list[i : i + batch_size]
        
        for cols in tqdm(batch):
            # Column name and column values are both the '_'-joined parts.
            new_col_name = '_'.join(cols)

            train_df[new_col_name] = _joined_category(train_str, cols)
            test_df [new_col_name] = _joined_category(test_str,  cols)
            
        gc.collect()
        
        print(f"Memory usage: {train_df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")
        print(f"Total number of columns: {len(train_df.columns)}")

    print("~"*19)

# The string caches are only needed while building the combinations.
del train_str, test_str
gc.collect()

### Models 

In [None]:
# Target vector first, then the feature matrix = every other column of train_df.
y = train_df['Listening_Time_minutes']
X = train_df.drop(columns=[y.name])

In [None]:
gc.collect()

N_SPLITS =    7  
cv = KFold(N_SPLITS, random_state=42, shuffle=True)
y_pred = np.zeros(len(sample_submission))  # running sum of the per-fold test predictions

# Columns to target-encode: exactly the combination features, selected by name.
# The previous positional slice `train_df.columns[30:]` was written for 19 engineered
# features; pfe() now adds 21, so that slice also caught `_x_11` and `Episode_Num`.
# NOTE(review): this changes which columns are target-encoded compared with the run
# that produced the log below — re-validate the CV score.
encoded_columns = [
    '_'.join(cols)
    for r in pair_size
    for cols in combinations(columns_to_encode, r)
]

for idx_train, idx_valid in cv.split(X, y):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    X_test = test_df[X.columns].copy()

    # Fit the target encoder on the training fold only, then apply it to the
    # validation fold and the test set.
    encoder = TargetEncoder(random_state=42)

    X_train[encoded_columns] = encoder.fit_transform(X_train[encoded_columns], y_train)  
    X_valid[encoded_columns] = encoder.transform(X_valid[encoded_columns])
    X_test [encoded_columns] = encoder.transform(X_test[encoded_columns])

    model = lgb.LGBMRegressor(
        n_estimators     = 1000,  # documented scikit-learn API name (was the `n_iter` alias)
        max_depth        = -1,
        num_leaves       = 2048,
        colsample_bytree = 0.7,
        learning_rate    = 0.015,
        objective        = 'l2',
        metric           = 'rmse', 
        verbosity        = -1,
        #max_bin          = 1024,
        #device           = "gpu",
    )


    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lgb.log_evaluation(100),
            lgb.early_stopping(stopping_rounds=100)
            ],
    )

    y_pred += model.predict(X_test)

    # Release the per-fold frames before the next fold allocates its own.
    del X_train, X_valid, X_test
    gc.collect()

# Average of the fold models' test predictions.
pred_lgbm = y_pred / N_SPLITS

# Training until validation scores don't improve for 100 rounds
# [100]	valid_0's rmse: 13.1727
# [200]	valid_0's rmse: 11.9743
# [300]	valid_0's rmse: 11.8755
# [400]	valid_0's rmse: 11.8561
# [500]	valid_0's rmse: 11.8528
# [600]	valid_0's rmse: 11.8519
# Early stopping, best iteration is:
# [535]	valid_0's rmse: 11.8512

In [None]:
# Pair the ids from the sample submission with the averaged predictions and save them.
submission_lgbm = sample_submission[['id']].assign(Listening_Time_minutes=pred_lgbm)
submission_lgbm.to_csv('submission.csv', index=False)
submission_lgbm.head()