## Dataset Description
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Podcast Listening Time Prediction dataset. Feature distributions are close to, but not exactly the same as, the original. Feel free to use the original dataset as part of this competition, both to explore differences and to see whether incorporating the original in training improves model performance.

Files
train.csv - the training dataset; Listening_Time_minutes is the target <br>
test.csv - the test dataset; your objective is to predict the Listening_Time_minutes for each row <br>
sample_submission.csv - a sample submission file in the correct format.

In [None]:
!pip install lightgbm
!pip install xgboost
!pip install plotly

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle as pk
import warnings
warnings.simplefilter(action='ignore',category=Warning)
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [None]:
train = pd.read_csv(r'/kaggle/input/playground-series-s5e4/train.csv')

In [None]:
submission = pd.read_csv(r'/kaggle/input/playground-series-s5e4/sample_submission.csv')

In [None]:
train.head()

In [None]:
train.drop(columns='id',axis=1,inplace=True)

In [None]:
train.columns

## Test Dataset Manipulation

In [None]:
test_df = pd.read_csv(r'/kaggle/input/playground-series-s5e4/test.csv')

In [None]:
test_df.head()

In [None]:
test = test_df.drop(columns='id',axis=1)

In [None]:
test.head()

In [None]:
test.columns

In [None]:
print("Training Data Shape is ",train.shape)
print("Testing Data Shape is ",test.shape)

### Exploratory Data Analysis

In [None]:
# Function for EDA visualizations
def plot_eda(df, target_col=None):
    """Generate exploratory data analysis plots.

    Always draws a heatmap of missing values and, when at least one column
    has missing values, a bar chart of the missing percentage per column.

    When ``target_col`` is given and is a column of ``df``, additionally draws:
    the target histogram, a lower-triangle correlation heatmap of the
    int64/float64 columns, a scatter plot of each of those columns against
    the target, and box plots of the target for up to three object/category
    columns that have fewer than 10 unique values.

    Args:
        df: DataFrame to explore.
        target_col: Optional name of the target column.

    Returns:
        None. Every figure is displayed with ``plt.show()``.
    """
    # Missing values heatmap
    plt.figure(figsize=(12, 6))
    sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.tight_layout()
    plt.show()

    # Missing values percentages (only the columns that actually have gaps)
    missing_percent = df.isnull().mean().sort_values(ascending=False) * 100
    missing_percent = missing_percent[missing_percent > 0]
    # Skip the chart when nothing is missing: there is nothing to draw, and
    # bar-plotting an empty Series can raise instead of producing a figure.
    if not missing_percent.empty:
        plt.figure(figsize=(12, 6))
        missing_percent.plot(kind='bar')
        plt.title('Percentage of Missing Values by Feature')
        plt.ylabel('Percentage')
        plt.tight_layout()
        plt.show()

    # Everything below relates features to the target, so stop here without one.
    if not target_col or target_col not in df.columns:
        return

    # Target distribution (for regression, use histogram)
    plt.figure(figsize=(10, 5))
    sns.histplot(df[target_col], bins=30, kde=True)
    plt.title(f'Distribution of {target_col}')
    plt.xlabel(target_col)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Correlation heatmap for numerical features
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    plt.figure(figsize=(12, 10))
    correlation = df[numerical_cols].corr()
    # Boolean upper-triangle mask. Masking with the correlation values
    # themselves would leave any cell whose correlation is exactly 0 visible.
    mask = np.triu(np.ones_like(correlation, dtype=bool))
    sns.heatmap(correlation, annot=True, fmt=".2f", cmap="coolwarm", mask=mask)
    plt.title('Correlation Heatmap of Numerical Features')
    plt.tight_layout()
    plt.show()

    # Scatter plots for numerical features against target
    for col in numerical_cols:
        if col == target_col:
            continue
        plt.figure(figsize=(12, 6))
        sns.scatterplot(x=df[col], y=df[target_col])
        plt.title(f'{col} vs {target_col}')
        plt.xlabel(col)
        plt.ylabel(target_col)
        plt.tight_layout()
        plt.show()

    # Feature relationships with target for categorical features
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    categorical_cols = [col for col in categorical_cols if col != target_col and df[col].nunique() < 10]

    for col in categorical_cols[:3]:  # Limit to first 3 categorical features
        plt.figure(figsize=(12, 6))
        sns.boxplot(x=col, y=target_col, data=df)
        plt.title(f'{col} vs {target_col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Run EDA on train data
plot_eda(train, 'Listening_Time_minutes')

In [None]:
plot_eda(test)

In [None]:
(train.isnull().mean())*100

In [None]:
(test.isnull().mean())*100

In [None]:
train.info()

In [None]:
train.describe()

## Data PreProcessing

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.columns

## For Train Data

In [None]:
## Group the training columns: object dtype -> categorical, anything else -> numerical;
## numerical columns are then split at 25 distinct values (NaN counts as a value)
num_features, cat_features = [], []
for column in train.columns:
    bucket = cat_features if train[column].dtype == 'O' else num_features
    bucket.append(column)

discrete_features = [
    column for column in num_features
    if train[column].nunique(dropna=False) <= 25
]
continuous_features = [
    column for column in num_features
    if column not in discrete_features
]

print("Number of numerical features : ",len(num_features))
print("Number of categorical features : ",len(cat_features))
print("Number of discrete features : ",len(discrete_features))
print("Number of continuous features :  ",len(continuous_features))

In [None]:
## Check missing values
### Columns of train that contain at least one NaN, with their missing percentage
features_with_nan = [column for column in train.columns if train[column].isnull().any()]
for feature in features_with_nan:
    missing_pct = np.round(train[feature].isnull().mean() * 100, 5)
    print(feature, missing_pct, '% missing values')

In [None]:
## statistics on numerical columns (Null cols)
train[features_with_nan].select_dtypes(exclude='object').describe()

In [None]:
train['Podcast_Name'].value_counts()

In [None]:
train['Episode_Title'].value_counts()

In [None]:
train['Genre'].value_counts()

In [None]:
train['Publication_Day'].value_counts()

In [None]:
train['Publication_Time'].value_counts()

In [None]:
train['Episode_Sentiment'].value_counts()

In [None]:
cat_features

In [None]:
num_features

In [None]:
(train.isnull().mean())*100

In [None]:
### Imputing null values in the numerical training columns
# Episode length and guest popularity: fill gaps with the column median
for column in ('Episode_Length_minutes', 'Guest_Popularity_percentage'):
    train[column] = train[column].fillna(train[column].median())
# Number of ads: a missing value is filled with 0
train['Number_of_Ads'] = train['Number_of_Ads'].fillna(0)

In [None]:
train.head()

In [None]:
(train.isnull().mean())*100

## For Test Data

In [None]:
## Group the test columns: object dtype -> categorical, anything else -> numerical;
## numerical columns are then split at 25 distinct values (NaN counts as a value)
num_features_test, cat_features_test = [], []
for column in test.columns:
    bucket = cat_features_test if test[column].dtype == 'O' else num_features_test
    bucket.append(column)

discrete_features_test = [
    column for column in num_features_test
    if test[column].nunique(dropna=False) <= 25
]
continuous_features_test = [
    column for column in num_features_test
    if column not in discrete_features_test
]

print("Number of numerical features : ",len(num_features_test))
print("Number of categorical features : ",len(cat_features_test))
print("Number of discrete features : ",len(discrete_features_test))
print("Number of continuous features :  ",len(continuous_features_test))

In [None]:
(test.isnull().mean())*100

In [None]:
test.describe()

In [None]:
### Imputing Null Values
## for numerical values
# Apply the same rules as for the training data so both frames are prepared
# identically before they reach the encoders and models.
# Episode length and guest popularity: fill with the *training* median, so a
# missing value gets the same replacement in train and test.
for column in ('Episode_Length_minutes', 'Guest_Popularity_percentage'):
    test[column] = test[column].fillna(train[column].median())
# Number of ads: missing -> 0, as done for the training data
test['Number_of_Ads'] = test['Number_of_Ads'].fillna(0)


In [None]:
(test.isnull().mean())*100

In [None]:
test.head()

## Label Encoding

In [None]:
## Independent features and  dependent features
x = train.drop(['Listening_Time_minutes'],axis=1)
y = train['Listening_Time_minutes']

In [None]:
le1 = LabelEncoder()
le2 = LabelEncoder()
le3  = LabelEncoder()
le4 = LabelEncoder()

In [None]:
x['Podcast_Name'] = le1.fit_transform(x['Podcast_Name'])

In [None]:
x['Episode_Title'] = le2.fit_transform(x['Episode_Title'])

In [None]:
x['Genre'] = le3.fit_transform(x['Genre'])

In [None]:
x['Publication_Day'] = le4.fit_transform(x['Publication_Day'])

## Feature Encoding  and Scaling
### One-hot encoding for columns that have few unique values and are not ordinal

In [None]:
## Column transformer with two transformers: one-hot encoding for the columns in
## onehot_columns and standard scaling for the columns in num_features.
## Every other column is passed through unchanged (remainder='passthrough').
onehot_columns = ['Publication_Time', 'Episode_Sentiment']
num_features = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
]

oh_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    remainder='passthrough',
)

In [None]:
x  = preprocessor.fit_transform(x)

In [None]:
pd.DataFrame(x).head()

In [None]:
## Separate the dataset into a training split (75%) and a cross-validation split (25%)
x_train,x_cv,y_train,y_cv = train_test_split(x,y,test_size=0.25,random_state=42)

In [None]:
x_train.shape

In [None]:
x_cv.shape

In [None]:
x_train_df = pd.DataFrame(x_train)
y_train_df = pd.Series(y_train)

In [None]:
# Define the sample size: 80% of the rows in the training split (x_train_df)
sample_size = int(0.8 * len(x_train_df))  # 80% of the training split


In [None]:
# Randomly sample the data.
# Draw the row positions once and apply them to both the features and the
# target, so the two stay aligned by construction instead of relying on two
# independent .sample() calls happening to pick the same rows.
sampled_positions = np.random.RandomState(42).choice(
    len(x_train_df), size=sample_size, replace=False
)
x_train_sampled = x_train_df.iloc[sampled_positions]
y_train_sampled = y_train_df.iloc[sampled_positions]

In [None]:
## Create a function for Evaluation 
def evaluate_model(true,pred):
    """Return the tuple (MAE, MSE, RMSE, R2) of ``pred`` measured against ``true``."""
    mse = mean_squared_error(true, pred)
    return (
        mean_absolute_error(true, pred),
        mse,
        np.sqrt(mse),  # RMSE is derived from the MSE computed above
        r2_score(true, pred),
    )

In [None]:
## Baseline candidates for model training; insertion order is the order they are trained in
models = {}
models["Random Forest regresssor"] = RandomForestRegressor()
models["AdaBoost Regressor"] = AdaBoostRegressor()
models["GradientBoost Regressor"] = GradientBoostingRegressor()
models["XGBoost Regressor"] = XGBRegressor()
models["LightGBM Regression"] = LGBMRegressor(force_col_wise=True)

In [None]:
# R2 per model name: train_scores on the sampled training rows, test_scores on the cv split
train_scores = {}
test_scores = {}

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every access.
for model_name, model in models.items():
    model.fit(x_train_sampled, y_train_sampled)  # Train on the sampled training rows

    # Make predictions
    y_train_pred = model.predict(x_train_sampled)
    y_cv_pred = model.predict(x_cv)

    # Evaluate on the sampled training rows and on the cross-validation split
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train_sampled, y_train_pred)
    model_cv_mae, model_cv_mse, model_cv_rmse, model_cv_r2 = evaluate_model(y_cv, y_cv_pred)

    print(model_name)

    print('Model performance for Training set')
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    train_scores[model_name] = model_train_r2

    print('-'*35)

    # The "Test set" figures below are computed on the cv split (x_cv / y_cv)
    print('Model performance for Test set')
    print("- Mean Squared Error: {:.4f}".format(model_cv_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_cv_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_cv_mae))
    print("- R2 Score: {:.4f}".format(model_cv_r2))
    test_scores[model_name] = model_cv_r2

    print('='*35)
    print('\n')

In [None]:
# Candidate values for hyperparameter tuning, one dictionary per model family

rf_params = {
    "max_depth": list(range(6, 10, 2)),
    "max_features": [5, 7, 8],
    "min_samples_split": list(range(2, 8, 2)),
    'criterion': ['squared_error'],
    "n_estimators": list(range(200, 500, 50)),
}

xgboost_params = {
    "learning_rate": [0.1, 0.01],
    "max_depth": list(range(6, 10, 2)),
    "n_estimators": list(range(200, 500, 50)),
    # arange yields float steps; rounding keeps the fractions at one decimal
    "colsample_bytree": [round(frac, 1) for frac in np.arange(0.1, 0.6, 0.1)],
}

ada_params = {
    "n_estimators": [50, 60, 70, 80],
    'loss': ['linear', 'square', 'exponential'],
}

gradient_params = {
    "loss": ['squared_error', 'huber', 'absolute_error'],
    "criterion": ['friedman_mse', 'squared_error'],
    "min_samples_split": [2, 8, 15, 12],
    "n_estimators": [100, 200, 500],
    "max_depth": [5, 8, None, 10],
}

light_gbm_params = {
    "n_estimators": list(range(200, 500, 50)),
    "colsample_bytree": [round(frac, 1) for frac in np.arange(0.1, 0.6, 0.1)],
}


In [None]:
rf_params

In [None]:
## (name, estimator, search space) triples for hyperparameter tuning.
## AdaBoost (ada_params) and GradientBoosting (gradient_params) are not part of the search.
randomcv_models = [
    ("RF", RandomForestRegressor(), rf_params),
    ("XG", XGBRegressor(), xgboost_params),
    ("LIGHT", LGBMRegressor(force_col_wise=True), light_gbm_params),
]

In [None]:
randomcv_models

In [None]:
# Run a randomized search (10 candidates, 3-fold CV) per model and keep the
# best parameter set under the model's short name.
model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=10,
        cv=3,
        verbose=2,
        n_jobs=-1,
    )
    model_param[name] = random.fit(x_train, y_train).best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

In [None]:
model_param 

In [None]:
## Retraining the models with best parameters
### creating variables for parameter of the models
# model_param is keyed by the short names used in randomcv_models:
# "RF", "XG" and "LIGHT".

#1. for RANDOM FOREST
estimator = model_param['RF']['n_estimators']
min_sample_split = model_param['RF']['min_samples_split']
max_feature = model_param['RF']['max_features']
max_depths = model_param['RF']['max_depth']
#2. for XGBOOST
estimate = model_param['XG']['n_estimators']
learning_rate_xgb = model_param['XG']['learning_rate']
max_deep = model_param['XG']['max_depth']
colsample_bytree_xgb = model_param['XG']['colsample_bytree']
#3 for lightgbm
# light_gbm_params only searches n_estimators and colsample_bytree, so
# learning_rate and max_depth fall back to LightGBM's defaults (0.1 and -1)
# unless they are added to the search space.
estimate_light = model_param['LIGHT']['n_estimators']
learning_rate_reg = model_param['LIGHT'].get('learning_rate', 0.1)
max_depth_light = model_param['LIGHT'].get('max_depth', -1)
colsample_light = model_param['LIGHT']['colsample_bytree']


## creating training score and testing score dictionaries
train_best_score, test_best_score = {}, {}
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=estimator,
                                                     min_samples_split=min_sample_split,
                                                     max_features=max_feature,
                                                     max_depth=max_depths,
                                                     n_jobs=-1),
    "XGBBoost Regressor": XGBRegressor(n_estimators=estimate,
                                       learning_rate=learning_rate_xgb,
                                       colsample_bytree=colsample_bytree_xgb,
                                       max_depth=max_deep,
                                       n_jobs=-1),
    "LightGBM Regressor": LGBMRegressor(n_estimators=estimate_light,
                                        learning_rate=learning_rate_reg,
                                        max_depth=max_depth_light,
                                        colsample_bytree=colsample_light,
                                        n_jobs=-1),
}
accuracy_score_train, accuracy_score_test = {}, {}
for model_name, model in models.items():
    model.fit(x_train, y_train)  # Train model on the full training split

    # Make predictions; the hold-out data is the cv split (x_cv / y_cv)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_cv)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_cv, y_test_pred)

    print(model_name)

    print('Model performance for Training set')
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    # NOTE: the score dictionaries are keyed by the fitted estimator object, not its name
    train_best_score[model] = model_train_rmse
    accuracy_score_train[model] = model_train_r2

    print('-'*35)

    print('Model performance for Test set')
    print("- Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    test_best_score[model] = model_test_rmse
    accuracy_score_test[model] = model_test_r2

    print('='*35)
    print('\n')