# Model Inference Notebook

* Author: Finian O'Neill
* Purpose: Load in the test data and then make predictions using the best model from model training.

### Setup

In [9]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
import json

In [3]:
def load_best_model(saved_paths, results, task='classification', metric=None):
    """
    Load the best model based on evaluation metrics

    For classification the model with the highest metric value wins; for any
    other task the model with the lowest value wins (e.g. RMSE). Ties go to
    the first model in `saved_paths`.

    Parameters:
    -----------
    saved_paths : list or dict
        Saved model file paths. A list is expected to hold files named
        '<prefix>_<model name>.<ext>' (e.g. 'models/model_random_forest.pkl');
        the model name is everything after the first underscore of the file
        name, up to the first dot. A dict is read as {model name: path}.
    results : dict
        Dictionary containing evaluation metrics for each model, keyed by
        model name and then by metric name
    task : str, default='classification'
        Type of task: 'classification' or 'regression'. Any value other than
        'classification' is handled like regression.
    metric : str, optional
        Specific metric to use for model selection. Defaults to 'accuracy'
        for classification and 'rmse' otherwise.

    Returns:
    --------
    object
        The best trained model

    Raises:
    -------
    ValueError
        If `saved_paths` is empty
    KeyError
        If a model name, or the metric for a model, is missing from `results`
    """
    if metric is None:
        metric = 'accuracy' if task == 'classification' else 'rmse'

    if isinstance(saved_paths, dict):
        # names are given explicitly, no need to parse them out of the paths
        model_names = list(saved_paths.keys())
        model_paths = list(saved_paths.values())
    else:
        model_paths = list(saved_paths)
        model_names = []
        for path in model_paths:
            # look at the file name only, so underscores in directory names
            # cannot be mistaken for the prefix separator
            file_name = str(path).replace('\\', '/').rsplit('/', 1)[-1]
            stem = file_name.split('.')[0]
            # split on the FIRST underscore only, so multi-word names such as
            # 'random_forest' stay intact
            _, sep, name = stem.partition('_')
            model_names.append(name if sep else stem)

    if not model_paths:
        raise ValueError('saved_paths is empty; there is no model to load')

    scores = []
    for name in model_names:
        if name not in results:
            raise KeyError(f"no evaluation results found for model '{name}'")
        if metric not in results[name]:
            raise KeyError(f"metric '{metric}' not found in results for model '{name}'")
        scores.append(results[name][metric])

    # higher is better for classification, lower is better otherwise (e.g. RMSE)
    pick = max if task == 'classification' else min
    best_idx = pick(range(len(scores)), key=scores.__getitem__)

    # NOTE: joblib.load unpickles the file - only load artifacts you trust
    best_model = joblib.load(model_paths[best_idx])

    return best_model

### Data Load

In [5]:
# load in the test dataset (path is relative to the notebook's working
# directory) and preview the first rows
test_df = pd.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral
1,750001,Sound Waves,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral
2,750002,Joke Junction,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive
3,750003,Comedy Corner,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive
4,750004,Life Lessons,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral


In [10]:
# load in the saved results
# (the `with` block closes the file on exit, so no explicit close() is needed)
with open('artifacts/tree_model_results.json', 'r') as file:
    results = json.load(file)

In [12]:
# map each candidate model's name to the location of its saved artifact
saved_paths = dict(
    random_forest='artifacts/models/model_random_forest.pkl',
    xgboost='artifacts/models/model_xgboost.pkl',
    lightgbm='artifacts/models/model_lightgbm.pkl',
    catboost='artifacts/models/model_catboost.pkl',
)

In [14]:
# load in the podcast name cluster map
# (the `with` block closes the file on exit, so no explicit close() is needed)
with open('artifacts/podcast_name_cluster_map.json', 'r') as file:
    podcast_name_cluster_map = json.load(file)

### Feature Engineering

#### 1) Podcast_Name

In [16]:
# create copy of original dataframe to encode, so the feature engineering
# below does not modify `test_df` itself
test_encoded_df = test_df.copy()

In [17]:
# create the 'Podcast_Name_clustered' column by looking each podcast name up
# in the saved cluster map; names missing from the map are labelled 'unknown'
test_encoded_df['Podcast_Name_clustered'] = test_encoded_df['Podcast_Name'].map(podcast_name_cluster_map).fillna('unknown')
test_encoded_df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,Tech Trends_cluster_2
1,750001,Sound Waves,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral,Tech Trends_cluster_2
2,750002,Joke Junction,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,Tech Talks_cluster_0
3,750003,Comedy Corner,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive,Tech Talks_cluster_0
4,750004,Life Lessons,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral,Sports Weekly_cluster_1


In [18]:
# now one hot encode the clustered values
# NOTE(review): get_dummies only creates columns for the categories present in
# this dataframe (an 'unknown' cluster would add an extra column) - confirm the
# resulting columns match the ones the model was trained on
test_encoded_df = pd.get_dummies(test_encoded_df, columns=['Podcast_Name_clustered'])
test_encoded_df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,False,False,True
1,750001,Sound Waves,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral,False,False,True
2,750002,Joke Junction,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,False,True,False
3,750003,Comedy Corner,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive,False,True,False
4,750004,Life Lessons,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral,True,False,False


In [19]:
# drop the no longer needed 'Podcast_Name' column (it is now represented by
# the one-hot encoded cluster columns)
test_encoded_df = test_encoded_df.drop(columns=['Podcast_Name'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2
0,750000,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,False,False,True
1,750001,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral,False,False,True
2,750002,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,False,True,False
3,750003,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive,False,True,False
4,750004,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral,True,False,False


#### 2) Episode_Title

In [20]:
# Episode titles look like 'Episode <n>' with n ranging from 1 to 100. To keep
# things simple, each podcast's episodes are binned into three groups:
# early --> 1 - 33
# mid --> 34 - 66
# late --> 67 - 100
early_list = [f'Episode {n}' for n in range(1, 34)]
mid_list = [f'Episode {n}' for n in range(34, 67)]
late_list = [f'Episode {n}' for n in range(67, 101)]

# create episode dict: title -> bin label (inserted in early, mid, late order)
episode_map = {
    **dict.fromkeys(early_list, 'early'),
    **dict.fromkeys(mid_list, 'mid'),
    **dict.fromkeys(late_list, 'late'),
}

print(episode_map)

{'Episode 1': 'early', 'Episode 2': 'early', 'Episode 3': 'early', 'Episode 4': 'early', 'Episode 5': 'early', 'Episode 6': 'early', 'Episode 7': 'early', 'Episode 8': 'early', 'Episode 9': 'early', 'Episode 10': 'early', 'Episode 11': 'early', 'Episode 12': 'early', 'Episode 13': 'early', 'Episode 14': 'early', 'Episode 15': 'early', 'Episode 16': 'early', 'Episode 17': 'early', 'Episode 18': 'early', 'Episode 19': 'early', 'Episode 20': 'early', 'Episode 21': 'early', 'Episode 22': 'early', 'Episode 23': 'early', 'Episode 24': 'early', 'Episode 25': 'early', 'Episode 26': 'early', 'Episode 27': 'early', 'Episode 28': 'early', 'Episode 29': 'early', 'Episode 30': 'early', 'Episode 31': 'early', 'Episode 32': 'early', 'Episode 33': 'early', 'Episode 34': 'mid', 'Episode 35': 'mid', 'Episode 36': 'mid', 'Episode 37': 'mid', 'Episode 38': 'mid', 'Episode 39': 'mid', 'Episode 40': 'mid', 'Episode 41': 'mid', 'Episode 42': 'mid', 'Episode 43': 'mid', 'Episode 44': 'mid', 'Episode 45': 'mid

In [21]:
# bin the actual 'Episode_Title' column using episode_map; titles that are not
# in the map are labelled 'unknown'
test_encoded_df['Episode_Title_binned'] = test_encoded_df['Episode_Title'].map(episode_map).fillna('unknown')
test_encoded_df.head()

Unnamed: 0,id,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned
0,750000,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,False,False,True,late
1,750001,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral,False,False,True,early
2,750002,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,False,True,False,early
3,750003,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive,False,True,False,late
4,750004,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral,True,False,False,mid


In [22]:
# now one hot encode the binned values (one column per bin present in the data)
test_encoded_df = pd.get_dummies(test_encoded_df, columns=['Episode_Title_binned'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,Episode_Title_binned_late,Episode_Title_binned_mid
0,750000,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,False,False,True,False,True,False
1,750001,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral,False,False,True,True,False,False
2,750002,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,False,True,False,True,False,False
3,750003,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive,False,True,False,False,True,False
4,750004,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral,True,False,False,False,False,True


In [23]:
# drop the no longer needed 'Episode_Title' column (it is now represented by
# the one-hot encoded bin columns)
test_encoded_df = test_encoded_df.drop(columns=['Episode_Title'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,Episode_Title_binned_late,Episode_Title_binned_mid
0,750000,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral,False,False,True,False,True,False
1,750001,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral,False,False,True,True,False,False
2,750002,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive,False,True,False,True,False,False
3,750003,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive,False,True,False,False,True,False
4,750004,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral,True,False,False,False,False,True


#### 3) Genre

In [24]:
# one-hot encode the Genre since there are only 10 categories
# (replaces 'Genre' with one 'Genre_<value>' column per category)
test_encoded_df = pd.get_dummies(test_encoded_df, columns=['Genre'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,...,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,Genre_News,Genre_Sports,Genre_Technology,Genre_True Crime
0,750000,78.96,38.11,Saturday,Evening,53.33,1.0,Neutral,False,False,...,False,False,True,False,False,False,False,False,False,False
1,750001,27.87,71.29,Sunday,Morning,,0.0,Neutral,False,False,...,False,False,False,False,False,True,False,False,False,False
2,750002,69.1,67.89,Friday,Evening,97.51,0.0,Positive,False,True,...,False,True,False,False,False,False,False,False,False,False
3,750003,115.39,23.4,Sunday,Morning,51.75,2.0,Positive,False,True,...,False,True,False,False,False,False,False,False,False,False
4,750004,72.32,58.1,Wednesday,Morning,11.3,2.0,Neutral,True,False,...,False,False,False,False,True,False,False,False,False,False


#### 4) Publication_Day

In [25]:
# one-hot encode the Publication_Day since there are only 7 categories
# (replaces 'Publication_Day' with one 'Publication_Day_<value>' column per day)
test_encoded_df = pd.get_dummies(test_encoded_df, columns=['Publication_Day'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,...,Genre_Sports,Genre_Technology,Genre_True Crime,Publication_Day_Friday,Publication_Day_Monday,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday
0,750000,78.96,38.11,Evening,53.33,1.0,Neutral,False,False,True,...,False,False,False,False,False,True,False,False,False,False
1,750001,27.87,71.29,Morning,,0.0,Neutral,False,False,True,...,False,False,False,False,False,False,True,False,False,False
2,750002,69.1,67.89,Evening,97.51,0.0,Positive,False,True,False,...,False,False,False,True,False,False,False,False,False,False
3,750003,115.39,23.4,Morning,51.75,2.0,Positive,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,750004,72.32,58.1,Morning,11.3,2.0,Neutral,True,False,False,...,False,False,False,False,False,False,False,False,False,True


#### 5) Publication_Time

In [26]:
# one-hot encode the Publication_Time since there are only 4 categories
# (replaces 'Publication_Time' with one 'Publication_Time_<value>' column per slot)
test_encoded_df = pd.get_dummies(test_encoded_df, columns=['Publication_Time'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,...,Publication_Day_Monday,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night
0,750000,78.96,38.11,53.33,1.0,Neutral,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False
1,750001,27.87,71.29,,0.0,Neutral,False,False,True,True,...,False,False,True,False,False,False,False,False,True,False
2,750002,69.1,67.89,97.51,0.0,Positive,False,True,False,True,...,False,False,False,False,False,False,False,True,False,False
3,750003,115.39,23.4,51.75,2.0,Positive,False,True,False,False,...,False,False,True,False,False,False,False,False,True,False
4,750004,72.32,58.1,11.3,2.0,Neutral,True,False,False,False,...,False,False,False,False,False,True,False,False,True,False


#### 6) Episode_Sentiment

In [27]:
# map the sentiment labels to integers so that they carry an ordering
# (Negative < Neutral < Positive)
episode_sentiment_map = {
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1
}

# any label that is not a key of the map becomes NaN in the mapped column
test_encoded_df['Episode_Sentiment_mapped'] = test_encoded_df['Episode_Sentiment'].map(episode_sentiment_map)
test_encoded_df.head()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,...,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_mapped
0,750000,78.96,38.11,53.33,1.0,Neutral,False,False,True,False,...,True,False,False,False,False,False,True,False,False,0
1,750001,27.87,71.29,,0.0,Neutral,False,False,True,True,...,False,True,False,False,False,False,False,True,False,0
2,750002,69.1,67.89,97.51,0.0,Positive,False,True,False,True,...,False,False,False,False,False,False,True,False,False,1
3,750003,115.39,23.4,51.75,2.0,Positive,False,True,False,False,...,False,True,False,False,False,False,False,True,False,1
4,750004,72.32,58.1,11.3,2.0,Neutral,True,False,False,False,...,False,False,False,False,True,False,False,True,False,0


In [28]:
# drop the no longer needed 'Episode_Sentiment' column (it is now represented
# by 'Episode_Sentiment_mapped')
test_encoded_df = test_encoded_df.drop(columns=['Episode_Sentiment'])
test_encoded_df.head()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,Episode_Title_binned_late,...,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_mapped
0,750000,78.96,38.11,53.33,1.0,False,False,True,False,True,...,True,False,False,False,False,False,True,False,False,0
1,750001,27.87,71.29,,0.0,False,False,True,True,False,...,False,True,False,False,False,False,False,True,False,0
2,750002,69.1,67.89,97.51,0.0,False,True,False,True,False,...,False,False,False,False,False,False,True,False,False,1
3,750003,115.39,23.4,51.75,2.0,False,True,False,False,True,...,False,True,False,False,False,False,False,True,False,1
4,750004,72.32,58.1,11.3,2.0,True,False,False,False,False,...,False,False,False,False,True,False,False,True,False,0


In [30]:
# store the ID column (needed later to build the submission) and then drop it
# so that it is not passed to the model as a feature
id_vals = test_encoded_df['id']
test_encoded_df = test_encoded_df.drop(columns=['id'])
test_encoded_df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,Episode_Title_binned_late,Episode_Title_binned_mid,...,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_mapped
0,78.96,38.11,53.33,1.0,False,False,True,False,True,False,...,True,False,False,False,False,False,True,False,False,0
1,27.87,71.29,,0.0,False,False,True,True,False,False,...,False,True,False,False,False,False,False,True,False,0
2,69.1,67.89,97.51,0.0,False,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,1
3,115.39,23.4,51.75,2.0,False,True,False,False,True,False,...,False,True,False,False,False,False,False,True,False,1
4,72.32,58.1,11.3,2.0,True,False,False,False,False,True,...,False,False,False,False,True,False,False,True,False,0


### Pre-Processing

In [31]:
# examine null values across columns (count of missing entries per column)
test_encoded_df.isna().sum()

Episode_Length_minutes                            28736
Host_Popularity_percentage                            0
Guest_Popularity_percentage                       48832
Number_of_Ads                                         0
Podcast_Name_clustered_Sports Weekly_cluster_1        0
Podcast_Name_clustered_Tech Talks_cluster_0           0
Podcast_Name_clustered_Tech Trends_cluster_2          0
Episode_Title_binned_early                            0
Episode_Title_binned_late                             0
Episode_Title_binned_mid                              0
Genre_Business                                        0
Genre_Comedy                                          0
Genre_Education                                       0
Genre_Health                                          0
Genre_Lifestyle                                       0
Genre_Music                                           0
Genre_News                                            0
Genre_Sports                                    

In [32]:
# examine null values across columns as a fraction of all rows
test_encoded_df.isna().sum() / len(test_encoded_df)

Episode_Length_minutes                            0.114944
Host_Popularity_percentage                        0.000000
Guest_Popularity_percentage                       0.195328
Number_of_Ads                                     0.000000
Podcast_Name_clustered_Sports Weekly_cluster_1    0.000000
Podcast_Name_clustered_Tech Talks_cluster_0       0.000000
Podcast_Name_clustered_Tech Trends_cluster_2      0.000000
Episode_Title_binned_early                        0.000000
Episode_Title_binned_late                         0.000000
Episode_Title_binned_mid                          0.000000
Genre_Business                                    0.000000
Genre_Comedy                                      0.000000
Genre_Education                                   0.000000
Genre_Health                                      0.000000
Genre_Lifestyle                                   0.000000
Genre_Music                                       0.000000
Genre_News                                        0.0000

In [33]:
# impute missing 'Episode_Length_minutes' values with the column mean, which
# looked like the best strategy given the data distribution
# NOTE(review): the mean is computed on the test set itself - confirm this is
# consistent with the imputation used when the model was trained
test_encoded_df['Episode_Length_minutes'] = test_encoded_df['Episode_Length_minutes'].fillna(
    test_encoded_df['Episode_Length_minutes'].mean()
)

In [34]:
# re-check the fraction of null values per column after imputing
# 'Episode_Length_minutes'
test_encoded_df.isna().sum() / len(test_encoded_df)

Episode_Length_minutes                            0.000000
Host_Popularity_percentage                        0.000000
Guest_Popularity_percentage                       0.195328
Number_of_Ads                                     0.000000
Podcast_Name_clustered_Sports Weekly_cluster_1    0.000000
Podcast_Name_clustered_Tech Talks_cluster_0       0.000000
Podcast_Name_clustered_Tech Trends_cluster_2      0.000000
Episode_Title_binned_early                        0.000000
Episode_Title_binned_late                         0.000000
Episode_Title_binned_mid                          0.000000
Genre_Business                                    0.000000
Genre_Comedy                                      0.000000
Genre_Education                                   0.000000
Genre_Health                                      0.000000
Genre_Lifestyle                                   0.000000
Genre_Music                                       0.000000
Genre_News                                        0.0000

In [35]:
# impute missing 'Guest_Popularity_percentage' values with the column mean,
# which looked like the best strategy given the data distribution
# NOTE(review): the mean is computed on the test set itself - confirm this is
# consistent with the imputation used when the model was trained
test_encoded_df['Guest_Popularity_percentage'] = test_encoded_df['Guest_Popularity_percentage'].fillna(
    test_encoded_df['Guest_Popularity_percentage'].mean()
)

In [36]:
# re-check the fraction of null values per column after imputing
# 'Guest_Popularity_percentage'
test_encoded_df.isna().sum() / len(test_encoded_df)

Episode_Length_minutes                            0.0
Host_Popularity_percentage                        0.0
Guest_Popularity_percentage                       0.0
Number_of_Ads                                     0.0
Podcast_Name_clustered_Sports Weekly_cluster_1    0.0
Podcast_Name_clustered_Tech Talks_cluster_0       0.0
Podcast_Name_clustered_Tech Trends_cluster_2      0.0
Episode_Title_binned_early                        0.0
Episode_Title_binned_late                         0.0
Episode_Title_binned_mid                          0.0
Genre_Business                                    0.0
Genre_Comedy                                      0.0
Genre_Education                                   0.0
Genre_Health                                      0.0
Genre_Lifestyle                                   0.0
Genre_Music                                       0.0
Genre_News                                        0.0
Genre_Sports                                      0.0
Genre_Technology            

In [37]:
# make sure all data types are numerical (or boolean), i.e. no string/object
# columns are left after the encoding steps
test_encoded_df.dtypes

Episode_Length_minutes                            float64
Host_Popularity_percentage                        float64
Guest_Popularity_percentage                       float64
Number_of_Ads                                     float64
Podcast_Name_clustered_Sports Weekly_cluster_1       bool
Podcast_Name_clustered_Tech Talks_cluster_0          bool
Podcast_Name_clustered_Tech Trends_cluster_2         bool
Episode_Title_binned_early                           bool
Episode_Title_binned_late                            bool
Episode_Title_binned_mid                             bool
Genre_Business                                       bool
Genre_Comedy                                         bool
Genre_Education                                      bool
Genre_Health                                         bool
Genre_Lifestyle                                      bool
Genre_Music                                          bool
Genre_News                                           bool
Genre_Sports  

### Model Inference

In [38]:
# make copy of test encoded and convert to X_pred (the feature matrix that is
# passed to the model's predict call)
X_pred = test_encoded_df.copy()
X_pred.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Podcast_Name_clustered_Sports Weekly_cluster_1,Podcast_Name_clustered_Tech Talks_cluster_0,Podcast_Name_clustered_Tech Trends_cluster_2,Episode_Title_binned_early,Episode_Title_binned_late,Episode_Title_binned_mid,...,Publication_Day_Saturday,Publication_Day_Sunday,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_mapped
0,78.96,38.11,53.33,1.0,False,False,True,False,True,False,...,True,False,False,False,False,False,True,False,False,0
1,27.87,71.29,52.192796,0.0,False,False,True,True,False,False,...,False,True,False,False,False,False,False,True,False,0
2,69.1,67.89,97.51,0.0,False,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,1
3,115.39,23.4,51.75,2.0,False,True,False,False,True,False,...,False,True,False,False,False,False,False,True,False,1
4,72.32,58.1,11.3,2.0,True,False,False,False,False,True,...,False,False,False,False,True,False,False,True,False,0


In [39]:
# TODO: refactor to generalize (select the model from the saved results
# instead of hard coding the choice)
# hard coded load of the random forest model since it had the lowest RMSE
best_model = joblib.load(saved_paths['random_forest'])

In [40]:
# make predictions for every row of X_pred with the loaded model
y_pred = best_model.predict(X_pred)

### Construct Submission

In [43]:
# assemble the Kaggle submission frame: the stored row ids next to the
# predicted listening time for each row
submit_df = pd.DataFrame({
    'id': list(id_vals),
    'Listening_Time_minutes': list(y_pred),
})
submit_df.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,55.843055
1,750001,23.471401
2,750002,48.588094
3,750003,76.435868
4,750004,46.301973


In [47]:
# make the id column the index, so that it is written as the first CSV column
# instead of an extra positional index column
submit_df = submit_df.set_index('id')
submit_df.head()

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,55.843055
750001,23.471401
750002,48.588094
750003,76.435868
750004,46.301973


In [48]:
# output the dataframe to file to be uploaded to Kaggle (the 'id' index is
# written as the first column)
# NOTE(review): assumes the 'data/submissions' directory already exists
submit_df.to_csv('data/submissions/submission_1.csv')