In [440]:
import numpy as np, pandas as pd, polars as pl
from enum import Enum

# Import and clean data

In [441]:
train = pd.read_csv("train.csv").drop('id', axis=1)
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 62.9+ MB


Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [442]:
test = pd.read_csv("test.csv").drop('id', axis=1)
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 250000 non-null  object 
 1   Episode_Title                250000 non-null  object 
 2   Episode_Length_minutes       221264 non-null  float64
 3   Genre                        250000 non-null  object 
 4   Host_Popularity_percentage   250000 non-null  float64
 5   Publication_Day              250000 non-null  object 
 6   Publication_Time             250000 non-null  object 
 7   Guest_Popularity_percentage  201168 non-null  float64
 8   Number_of_Ads                250000 non-null  float64
 9   Episode_Sentiment            250000 non-null  object 
dtypes: float64(4), object(6)
memory usage: 19.1+ MB


Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral
1,Sound Waves,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral
2,Joke Junction,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive
3,Comedy Corner,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive
4,Life Lessons,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral


### Split Categorical and Numerical Data

In [443]:
train_n = train.iloc[:,[2,4,7,8,10]]
train_c = train.iloc[:,[0,1,3,5,6,9,10]]
print(train_n.head())
train_c.head()

   Episode_Length_minutes  Host_Popularity_percentage  \
0                     NaN                       74.81   
1                  119.80                       66.95   
2                   73.90                       69.97   
3                   67.17                       57.22   
4                  110.51                       80.07   

   Guest_Popularity_percentage  Number_of_Ads  Listening_Time_minutes  
0                          NaN            0.0                31.41998  
1                        75.95            2.0                88.01241  
2                         8.97            0.0                44.92531  
3                        78.70            2.0                46.27824  
4                        58.68            3.0                75.61031  


Unnamed: 0,Podcast_Name,Episode_Title,Genre,Publication_Day,Publication_Time,Episode_Sentiment,Listening_Time_minutes
0,Mystery Matters,Episode 98,True Crime,Thursday,Night,Positive,31.41998
1,Joke Junction,Episode 26,Comedy,Saturday,Afternoon,Negative,88.01241
2,Study Sessions,Episode 16,Education,Tuesday,Evening,Negative,44.92531
3,Digital Digest,Episode 45,Technology,Monday,Morning,Positive,46.27824
4,Mind & Body,Episode 86,Health,Monday,Afternoon,Neutral,75.61031


### Categorical Data

#### Episode Number -> Int

In [444]:
train_c['Episode_Title'].unique()

array(['Episode 98', 'Episode 26', 'Episode 16', 'Episode 45',
       'Episode 86', 'Episode 19', 'Episode 47', 'Episode 44',
       'Episode 32', 'Episode 81', 'Episode 66', 'Episode 62',
       'Episode 76', 'Episode 37', 'Episode 20', 'Episode 82',
       'Episode 72', 'Episode 61', 'Episode 100', 'Episode 54',
       'Episode 17', 'Episode 36', 'Episode 97', 'Episode 27',
       'Episode 31', 'Episode 88', 'Episode 38', 'Episode 92',
       'Episode 74', 'Episode 30', 'Episode 63', 'Episode 67',
       'Episode 77', 'Episode 4', 'Episode 93', 'Episode 24', 'Episode 1',
       'Episode 2', 'Episode 25', 'Episode 56', 'Episode 75',
       'Episode 12', 'Episode 21', 'Episode 6', 'Episode 85',
       'Episode 23', 'Episode 33', 'Episode 7', 'Episode 53',
       'Episode 15', 'Episode 43', 'Episode 71', 'Episode 69',
       'Episode 13', 'Episode 89', 'Episode 3', 'Episode 64',
       'Episode 73', 'Episode 79', 'Episode 94', 'Episode 80',
       'Episode 42', 'Episode 10', 'Episode 48

In [445]:
def cutEpisode(title):
    return int(title[8:])

train_c['Episode_Title'] = train_c['Episode_Title'].apply(cutEpisode)
train_c['Episode_Title'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 750000 entries, 0 to 749999
Series name: Episode_Title
Non-Null Count   Dtype
--------------   -----
750000 non-null  int64
dtypes: int64(1)
memory usage: 5.7 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_c['Episode_Title'] = train_c['Episode_Title'].apply(cutEpisode)


#### Genre, Publication Day and Time, Sentiment -> Enum

In [446]:
print(train_c['Genre'].unique(), train_c['Genre'].info())

<class 'pandas.core.series.Series'>
RangeIndex: 750000 entries, 0 to 749999
Series name: Genre
Non-Null Count   Dtype 
--------------   ----- 
750000 non-null  object
dtypes: object(1)
memory usage: 5.7+ MB
['True Crime' 'Comedy' 'Education' 'Technology' 'Health' 'News' 'Music'
 'Sports' 'Business' 'Lifestyle'] None


In [447]:
test = pd.get_dummies(train_c['Genre'])
test.head()

Unnamed: 0,Business,Comedy,Education,Health,Lifestyle,Music,News,Sports,Technology,True Crime
0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0


--------------------

In [448]:
print(train_c['Publication_Day'].unique(), train_c['Publication_Day'].info())

<class 'pandas.core.series.Series'>
RangeIndex: 750000 entries, 0 to 749999
Series name: Publication_Day
Non-Null Count   Dtype 
--------------   ----- 
750000 non-null  object
dtypes: object(1)
memory usage: 5.7+ MB
['Thursday' 'Saturday' 'Tuesday' 'Monday' 'Sunday' 'Wednesday' 'Friday'] None


--------------------

In [449]:
print(train_c['Publication_Time'].unique(), train_c['Publication_Time'].info())

<class 'pandas.core.series.Series'>
RangeIndex: 750000 entries, 0 to 749999
Series name: Publication_Time
Non-Null Count   Dtype 
--------------   ----- 
750000 non-null  object
dtypes: object(1)
memory usage: 5.7+ MB
['Night' 'Afternoon' 'Evening' 'Morning'] None


In [450]:
def pubTimeToEnum(dfTime):
    match dfTime:
        case 'Morning': return 0
        case 'Afternoon': return 1
        case 'Evening': return 2
        case 'Night': return 3

0 = Morning   
1 = Afternoon    
2 = Evening  
3 = Night  

In [451]:
train_c['Publication_Time'] = train_c['Publication_Time'].apply(pubTimeToEnum)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_c['Publication_Time'] = train_c['Publication_Time'].apply(pubTimeToEnum)


In [454]:
train_c['Publication_Time'].head()

0    3
1    1
2    2
3    0
4    1
Name: Publication_Time, dtype: int64

# Evaluation

In [453]:
#from sklearn.metrics import mean_squared_error
#mse = mean_squared_error(actual, predicted)