In [2]:
import numpy as np, pandas as pd, polars as pl
from enum import Enum
from sklearn import preprocessing as skp
from sklearn.model_selection import KFold

import sklearn.linear_model as skl
from sklearn.ensemble import HistGradientBoostingRegressor as gbr
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error as rmse

In [3]:
# Load both competition splits; the `id` column becomes each frame's row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train.csv', 'test.csv')
)

In [4]:
# Schema check for the training split: per-column dtype and non-null count.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 68.7+ MB


In [5]:
# Same schema check for the test split: per-column dtype and non-null count.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 250000 non-null  object 
 1   Episode_Title                250000 non-null  object 
 2   Episode_Length_minutes       221264 non-null  float64
 3   Genre                        250000 non-null  object 
 4   Host_Popularity_percentage   250000 non-null  float64
 5   Publication_Day              250000 non-null  object 
 6   Publication_Time             250000 non-null  object 
 7   Guest_Popularity_percentage  201168 non-null  float64
 8   Number_of_Ads                250000 non-null  float64
 9   Episode_Sentiment            250000 non-null  object 
dtypes: float64(4), object(6)
memory usage: 21.0+ MB


In [6]:
# Flat label -> integer-code lookup. Each tuple lists one feature's labels in code
# order, so a label's code is simply its position within its own tuple.
categorical_map = {
    label: code
    for ordered_labels in (
        # Day
        ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
        # Time
        ('Morning', 'Afternoon', 'Evening', 'Night'),
        # Sentiment
        ('Negative', 'Neutral', 'Positive'),
        # Genre
        ('True Crime', 'Comedy', 'Education', 'Technology', 'Health', 'News', 'Music',
         'Sports', 'Business', 'Lifestyle'),
    )
    for code, label in enumerate(ordered_labels)
}

# Names of the columns that are cast to `category` dtype and passed to the models
# as categorical features.
categories = [
    'Genre',
    'Episode_Title',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

In [7]:
# Podcast_Name is removed from both splits before the feature matrix is built.
# Rebinding (rather than `inplace=True`) together with `errors='ignore'` keeps this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.
train = train.drop(columns='Podcast_Name', errors='ignore')
test = test.drop(columns='Podcast_Name', errors='ignore')

In [8]:
# Keep only what follows the first 8 characters of every training title.
# NOTE(review): presumably this strips a fixed 8-character prefix such as "Episode " —
# confirm against the raw data.
# NOTE(review): not idempotent (each re-run removes 8 more characters), and only
# `train` is transformed in this cell.
train['Episode_Title'] = train['Episode_Title'].str.slice(start=8)

In [9]:
# Swap every label found in `categorical_map` for its integer code. The lookup is
# frame-wide: a cell in any column that equals one of the keys is replaced.
train = train.replace(to_replace=categorical_map)

In [10]:
# Cast every column named in `categories` to pandas `category` dtype in one call.
train = train.astype({column: 'category' for column in categories})

In [11]:
# Target is the `Listening_Time_minutes` column; every other column is a feature.
y = train['Listening_Time_minutes']
x = train.drop(columns='Listening_Time_minutes')

In [12]:
# 5-fold splitter; shuffling with a fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=55)

In [13]:
# K-fold cross-validation of a histogram gradient-boosting regressor.
# A fresh model is fitted on each training fold and the validation RMSE is printed.
for train_idx, val_idx in kf.split(x, y):
    x_t, y_t = x.iloc[train_idx], y.iloc[train_idx]
    x_v, y_v = x.iloc[val_idx], y.iloc[val_idx]
    # `categories` names the columns the booster should treat as categorical.
    model = gbr(categorical_features=categories, l2_regularization=1, max_bins=255).fit(x_t, y_t)
    y_p = model.predict(x_v)
    # Fix: the original called `mse`, which is never defined. The metric imported at
    # the top of the notebook is `root_mean_squared_error`, aliased as `rmse`.
    print(rmse(y_v, y_p))

KeyboardInterrupt: 

In [None]:
# K-fold cross-validation of an XGBoost regressor with categorical support enabled;
# one validation RMSE is printed per fold.
for train_idx, val_idx in kf.split(x, y):
    x_t, x_v = x.iloc[train_idx], x.iloc[val_idx]
    y_t, y_v = y.iloc[train_idx], y.iloc[val_idx]
    # A new estimator per fold, so no state carries over between folds.
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        enable_categorical=True,
    ).fit(x_t, y_t)
    y_p = model.predict(x_v)
    fold_rmse = rmse(y_v, y_p)
    print(fold_rmse)

NameError: name 'root_mean_squared_error' is not defined

In [None]:
# Score of the most recently fitted `model` on the full `x`/`y`
# (`score` is R^2 for scikit-learn-style regressors).
# NOTE(review): `model` is whatever the last executed CV loop left bound (its final
# fold), and `x`/`y` include rows that model was trained on, so this is not a
# held-out estimate.
model.score(x, y)

0.7696748780522011

In [None]:
# Unfitted XGBoost regressor: squared-error objective, RMSE as the evaluation metric,
# and categorical support enabled for `category`-dtype columns.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    enable_categorical=True,
)

In [None]:
# Fit on the full feature matrix `x` and target `y` (no rows held out).
xgb_model.fit(x,y)

In [None]:
# Inspect the preprocessed training frame.
# `info()` prints its summary as a side effect, so it goes first; `head()` is then the
# cell's last expression and is rendered as the cell output. (With `head()` first, its
# result was discarded and never shown.)
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   Episode_Title                750000 non-null  category
 1   Episode_Length_minutes       662907 non-null  float64 
 2   Genre                        750000 non-null  category
 3   Host_Popularity_percentage   750000 non-null  float64 
 4   Publication_Day              750000 non-null  category
 5   Publication_Time             750000 non-null  category
 6   Guest_Popularity_percentage  603970 non-null  float64 
 7   Number_of_Ads                749999 non-null  float64 
 8   Episode_Sentiment            750000 non-null  category
 9   Listening_Time_minutes       750000 non-null  float64 
dtypes: category(5), float64(5)
memory usage: 37.9 MB
