In [22]:
import numpy as np, pandas as pd, polars as pl
from sklearn.model_selection import KFold
from sklearn import preprocessing as skp
from sklearn.decomposition import PCA 

import xgboost as xgb
from sklearn.metrics import root_mean_squared_error as rmse

In [23]:
# Read both competition splits, using each file's `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train.csv', 'test.csv')
)

In [24]:
# Print the training frame's column names, non-null counts and dtypes
# to see which columns have missing values before any preprocessing.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 68.7+ MB


In [25]:
# Same summary for the test frame (non-null counts and dtypes per column).
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 250000 non-null  object 
 1   Episode_Title                250000 non-null  object 
 2   Episode_Length_minutes       221264 non-null  float64
 3   Genre                        250000 non-null  object 
 4   Host_Popularity_percentage   250000 non-null  float64
 5   Publication_Day              250000 non-null  object 
 6   Publication_Time             250000 non-null  object 
 7   Guest_Popularity_percentage  201168 non-null  float64
 8   Number_of_Ads                250000 non-null  float64
 9   Episode_Sentiment            250000 non-null  object 
dtypes: float64(4), object(6)
memory usage: 21.0+ MB


In [26]:
# Ordered label groups; a label's position inside its group is its integer code.
_label_groups = (
    # Day
    ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'),
    # Time
    ('Morning', 'Afternoon', 'Evening', 'Night'),
    # Sentiment
    ('Negative', 'Neutral', 'Positive'),
    # Genre
    ('True Crime', 'Comedy', 'Education', 'Technology', 'Health', 'News', 'Music',
     'Sports', 'Business', 'Lifestyle'),
)

# Single flat label -> code lookup shared by every column. Codes restart at 0
# in each group, so the same integer appears under several labels.
categorical_map = {
    label: code
    for group in _label_groups
    for code, label in enumerate(group)
}

# Columns later cast to pandas `category` dtype. 'Episode_Title' is listed here
# even though none of its values appear in categorical_map.
categories = [
    'Genre',
    'Episode_Title',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

In [27]:
# Remove the podcast name from both splits.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the column is gone is a no-op rather
# than a KeyError.
train = train.drop(columns='Podcast_Name', errors='ignore')
test = test.drop(columns='Podcast_Name', errors='ignore')

In [28]:
# Drop the first 8 characters of every episode title in both splits
# (presumably a fixed "Episode "-style prefix — TODO confirm against the raw data).
# NOTE: not idempotent — each re-run trims a further 8 characters.
for frame in (train, test):
    frame['Episode_Title'] = frame['Episode_Title'].str[8:]

In [29]:
# Swap every label found in categorical_map for its integer code (the lookup is
# applied to all columns at once), then fill all remaining missing values with 0.
train, test = (
    frame.replace(categorical_map).fillna(0)
    for frame in (train, test)
)

In [30]:
# Separate the regression target from the feature columns.
target_col = 'Listening_Time_minutes'
y = train[target_col]
x = train.drop(columns=target_col)

In [31]:
# Expand the features with pairwise interaction terms only
# (interaction_only=True: no powers of a single feature; include_bias=False: no constant column).
poly = skp.PolynomialFeatures(interaction_only=True, include_bias=False)

# Fit on the training features only.
x_poly = poly.fit_transform(x)

# Reuse the already-fitted transformer for the test split. Calling
# fit_transform here would silently re-fit on test; transform() instead
# checks test against the columns seen during fit.
test_poly = poly.transform(test)

In [32]:
# Recover readable column names for the expanded feature arrays.
cols = poly.get_feature_names_out(x.columns)
test_cols = poly.get_feature_names_out(test.columns)

# Wrap the arrays back into DataFrames. No index is passed, so both frames
# get a fresh default index starting at 0 (the original `id` index is not kept).
x = pd.DataFrame(data=x_poly, columns=cols)
test = pd.DataFrame(data=test_poly, columns=test_cols)

In [33]:
# Keep only the leading expanded columns: the original features followed by
# the first few interaction terms. The reason for cutting at 23 is not stated
# in the notebook — TODO document why this cutoff was chosen.
N_FEATURES_KEPT = 23

# .copy() detaches each result from the wide parent frame, so later column
# assignments (e.g. the category casts) operate on an independent frame and do
# not raise SettingWithCopyWarning.
x = x.iloc[:, :N_FEATURES_KEPT].copy()
test = test.iloc[:, :N_FEATURES_KEPT].copy()

In [34]:
# Inspect the trimmed training features: column names, non-null counts, dtypes.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 23 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   Episode_Title                                       750000 non-null  float64
 1   Episode_Length_minutes                              750000 non-null  float64
 2   Genre                                               750000 non-null  float64
 3   Host_Popularity_percentage                          750000 non-null  float64
 4   Publication_Day                                     750000 non-null  float64
 5   Publication_Time                                    750000 non-null  float64
 6   Guest_Popularity_percentage                         750000 non-null  float64
 7   Number_of_Ads                                       750000 non-null  float64
 8   Episode_Sentiment                                   750000 non-n

In [35]:
# Cast the categorical columns to pandas `category` dtype for XGBoost.
# Both splits must share ONE category list per column: casting each frame
# independently derives the integer codes from that frame's own set of values,
# so a value present in only one split would shift the codes and the model
# would see mismatched encodings at prediction time.
for c in categories:
    # Sorted union of the values observed in either split.
    shared_levels = np.union1d(x[c].to_numpy(), test[c].to_numpy())
    shared_dtype = pd.CategoricalDtype(categories=shared_levels)
    x[c] = x[c].astype(shared_dtype)
    test[c] = test[c].astype(shared_dtype)

In [None]:
# Number of cross-validation folds; one model is trained per fold.
K = 100

# Shuffled K-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=K, shuffle=True, random_state=55)

In [None]:
# Empty frame that will hold the per-fold test predictions (one column per fold).
pred = pd.DataFrame()
# Confirm it starts out with no rows and no columns.
pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [None]:
# Train one XGBoost regressor per fold and collect each model's predictions
# on the test set.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    enable_categorical=True,   # columns were cast to `category` dtype beforehand
    device='cuda',
    tree_method='hist',
)

# Predictions are gathered in a fresh list and concatenated once at the end.
# This avoids re-copying the growing frame on every fold, and it makes the
# cell idempotent: re-running it replaces `pred` instead of appending
# another batch of columns to the previous run's results.
fold_preds = []
for train_idx, val_idx in kf.split(x, y):
    x_t, y_t = x.iloc[train_idx], y.iloc[train_idx]
    x_v, y_v = x.iloc[val_idx], y.iloc[val_idx]

    model = xgb.XGBRegressor(**xgb_params)
    # The held-out fold is passed as eval_set, so its RMSE is logged every boosting round.
    model.fit(x_t, y_t, eval_set=[(x_v, y_v)])

    fold_preds.append(pd.DataFrame(model.predict(test)))

# One column per fold, newest fold first (same column order as prepending
# each fold's predictions to the frame would give).
pred = pd.concat(fold_preds[::-1], axis=1)

[0]	validation_0-rmse:21.28865
[1]	validation_0-rmse:17.63386
[2]	validation_0-rmse:15.51901
[3]	validation_0-rmse:14.34095
[4]	validation_0-rmse:13.71682
[5]	validation_0-rmse:13.40152
[6]	validation_0-rmse:13.23647
[7]	validation_0-rmse:13.14268
[8]	validation_0-rmse:13.09110
[9]	validation_0-rmse:13.06118
[10]	validation_0-rmse:13.05233
[11]	validation_0-rmse:13.04171
[12]	validation_0-rmse:13.02915
[13]	validation_0-rmse:13.02073
[14]	validation_0-rmse:13.00972
[15]	validation_0-rmse:13.00720
[16]	validation_0-rmse:13.00068
[17]	validation_0-rmse:12.99889
[18]	validation_0-rmse:12.99427
[19]	validation_0-rmse:12.99666
[20]	validation_0-rmse:12.99788
[21]	validation_0-rmse:12.99511
[22]	validation_0-rmse:12.99397
[23]	validation_0-rmse:12.99567
[24]	validation_0-rmse:12.98959
[25]	validation_0-rmse:12.98839
[26]	validation_0-rmse:12.98961
[27]	validation_0-rmse:12.98527
[28]	validation_0-rmse:12.98162
[29]	validation_0-rmse:12.98481
[30]	validation_0-rmse:12.99076
[31]	validation_0-

In [None]:
# First id of the test split (the raw test index runs from 750000 upwards).
TEST_ID_START = 750000

# Re-number rows from 0, then shift so each row label is its test id.
pred = pred.reset_index(drop=True)
pred.index += TEST_ID_START

# Average the per-fold predictions row-wise. Using mean() rather than
# sum() / K keeps the result correct even if `pred` does not hold exactly
# K columns (e.g. after the training cell has been run more than once).
pred_sum = pred.mean(axis=1)

In [None]:
# Preview the first rows of the per-fold prediction table (one column per fold).
pred.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
750000,52.422321,52.161697,52.887894,53.7742,53.656181,53.514206,53.341064,54.078854,52.930553,52.063507,...,52.096928,54.374897,52.739147,53.164509,51.267963,51.915543,54.571411,54.525528,54.800144,54.264595
750001,18.61894,18.313286,18.225765,18.258863,18.470175,18.16728,17.67573,18.523539,18.3416,17.102133,...,17.817486,17.623583,18.58997,19.009201,18.66168,18.220932,18.62182,18.428267,18.445499,18.393614
750002,49.408112,50.371239,48.327209,48.148037,48.922752,48.777225,47.703999,48.212605,49.463329,49.633575,...,49.737312,49.978252,47.920681,46.594982,49.707146,49.549095,49.720924,49.119438,49.751686,50.421898
750003,78.056595,80.410545,83.02375,79.001389,82.482407,80.8134,78.928375,79.774796,75.693535,83.510269,...,76.436401,79.674911,79.389191,77.818634,82.324066,81.756889,78.519585,81.700142,78.213562,80.849266
750004,49.724411,49.096329,48.715485,48.357048,48.951626,49.196938,47.60582,49.398361,48.2677,49.117085,...,49.73243,49.098606,48.171635,48.40012,49.263012,49.253521,48.492901,49.603424,48.799591,49.454044


In [None]:
# Label the averaged predictions with the target column's name so the CSV
# header comes out right, then preview the first few values.
pred_sum.name = 'Listening_Time_minutes'
pred_sum.head()

750000    53.340137
750001    18.317774
750002    49.138416
750003    80.038475
750004    48.960129
Name: Listening_Time_minutes, dtype: float32

In [None]:
# Write the submission file: the index is written under the header `id`,
# the values under the series name.
# NOTE(review): the file name says "scaled", but no scaling step is visible in this notebook — confirm the name is intended.
pred_sum.to_csv('submission_scaled.csv', index_label='id')