In [1]:
import numpy as np, pandas as pd, polars as pl
from sklearn.model_selection import KFold
from sklearn import preprocessing as skp
from sklearn.decomposition import PCA 
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import root_mean_squared_error as rmse
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv', index_col = 'id')
test = pd.read_csv('test.csv', index_col = 'id')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 68.7+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 250000 non-null  object 
 1   Episode_Title                250000 non-null  object 
 2   Episode_Length_minutes       221264 non-null  float64
 3   Genre                        250000 non-null  object 
 4   Host_Popularity_percentage   250000 non-null  float64
 5   Publication_Day              250000 non-null  object 
 6   Publication_Time             250000 non-null  object 
 7   Guest_Popularity_percentage  201168 non-null  float64
 8   Number_of_Ads                250000 non-null  float64
 9   Episode_Sentiment            250000 non-null  object 
dtypes: float64(4), object(6)
memory usage: 21.0+ MB


In [5]:
categorical_map = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6, # Day

        'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Night': 3,                                            # Time

        'Negative': 0, 'Neutral': 1, 'Positive': 2,                                                        # Sentiment

        'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, # Genre
        'Sports': 7, 'Business': 8, 'Lifestyle': 9
        }

categories = ['Genre', 'Podcast_Name', 'Episode_Title', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

In [6]:
podcast = {}
for i, n in enumerate(train['Podcast_Name'].unique()):
    podcast[n] = i

categorical_map.update(podcast)

In [7]:
train['Episode_Title'] = train['Episode_Title'].str[8:]
test['Episode_Title'] = test['Episode_Title'].str[8:]

In [8]:
train = train.replace(categorical_map).fillna(0)
test = test.replace(categorical_map).fillna(0)

In [9]:
x = train.drop('Listening_Time_minutes', axis=1)
y = train['Listening_Time_minutes']

In [10]:
poly = skp.PolynomialFeatures(interaction_only=True, include_bias=False)
x_poly = poly.fit_transform(x)
test_poly = poly.fit_transform(test)

In [11]:
cols = poly.get_feature_names_out(x.columns)
x = pd.DataFrame(x_poly, columns=cols)

test_cols = poly.get_feature_names_out(test.columns)
test = pd.DataFrame(test_poly, columns=test_cols)

In [12]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 55 columns):
 #   Column                                                  Non-Null Count   Dtype  
---  ------                                                  --------------   -----  
 0   Podcast_Name                                            750000 non-null  float64
 1   Episode_Title                                           750000 non-null  float64
 2   Episode_Length_minutes                                  750000 non-null  float64
 3   Genre                                                   750000 non-null  float64
 4   Host_Popularity_percentage                              750000 non-null  float64
 5   Publication_Day                                         750000 non-null  float64
 6   Publication_Time                                        750000 non-null  float64
 7   Guest_Popularity_percentage                             750000 non-null  float64
 8   Number_of_Ads           

In [13]:
important_features_bygain = ['Episode_Length_minutes','Episode_Title Episode_Length_minutes', 'Episode_Title', 'Podcast_Name','Number_of_Ads',
'Episode_Length_minutes Host_Popularity_percentage', 'Podcast_Name Episode_Length_minutes', 'Host_Popularity_percentage']
idx = []
for i, col in enumerate(x.columns): 
    if col in important_features_bygain: idx.append(i)

In [14]:
x_n = x.iloc[idx]
test_n = test.iloc[idx]

In [15]:
for c in categories: 
    x_n[c] = x_n[c].astype('category')
    test_n[c] = test_n[c].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_n[c] = x_n[c].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_n[c] = test_n[c].astype('category')


In [16]:
K = 20
kf = KFold(K, shuffle=True, random_state=55)

In [17]:
pred = pd.DataFrame()
pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [None]:
for train_idx, val_idx in kf.split(x,y):
    x_t, y_t = x.iloc[train_idx], y.iloc[train_idx]
    x_v, y_v = x.iloc[val_idx], y.iloc[val_idx]
    model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric = 'rmse', enable_categorical = True, device = 'cuda', tree_method="hist", 
                             min_child_weight=10, learning_rate=0.2, n_estimators=1000, importance_type='gain')#, max_depth=5)
    model.fit(x_t, y_t, eval_set=[(x_v,y_v)])
    pred = pd.concat([pd.DataFrame(model.predict(test)), pred], axis=1)

[0]	validation_0-rmse:23.17940
[1]	validation_0-rmse:20.18729
[2]	validation_0-rmse:18.00625
[3]	validation_0-rmse:16.45673
[4]	validation_0-rmse:15.37132
[5]	validation_0-rmse:14.63072
[6]	validation_0-rmse:14.13065
[7]	validation_0-rmse:13.79995
[8]	validation_0-rmse:13.57968
[9]	validation_0-rmse:13.43296
[10]	validation_0-rmse:13.33807
[11]	validation_0-rmse:13.27384
[12]	validation_0-rmse:13.23161
[13]	validation_0-rmse:13.20060
[14]	validation_0-rmse:13.17871
[15]	validation_0-rmse:13.16247
[16]	validation_0-rmse:13.15075
[17]	validation_0-rmse:13.14048
[18]	validation_0-rmse:13.13413
[19]	validation_0-rmse:13.12908
[20]	validation_0-rmse:13.12430
[21]	validation_0-rmse:13.11880
[22]	validation_0-rmse:13.11406
[23]	validation_0-rmse:13.11146
[24]	validation_0-rmse:13.10894
[25]	validation_0-rmse:13.10700
[26]	validation_0-rmse:13.10377
[27]	validation_0-rmse:13.10091
[28]	validation_0-rmse:13.09933
[29]	validation_0-rmse:13.09696
[30]	validation_0-rmse:13.09229
[31]	validation_0-

KeyboardInterrupt: 

In [None]:
pred.reset_index(drop=True, inplace=True)
pred.index += 750000
pred_sum = pred.sum(axis=1) / K

In [None]:
pred.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
750000,53.298668,52.361591,52.966549,54.165043,52.794289,52.463997,54.422012,53.661659,53.204292,52.611717,52.574097,51.988705,53.003082,53.903122,52.627926,52.728592,52.328392,53.663254,52.523216,52.492809
750001,18.290716,18.291853,18.550024,17.69342,17.678173,18.410538,18.791288,17.600857,18.507013,17.377609,17.931257,17.445681,17.96777,17.52174,17.242069,17.585854,18.316914,18.515453,17.381926,17.87236
750002,51.407646,51.691353,50.344582,49.786228,50.446213,47.886288,48.775753,49.328094,50.009674,49.296379,50.415104,48.872059,49.812183,49.067169,49.045589,50.989155,50.456177,50.861061,48.210011,50.618336
750003,74.652893,74.343224,74.413376,68.889664,68.728775,68.19075,72.52578,72.874405,70.181282,67.215385,70.741333,74.452812,73.686584,71.370468,65.698746,76.342751,71.606705,70.798782,67.864761,72.338287
750004,49.914356,49.379372,48.245712,49.059353,47.89912,48.376598,47.694958,48.669712,48.780224,49.650269,50.076809,48.93285,49.45153,48.402325,48.276005,49.503704,48.960411,48.821632,50.136833,49.574738


In [None]:
pred_sum.rename('Listening_Time_minutes', inplace=True)
pred_sum.head()

750000    52.989155
750001    17.948627
750002    49.865959
750003    71.345840
750004    48.990326
Name: Listening_Time_minutes, dtype: float32

In [None]:
pred_sum.to_csv('submissions/submission_newparams2_10000.csv', index_label='id')