In [None]:
import sys
# Make modules under ../src importable (the next cell imports `utils`,
# presumably from there — TODO confirm). The path is relative, so it resolves
# against the kernel's working directory.
# Guarded so that re-running this cell does not pile up duplicate entries.
if '../src' not in sys.path:
    sys.path.append('../src')

In [None]:
import pandas as pd
import numpy as np
from utils import quick_eda

# Read the competition training set (path is relative to the notebook's
# working directory) and hand it to the project's quick_eda helper, which is
# defined in the local `utils` module and not shown here.
train_csv_path = '../data/playground-series-s5e4/train.csv'
df_train = pd.read_csv(train_csv_path)
quick_eda(df_train)

In [None]:
import matplotlib.pyplot as plt

# Parse the first run of digits in each episode title into an integer column.
# The cast to int raises if any title contains no digits.
episode_digits = df_train['Episode_Title'].str.extract(r'(\d+)', expand=False)
df_train['Episode_Index'] = episode_digits.astype(int)

# Mean listening time for every episode number.
grouped_df = df_train.groupby('Episode_Index', as_index=False)['Listening_Time_minutes'].mean()

# Line plot of the per-episode mean; the y-axis is clipped to 35-55 minutes.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(grouped_df['Episode_Index'], grouped_df['Listening_Time_minutes'])
ax.set(
    xlabel='Episode number',
    ylabel='Listening time (in minutes)',
    ylim=(35, 55),
)
ax.grid()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Target column, and the columns that must not be used as features:
# the row id, the raw title, the index derived from the title, and the target.
target_col = 'Listening_Time_minutes'
non_feature_cols = ['id', 'Episode_Title', 'Episode_Index', target_col]

y = df_train[target_col]
x = df_train.drop(columns=non_feature_cols)

# Hold out 20% of the rows for validation; the seed keeps the split stable.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, random_state=42, test_size=0.2
)

# Report the shape of each split (labels are Portuguese for
# "training" and "validation").
for split_label, split_features in (('Treino', x_train), ('Validação', x_valid)):
    print(f'{split_label}: {split_features.shape}')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error
from scipy.stats import uniform, randint

class GroupImputer(TransformerMixin, BaseEstimator):
    """Fill missing values with the mean of the row's group.

    For each column in ``impute_cols`` a missing value is replaced by the mean
    of that column among the training rows that share the same ``group_col``
    value. Groups that were not seen during ``fit`` (or whose training values
    were all missing) fall back to the column's overall training mean.

    The transformer receives ``group_col`` plus ``impute_cols`` and returns
    only ``impute_cols``: the group column is used for the lookup and then
    dropped. Because the output has fewer columns than the input,
    ``get_feature_names_out`` is implemented explicitly instead of relying on
    ``OneToOneFeatureMixin``, which would report one name too many.

    Parameters
    ----------
    group_col : str
        Name of the column identifying the group.
    impute_cols : sequence of str
        Names of the numeric columns to impute.

    Attributes
    ----------
    group_means_ : pandas.DataFrame
        Per-group mean of every imputed column, indexed by group value.
    overall_means_ : pandas.Series
        Mean of every imputed column over all training rows.
    """

    # Mixins are listed before BaseEstimator, as scikit-learn's developer guide
    # requires, so that the mixins' cooperative methods are resolved first.

    def __init__(self, group_col, impute_cols):
        # Only store constructor arguments here; learned state is created in
        # fit() with a trailing underscore so check_is_fitted() can detect it.
        self.group_col = group_col
        self.impute_cols = impute_cols

    @property
    def group_means(self):
        """Backward-compatible alias of ``group_means_`` (``None`` before fit)."""
        return getattr(self, 'group_means_', None)

    @property
    def overall_means(self):
        """Backward-compatible alias of ``overall_means_`` (``None`` before fit)."""
        return getattr(self, 'overall_means_', None)

    def _to_frame(self, X):
        """Return a private DataFrame holding the group and imputed columns."""
        columns = [self.group_col, *self.impute_cols]
        if isinstance(X, pd.DataFrame):
            # Select by name (a missing column raises KeyError instead of
            # silently becoming all-NaN) and copy so the caller's frame is
            # never modified.
            return X[columns].copy()
        # Array-like input: columns are expected in the order listed above.
        return pd.DataFrame(X, columns=columns, copy=True)

    def fit(self, X, y=None):
        """Learn per-group and overall means of the imputed columns.

        Parameters
        ----------
        X : DataFrame or array-like
            Must contain ``group_col`` and every column of ``impute_cols``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        frame = self._to_frame(X)
        value_cols = list(self.impute_cols)
        self.group_means_ = frame.groupby(self.group_col)[value_cols].mean()
        self.overall_means_ = frame[value_cols].mean()
        return self

    def transform(self, X):
        """Return ``impute_cols`` with missing values filled.

        Parameters
        ----------
        X : DataFrame or array-like
            Same layout as the data passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            Only the imputed columns, in the order given by ``impute_cols``.
        """
        frame = self._to_frame(X)
        groups = frame[self.group_col]
        for col in self.impute_cols:
            # Group mean where known, overall mean for unseen/all-NaN groups.
            fill_values = groups.map(self.group_means_[col]).fillna(self.overall_means_[col])
            frame[col] = frame[col].fillna(fill_values)
        return frame[list(self.impute_cols)]

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, i.e. ``impute_cols``.

        ``input_features`` is accepted for API compatibility and ignored.
        """
        return np.asarray(self.impute_cols, dtype=object)

# Column-wise preprocessing applied inside the pipeline, so every step is
# fitted on the training folds only during cross-validation:
#   * Podcast_Name        -> target encoding
#   * low-cardinality categoricals -> one-hot encoding
#   * numeric columns with gaps    -> per-podcast mean imputation
# Columns not listed in any step are forwarded unchanged (remainder).
preprocessor = ColumnTransformer(
    [
        (
            'target_encoding',
            # TargetEncoder cross-fits internally with shuffling; seed it so
            # repeated runs produce the same encoding, in line with the
            # random_state=42 used everywhere else in this notebook.
            TargetEncoder(target_type='continuous', random_state=42),
            ['Podcast_Name']
        ),
        (
            'onehot_encoding',
            # Categories unseen during fit are encoded as all zeros.
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
        ),
        (
            'missing_imputation',
            # Podcast_Name is passed in only as the grouping key; the imputer
            # returns just the three numeric columns.
            GroupImputer(
                group_col='Podcast_Name',
                impute_cols=['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
            ),
            ['Podcast_Name', 'Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
        )
    ],
    # NOTE(review): passthrough columns reach the regressor untouched, so they
    # are presumably numeric and free of missing values — confirm.
    remainder='passthrough'
)

# Complete model: preprocessing followed by a seeded random forest.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Distributions sampled by the randomized search. scipy's randint excludes
# its upper bound, so e.g. randint(150, 201) draws integers from 150 to 200.
param_grid = dict(
    regressor__n_estimators=randint(150, 201),
    regressor__max_depth=randint(4, 9),
    regressor__min_samples_split=randint(2, 5),
    regressor__min_samples_leaf=randint(1, 4),
    regressor__max_features=['sqrt', 'log2', None],
)

# Randomized hyper-parameter search: 50 sampled configurations, each scored
# with 5-fold cross-validation on negated RMSE (scikit-learn maximizes
# scores), running on all available cores with a fixed seed.
search_options = dict(
    n_iter=50,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
grid_search = RandomizedSearchCV(pipeline, param_grid, **search_options)

# Fit on the training split only; the validation split stays unseen.
grid_search.fit(x_train, y_train)

# Evaluate the refitted best pipeline on the held-out validation split.
y_pred = grid_search.predict(x_valid)
score = root_mean_squared_error(y_valid, y_pred)

# Label is Portuguese for "RMSE (validation)".
print(f'RMSE (validação): {score:.4f}')

In [None]:
# Episode number parsed from the digits in the title (the same expression
# that builds 'Episode_Index' earlier in the notebook). expand=False makes
# str.extract return a Series rather than a one-column DataFrame.
df_train['Episode_Number'] = (
    df_train['Episode_Title'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Fill missing episode lengths with the podcast's mean length. The built-in
# 'mean' transform is vectorised, unlike a per-group Python lambda. Values
# that are present are never touched, and podcasts with no observed length
# fall back to the overall mean, mirroring GroupImputer's behaviour.
podcast_mean_length = (
    df_train.groupby('Podcast_Name')['Episode_Length_minutes'].transform('mean')
)
df_train['Episode_Length_minutes'] = (
    df_train['Episode_Length_minutes']
    .fillna(podcast_mean_length)
    .fillna(df_train['Episode_Length_minutes'].mean())
)

# NOTE(review): this is the mean of the target over ALL rows of df_train. If
# it is ever used as a model feature, it leaks the target of the rows being
# predicted; compute it on training folds only (or use a TargetEncoder inside
# the pipeline, as is done for Podcast_Name).
df_train['Genre_encoded'] = df_train.groupby('Genre')['Listening_Time_minutes'].transform('mean')

In [None]:
# Overview of the training-split features via the project's quick_eda helper
# (defined in the local `utils` module; its output is not shown here).
# x_train was split off before df_train's episode lengths were imputed in the
# previous cell, so this presumably shows the raw, un-imputed features —
# NOTE(review): confirm that this is the frame you intend to inspect.
quick_eda(x_train)