In [None]:
import numpy as np
import pandas as pd


from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer, \
  RobustScaler, FunctionTransformer, LabelEncoder

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.svm import SVR, NuSVR, LinearSVR
from pprint import pformat, pprint

In [None]:
SEED=13

In [None]:
def RMSE(y, yhat) -> float:
    return np.sqrt(np.mean(np.square(y - yhat)))

def MAE(y, yhat) -> float:
    return np.mean(np.absolute(yhat - y))

def MAPE(y, yhat) -> float:
    return np.mean(np.absolute((y - yhat) / y)) * 100

In [None]:
df = pd.read_csv('./Facebook_metrics/dataset_Facebook.csv', sep=';', engine='python')
new_columns = {'Lifetime People who have liked your Page and engaged with your post':'LPE'}
df.rename(columns=new_columns, inplace=True)

df = df[['Category', 'Page total likes', 'Type', 'Post Month', 'Post Hour', 'Post Weekday', 'Paid', 'LPE']].copy()

y = df['LPE']
X = df.loc[:, df.columns != 'LPE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=SEED)

print( "X Entrenamiento y validación",  X_train.shape)
print( "X Pruebas", X_test.shape)

print( "Y Entrenamiento y validación", y_train.shape)
print( "Y Pruebas", y_test.shape)


In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
numeric_features = ['Page total likes']


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])
categorical_features = ['Category', 'Paid', 'Post Month', 'Post Hour', 'Post Weekday']

onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])
onehot_features = ['Type']


ColumnPreprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_features),
    ('onehot',      onehot_transformer,      onehot_features),
    ('numeric',     numeric_transformer,     numeric_features),
])


# mi_regressor = RandomForestRegressor()



In [None]:
repeated_kfold = RepeatedKFold(n_splits=5, n_repeats=3, random_state=SEED)

In [46]:
hidden = [(i, i) for i in range(4, 22, 4)] + [(100,), (100,100)]

print(hidden)

[(4, 4), (8, 8), (12, 12), (16, 16), (20, 20), (100,), (100, 100)]


In [49]:
mi_regressor = TransformedTargetRegressor(
    regressor=MLPRegressor(), 
    func=np.log1p,
    inverse_func=np.expm1
)

fullPipeline = Pipeline(steps=[
    ('preprocessor', ColumnPreprocessor),
    ('transformed_regressor', mi_regressor)
])

hidden_layers =[(i, i) for i in range(5, 20, 3)] + [(100,), (100,100)]
param_grid = {    
    'transformed_regressor__regressor__max_iter': [5000],
    'transformed_regressor__regressor__hidden_layer_sizes': hidden_layers,
    'transformed_regressor__regressor__alpha': [0.0001, 0.001],
    'transformed_regressor__regressor__learning_rate_init': [0.001, 0.01, 0.1],    
}

search = GridSearchCV(
    fullPipeline, 
    param_grid, 
    cv=repeated_kfold,
    scoring=make_scorer(MAPE, greater_is_better=True),
    n_jobs=-1,
)

search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
pprint(search.best_params_)

Best parameter (CV score=104.755):
{'transformed_regressor__regressor__alpha': 0.001,
 'transformed_regressor__regressor__hidden_layer_sizes': (100, 100),
 'transformed_regressor__regressor__learning_rate_init': 0.001,
 'transformed_regressor__regressor__max_iter': 5000}


In [None]:
mi_regressor = TransformedTargetRegressor(
    regressor=RandomForestRegressor(), 
    func=np.log1p,
    inverse_func=np.expm1
)

# mi_regressor = RandomForestRegressor()

fullPipeline = Pipeline(steps=[
    ('preprocessor', ColumnPreprocessor),
    ('transformed_regressor', mi_regressor)
])


param_grid = {    
    'transformed_regressor__regressor__n_estimators': [50, 100, 200],
    'transformed_regressor__regressor__max_features': ['sqrt', 'log2'],
    'transformed_regressor__regressor__max_depth': [4, 5, 5, 6, 7, 8],
    'transformed_regressor__regressor__criterion': ['squared_error', 'absolute_error'],
}

search = GridSearchCV(
    fullPipeline, 
    param_grid, 
    cv=repeated_kfold,
    scoring=make_scorer(MAPE, greater_is_better=True),
    n_jobs=-1,
)

search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
pprint(search.best_params_)

In [50]:


mi_regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log1p,
    inverse_func=np.expm1
)

fullPipeline = Pipeline(steps=[
    ('preprocessor', ColumnPreprocessor),
    ('transformed_regressor', mi_regressor)
])

param_grid = {
    "preprocessor__numeric__imputer__strategy": ['mean', 'median'],
    # "preprocessor__numeric__scaler": [RobustScaler(), PowerTransformer(), None],
    "preprocessor__categorical__imputer__strategy": ['most_frequent', 'constant'],
    # "transformed_regressor__transformer": [None, PowerTransformer(), RobustScaler()],
}

search = GridSearchCV(
    fullPipeline, 
    param_grid, 
    cv=repeated_kfold,
    scoring=make_scorer(MAPE, greater_is_better=True),
    n_jobs=-1,
)

search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
pprint(search.best_params_)

Best parameter (CV score=85.814):
{'preprocessor__categorical__imputer__strategy': 'most_frequent',
 'preprocessor__numeric__imputer__strategy': 'mean'}


In [61]:


mi_regressor = TransformedTargetRegressor(
    regressor=SVR(),
    func=np.log1p,
    inverse_func=np.expm1
)

fullPipeline = Pipeline(steps=[
    ('preprocessor', ColumnPreprocessor),
    ('transformed_regressor', mi_regressor)
])

param_grid = {
    "transformed_regressor__regressor__kernel": ['rbf', 'linear', 'poly'],
    "transformed_regressor__regressor__C": [0.0001, 0.001, 0.01],
    "transformed_regressor__regressor__gamma": ['scale', 'auto'],
    "transformed_regressor__regressor__degree": [3, 4],
}

search = GridSearchCV(
    fullPipeline, 
    param_grid, 
    cv=repeated_kfold,
    scoring=make_scorer(MAPE, greater_is_better=True),
    n_jobs=-1,
)

search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
pprint(search.best_params_)

Best parameter (CV score=92.876):
{'transformed_regressor__regressor__C': 0.0001,
 'transformed_regressor__regressor__degree': 3,
 'transformed_regressor__regressor__gamma': 'scale',
 'transformed_regressor__regressor__kernel': 'rbf'}
