In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn
from hyperopt import hp, tpe, fmin
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

seed = 798589991

In [3]:
comp_data = pd.read_csv("data/competition_data.csv")

Cosas para hacer:

- Platform -> pasar a una categoria que sea simplemente Desktop o Mobile.
- Chequear si algun uid / user_id se repite, si no, no nos esta dando data y los podemos eliminar.
- Garantia -> pasar a binario tiene o no tiene.
- Foto -> no nos sirve a no ser que de alguna manera determinemos si es buena o mala calidad (bastante complicado a priori), eliminar.
- Separar date en año, mes, dia, hora.
- Deal print -> no parece aportar nada, son todos distintos, eliminar.
- Category id, domain id, full name. Con category y domain tenemos la misma data que full name, podriamos eliminar full name y ver como funciona, porque su OHE va a ser eterno.
- etl version es siempre lo mismo, eliminar.
- title, product id e item id nos dan la misma informacion, dejar una.
- "benefit ignore should be dropped" -> eliminar benefit
- "decimals ignore should be dropped" -> eliminar decimals
- hay descuento? -> original_price - price != 0, crear columna "in_discount"
- rn leftover from ETL, discard -> eliminar rn.
- ver si desagregar tags puede aportar algo

In [4]:
# Dropeo las columnas.
comp_data.drop('benefit', inplace=True, axis=1)
comp_data.drop('user_id', inplace=True, axis=1)
comp_data.drop('uid', inplace=True, axis=1)
comp_data.drop('main_picture', inplace=True, axis=1)
comp_data.drop('category_id', inplace=True, axis=1)
comp_data.drop('domain_id', inplace=True, axis=1)
comp_data.drop('deal_print_id', inplace=True, axis=1)
comp_data.drop('etl_version', inplace=True, axis=1)
comp_data.drop('product_id', inplace=True, axis=1)
comp_data.drop('title', inplace=True, axis=1)
comp_data.drop('site_id', inplace=True, axis=1)
comp_data.drop('item_id', inplace=True, axis=1)
comp_data.drop('print_server_timestamp', inplace=True, axis=1)

In [10]:
# Divido platform en solo desktop (web desde computadora), ios (app ios), android (app android), mobile (web desde celular).

type_of_product = []
for i in range(len(comp_data['platform'])):
    check_plat = comp_data['platform'][i].split('/')
    item_split = comp_data['full_name'][i].split(' -> ')
    comp_data['platform'][i] = check_plat[2]
    comp_data['full_name'][i] = item_split[0]
    type_of_product.append(item_split[-1])

comp_data['type_product'] = type_of_product

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_data['platform'][i] = check_plat[2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_data['full_name'][i] = item_split[0]


In [5]:
# Transformo garantía en una columna binaria (True, False, NaN)
warranty = [True] * 199972
for i in range(len(comp_data['warranty'])):
    if comp_data['warranty'][i] == "Sin garantía":
        warranty[i] = False
    else:
        if pd.notna(comp_data['warranty'][i]):
            warranty[i] = True
        else:
            warranty[i] = np.nan

comp_data['binary_warranty'] = warranty
comp_data['binary_warranty'] = comp_data['binary_warranty'].astype(bool)

comp_data.drop('warranty', inplace=True, axis=1)

In [6]:
# Creo una columna con el descuento (en porcentaje).

discount = ((comp_data['original_price'] - comp_data['price']) / comp_data['original_price']) * 100
comp_data['discount_%'] = discount

comp_data.drop('original_price', inplace=True, axis=1)

In [7]:
# Consigo los tags posibles.
unique_tags = []
for list in comp_data['tags']:
    list_split = list[1:len(list)-1].split(', ')
    for item in list_split:
        if not (item in unique_tags):
            unique_tags.append(item)

In [8]:
# Separo los tags en columnas de booleanos.
for tag in unique_tags:
    comp_data[tag] = comp_data['tags'].apply(lambda x: tag in x)

comp_data.drop('tags', inplace=True, axis=1)

In [9]:
# Hacer algo inteligente con la date.
comp_data['date'] = pd.to_datetime(comp_data['date'])
comp_data['month'] = comp_data['date'].dt.month
comp_data['day'] = comp_data['date'].dt.day
comp_data['day_of_week'] = comp_data['date'].dt.dayofweek
comp_data.drop('date', inplace=True, axis=1)

In [11]:
cols_to_encode = ['full_name', 'listing_type_id', 'logistic_type', 'platform', 'type_product']
comp_data_encoded = pd.get_dummies(comp_data[cols_to_encode])
comp_data = pd.concat([comp_data, comp_data_encoded], axis=1)
comp_data.drop(columns=cols_to_encode, inplace=True, axis=1)

In [12]:
# Antes de empezar el entrenamiento del modelo, paso a int las columnas de booleano. Lo hago así para no tener problemas con los NaNs
comp_data.replace({False: 0, True: 1}, inplace=True)

In [17]:
# Dividimos entre la data que tenemos y la de evaluación para submitear.

# La información que tenemos para entrenar y validar.
local_data = comp_data[comp_data["ROW_ID"].isna()]

# La información en la que no tenemos las y, para predecir con el modelo ya entrenado y subir a Kaggle.
kaggle_data = comp_data[comp_data["ROW_ID"].notna()] 

# Entrenamos un modelo de xgboost
y = local_data[['conversion']].copy()
X = local_data.drop(columns=['conversion', 'ROW_ID'], axis = 1)

val_test_size = 0.3 # Proporción de la suma del test de validación y del de test.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = val_test_size, random_state = seed, stratify = y)

# # Define the search space for hyperparameters
# space = {
#     'max_depth': hp.choice('max_depth', range(1, 10)),
#     'learning_rate': hp.loguniform('learning_rate', -5, 0),
#     'n_estimators': hp.choice('n_estimators', range(50, 300)),
#     'gamma': hp.loguniform('gamma', -5, 0),
#     'subsample': hp.uniform('subsample', 0.5, 1.0),  # Add subsample
#     'min_child_weight': hp.choice('min_child_weight', range(1, 10)),  # Add min_child_weight
#     'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),  # Add colsample_bytree
#     'reg_lambda': hp.loguniform('reg_lambda', -5, 5),  # Add reg_lambda
# }

# # Objective function for hyperparameter optimization
# def objective(params):
#     cls = xgb.XGBClassifier(
#         objective='binary:logistic',
#         seed=seed,
#         eval_metric='auc',
#         max_depth=params['max_depth'],
#         learning_rate=params['learning_rate'],
#         n_estimators=params['n_estimators'],
#         gamma=params['gamma'],
#         subsample=params['subsample'],  # Use subsample
#         min_child_weight=params['min_child_weight'],  # Use min_child_weight
#         colsample_bytree=params['colsample_bytree'],  # Use colsample_bytree
#         reg_lambda=params['reg_lambda'],  # Use reg_lambda
#     )

#     cls.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    
#     y_pred = cls.predict_proba(X_val)[:, 1]
#     auc_roc = sklearn.metrics.roc_auc_score(y_val, y_pred)
    
#     return -auc_roc  # We want to maximize AUC-ROC, so we negate it for minimization

# # Set up Hyperopt search
# best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=350)  # Adjust max_evals as needed

# # Print the best hyperparameters
# print("Best Hyperparameters:")
# print(best)

best_max_depth = 6
best_learning_rate = 0.09512855850107411
best_n_estimators = 224
best_gamma = 0.10113552100045467
best_subsample = 0.9158989170972806
best_min_child_weight = 3
best_colsample_bytree = 0.9000335771350197
best_reg_lambda = 12.654060098206006

final_cls = make_pipeline(StandardScaler(), SimpleImputer(), xgb.XGBClassifier(
        objective='binary:logistic',
        seed=seed,
        eval_metric='auc',
        max_depth=best_max_depth,
        learning_rate=best_learning_rate,
        n_estimators=best_n_estimators,
        gamma=best_gamma,
        subsample=best_subsample,
        min_child_weight=best_min_child_weight,
        colsample_bytree=best_colsample_bytree,
        reg_lambda=best_reg_lambda,
    )
)

final_cls.fit(X_train, y_train)


# Chequeamos el valor debajo de la curva AUC-ROC
y_pred = final_cls.predict_proba(X_val)[:, 1]
auc_roc = sklearn.metrics.roc_auc_score(y_val, y_pred)
print('AUC-ROC validación: %0.5f' % auc_roc)

AUC-ROC validación: 0.88985


In [19]:
# Para hacer submit.
all_data_cls = make_pipeline(StandardScaler(), SimpleImputer(), xgb.XGBClassifier(
    objective='binary:logistic',
    seed=seed,
    eval_metric='auc',
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    gamma=best_gamma,
    subsample=best_subsample,
    min_child_weight=best_min_child_weight,
    colsample_bytree=best_colsample_bytree,
    reg_lambda=best_reg_lambda,
))

all_data_cls.fit(X, y)

# Predicción en la data de kaggle para submitear.
kaggle_data = kaggle_data.drop(columns=["conversion"])
y_preds = all_data_cls.predict_proba(kaggle_data.drop(columns=["ROW_ID"]))[:, final_cls.classes_ == 1].squeeze()

# Generamos el archivo para submit en base a lo predicho.
submission_df = pd.DataFrame({"ROW_ID": kaggle_data["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("xgboost_model/xgboost_model.csv", sep=",", index=False)