In [37]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import confusion_matrix

seed = 798589991

In [2]:
comp_data = pd.read_csv("data/competition_data.csv")

Cosas para hacer:

- Platform -> pasar a una categoria que sea simplemente Desktop o Mobile.
- Chequear si algun uid / user_id se repite, si no, no nos esta dando data y los podemos eliminar.
- Garantia -> pasar a binario tiene o no tiene.
- Foto -> no nos sirve a no ser que de alguna manera determinemos si es buena o mala calidad (bastante complicado a priori), eliminar.
- Separar date en año, mes, dia, hora.
- Deal print -> no parece aportar nada, son todos distintos, eliminar.
- Category id, domain id, full name. Con category y domain tenemos la misma data que full name, podriamos eliminar full name y ver como funciona, porque su OHE va a ser eterno.
- etl version es siempre lo mismo, eliminar.
- title, product id e item id nos dan la misma informacion, dejar una.
- "benefit ignore should be dropped" -> eliminar benefit
- "decimals ignore should be dropped" -> eliminar decimals
- hay descuento? -> original_price - price != 0, crear columna "in_discount"
- rn leftover from ETL, discard -> eliminar rn.
- ver si desagregar tags puede aportar algo

In [3]:
# Dropeo las columnas.
comp_data.drop('benefit', inplace=True, axis=1)
comp_data.drop('user_id', inplace=True, axis=1)
comp_data.drop('uid', inplace=True, axis=1)
comp_data.drop('main_picture', inplace=True, axis=1)
comp_data.drop('full_name', inplace=True, axis=1)
comp_data.drop('deal_print_id', inplace=True, axis=1)
comp_data.drop('etl_version', inplace=True, axis=1)
comp_data.drop('product_id', inplace=True, axis=1)
comp_data.drop('title', inplace=True, axis=1)
comp_data.drop('site_id', inplace=True, axis=1)

In [4]:
# Divido platform en solo Desktop, Ios, Android
for i in range(len(comp_data['platform'])):
    check_plat = comp_data['platform'][i].split('/')
    comp_data['platform'][i] = check_plat[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_data['platform'][i] = check_plat[2]


In [None]:
# Transformo garantía en una columna binaria (True, False, NaN)
warranty = [True] * 199972
for i in range(len(comp_data['warranty'])):
    if comp_data['warranty'][i] == "Sin garantía":
        warranty[i] = False
    else:
        if pd.notna(comp_data['warranty'][i]):
            warranty[i] = True
        else:
            warranty[i] = np.nan

comp_data['binary_warranty'] = warranty
comp_data.drop('warranty', inplace=True, axis=1)

In [24]:
# Creo una columna con el descuento (en porcentaje).

discount = ((comp_data['original_price'] - comp_data['price']) / comp_data['original_price']) * 100
comp_data['discount_%'] = discount

comp_data.drop('original_price', inplace=True, axis=1)

In [17]:
# Consigo los tags posibles.
unique_tags = []
for list in comp_data['tags']:
    list_split = list[1:len(list)-1].split(', ')
    for item in list_split:
        if not (item in unique_tags):
            unique_tags.append(item)

unique_tags

['good_quality_picture',
 'good_quality_thumbnail',
 'today_promotion',
 'brand_verified',
 'extended_warranty_eligible',
 'immediate_payment',
 'cart_eligible',
 'incomplete_technical_specs',
 'loyalty_discount_eligible',
 'deal_of_the_day',
 'dragged_bids_and_visits',
 'lightning_deal',
 'poor_quality_picture',
 'ahora-12',
 'catalog_listing_eligible',
 'poor_quality_thumbnail',
 'supermarket_eligible',
 'under_infractions']

In [23]:
# Separo los tags en columnas de booleanos.
for tag in unique_tags:
    comp_data[tag] = comp_data['tags'].apply(lambda x: tag in x)

comp_data.drop('tags', inplace=True, axis=1)

In [26]:
# One Hot Encoding

columns_to_encode = ['category_id', 'item_id', 'listing_type_id', 'logistic_type', 'platform']

# Apply one-hot encoding to categorical columns
encoded_columns = pd.get_dummies(comp_data[columns_to_encode], drop_first=True)

# Concatenate the original DataFrame and the one-hot encoded columns
comp_data_encoded = pd.concat([comp_data, encoded_columns], axis=1)

# Drop the original categorical columns
comp_data_encoded.drop(columns=columns_to_encode, inplace=True)

In [None]:
comp_data['binary_warranty'] = comp_data['binary_warranty'].astype(bool)
comp_data = pd.get_dummies(comp_data, columns=['platform'])

In [62]:
object_columns = ['category_id']
comp_data.drop(columns=object_columns, inplace=True, axis=1)

KeyError: "['date', 'domain_id', 'is_pdp', 'item_id', 'listing_type_id', 'logistic_type', 'print_server_timestamp'] not found in axis"

In [61]:
# Dividimos entre la data que tenemos y la de evaluación para submitear.

# La información que tenemos para entrenar y validar.
local_data = comp_data[comp_data["ROW_ID"].isna()]

# La información en la que no tenemos las y, para predecir con el modelo ya entrenado y subir a Kaggle.
kaggle_data = comp_data[comp_data["ROW_ID"].notna()] 

# Entrenamos un modelo de xgboost
y = local_data[['conversion']].copy()
X = local_data.drop(columns=['conversion', 'ROW_ID'], axis = 1)

val_test_size = 0.3 # Proporción de la suma del test de validación y del de test.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = val_test_size, random_state = seed, stratify = y)

cls = xgb.XGBClassifier(objective = 'binary:logistic', seed = seed, eval_metric = 'auc')
cls.fit(X_train, y_train, eval_set = [(X_val, y_val)])

# Chequeamos el valor debajo de la curva AUC-ROC
y_pred = cls.predict_proba(X_val)[:, 1]
auc_roc = sklearn.metrics.roc_auc_score(y_val, y_pred)
print('AUC-ROC validación: %0.5f' % auc_roc)


# Predicción en la data de kaggle para submitear.
kaggle_data = kaggle_data.drop(columns=["conversion"])
y_preds = cls.predict_proba(kaggle_data.drop(columns=["ROW_ID"]))[:, cls.classes_ == 1].squeeze()

# Generamos el archivo para submit en base a lo predicho.
submission_df = pd.DataFrame({"ROW_ID": kaggle_data["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("xgboost_model/xgboost_model.csv", sep=",", index=False)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:category_id: object