In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn
from hyperopt import hp, tpe, fmin
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# Fixed random seed. It is not referenced in the cells visible here; presumably
# later cells pass it to splitters/samplers/models for reproducibility (TODO confirm).
seed = 798589991

In [None]:
# Load the raw competition dataset (path is relative to the notebook's working directory).
comp_data = pd.read_csv("data/competition_data.csv")

Cosas para hacer:

- Platform -> pasar a una categoría que sea simplemente Desktop o Mobile.
- Chequear si algún uid / user_id se repite; si no, no nos está dando data y los podemos eliminar.
- Garantía -> pasar a binario: tiene o no tiene.
- Foto -> no nos sirve a no ser que de alguna manera determinemos si es buena o mala calidad (bastante complicado a priori), eliminar.
- Separar date en año, mes, día, hora.
- Deal print -> no parece aportar nada, son todos distintos, eliminar.
- Category id, domain id, full name. Con category y domain tenemos la misma data que full name; podríamos eliminar full name y ver cómo funciona, porque su OHE va a ser eterno.
- etl version es siempre lo mismo, eliminar.
- title, product id e item id nos dan la misma información; dejar una.
- "benefit ignore should be dropped" -> eliminar benefit
- "decimals ignore should be dropped" -> eliminar decimals
- hay descuento? -> original_price - price != 0, crear columna "in_discount"
- rn leftover from ETL, discard -> eliminar rn.
- ver si desagregar tags puede aportar algo

In [None]:
# Drop the columns that are not used as features.
# A single drop call is atomic: if any listed column is missing, pandas raises
# KeyError before removing anything, instead of leaving the frame half-mutated
# as fourteen separate in-place drops would.
columns_to_drop = [
    'benefit',
    'user_id',
    'uid',
    'main_picture',
    'category_id',
    'domain_id',
    'deal_print_id',
    'etl_version',
    'product_id',
    'title',
    'site_id',
    'item_id',
    'print_server_timestamp',
    'accepts_mercadopago',
]
comp_data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Reduce `platform` to its third '/'-separated segment (per the original note the
# expected values are desktop = web on a computer, ios, android, mobile = mobile web),
# and split the ' -> '-separated `full_name` path: its first level stays in
# `full_name`, its last level goes to the new `type_product` column.
#
# Vectorised on purpose. The previous row loop assigned through chained indexing
# (`comp_data['platform'][i] = ...`), which triggers SettingWithCopyWarning, is not
# guaranteed to write back into `comp_data` (and never does under pandas
# copy-on-write), only works with a default 0..n-1 index, and is very slow.
# Rows with a missing value or too few segments now yield NaN instead of raising.
category_path = comp_data['full_name'].str.split(' -> ')  # pattern has no regex metacharacters
type_of_product = category_path.str[-1]

comp_data['platform'] = comp_data['platform'].str.split('/').str[2]
comp_data['full_name'] = category_path.str[0]
comp_data['type_product'] = type_of_product

In [None]:
# Turn the free-text `warranty` column into a numeric flag:
#   0.0 -> the listing explicitly says "Sin garantía" (no warranty)
#   1.0 -> any other non-missing warranty text
#   NaN -> warranty not informed
#
# Two fixes over the previous version:
#   * no hard-coded row count (199972), so the cell works for any dataset size;
#   * missing values stay NaN. The former `.astype(bool)` silently turned NaN
#     into True because NaN is truthy, losing the "unknown" state this cell is
#     meant to keep. A float column is used because plain bool cannot hold NaN.
warranty_text = comp_data['warranty']
has_warranty = (warranty_text != "Sin garantía").astype(float)
comp_data['binary_warranty'] = has_warranty.where(warranty_text.notna())

comp_data.drop(columns='warranty', inplace=True)

In [None]:
# Add the discount as a percentage of the original price, then discard the
# original price column.
list_price = comp_data['original_price']
discount = (list_price - comp_data['price']).div(list_price).mul(100)
comp_data['discount_%'] = discount

del comp_data['original_price']

In [None]:
# Collect every distinct tag, in order of first appearance.
# Each `tags` value is a string such as "[tag_a, tag_b]": strip the surrounding
# brackets and split on ', '.
#
# The loop variable is deliberately NOT called `list`: in a notebook that name
# would shadow the builtin for every later cell in the same kernel session.
# A companion set keeps the membership check O(1) while `unique_tags` stays an
# ordered list.
unique_tags = []
seen_tags = set()
for raw_tags in comp_data['tags']:
    for tag_name in raw_tags[1:-1].split(', '):
        # An empty tag list ("[]") splits into [''] -- skip that empty name.
        if tag_name and tag_name not in seen_tags:
            seen_tags.add(tag_name)
            unique_tags.append(tag_name)

In [None]:
# Expand the tags into one boolean column per tag.
# Each row's "[tag_a, tag_b]" string is parsed once into a set of tag names, so
# the test is exact membership. The previous `tag in x` ran against the raw
# string, i.e. a substring test, which flags a row whenever the tag name merely
# appears inside a longer tag name.
row_tag_sets = comp_data['tags'].map(lambda raw_tags: set(raw_tags[1:-1].split(', ')))

for tag in unique_tags:
    comp_data[tag] = row_tag_sets.map(lambda row_tags: tag in row_tags)

comp_data.drop(columns='tags', inplace=True)

In [None]:
# Break the date into calendar features (month, day of month, day of week) and
# discard the raw column.
timestamps = pd.to_datetime(comp_data['date'])
calendar_features = {
    'month': timestamps.dt.month,
    'day': timestamps.dt.day,
    'day_of_week': timestamps.dt.dayofweek,
}
for feature_name, feature_values in calendar_features.items():
    comp_data[feature_name] = feature_values

del comp_data['date']

In [None]:
# One-hot encode the categorical columns and swap them for their dummy columns.
cols_to_encode = [
    'full_name',
    'listing_type_id',
    'logistic_type',
    'platform',
    'type_product',
]
comp_data_encoded = pd.get_dummies(comp_data[cols_to_encode])
non_encoded_part = comp_data.drop(columns=cols_to_encode)
comp_data = pd.concat([non_encoded_part, comp_data_encoded], axis=1)

In [None]:
# Before training the model, convert the boolean values to integers. A value
# replacement is used (rather than a dtype cast) to avoid problems with NaNs.
bool_to_int = {False: 0, True: 1}
comp_data = comp_data.replace(bool_to_int)

In [None]:
# comp_data.to_csv("data/eng_data.csv")
# comp_data.to_pickle("data/eng_data.pkt")