In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import gensim
import re
import nltk

seed = 798589991

In [2]:
# Load the competition dataset from a CSV file (path is relative to the working directory).
comp_data = pd.read_csv("data/competition_data.csv")

In [3]:
# Show the column labels of the loaded frame.
comp_data.columns

Index(['accepts_mercadopago', 'available_quantity',
       'avg_gmv_item_domain_30days', 'avg_gmv_item_sel', 'avg_gmv_seller_bday',
       'avg_qty_orders_item_domain_30days', 'avg_qty_orders_item_sel_30days',
       'avg_si_item_sel_30day', 'benefit', 'boosted', 'category_id',
       'conversion', 'date', 'deal_print_id', 'domain_id', 'etl_version',
       'free_shipping', 'fulfillment', 'full_name', 'health', 'is_pdp',
       'product_id', 'item_id', 'listing_type_id', 'logistic_type',
       'main_picture', 'offset', 'original_price', 'platform', 'price',
       'print_position', 'print_server_timestamp', 'qty_items_dom',
       'qty_items_sel', 'site_id', 'sold_quantity', 'tags', 'title',
       'total_asp_item_domain_30days', 'total_asp_item_sel_30days',
       'total_gmv_domain_bday', 'total_gmv_item_30days', 'total_items_domain',
       'total_items_seller', 'total_orders_domain_30days',
       'total_orders_item_30days', 'total_orders_sel_30days',
       'total_si_domain_30days

In [4]:
# Display the print timestamp side by side with the `date` column.
comp_data[['print_server_timestamp', 'date']]

Unnamed: 0,print_server_timestamp,date
0,2020-03-06T00:19:30.735-0400,2020-03-06
1,2020-04-01T21:20:11.738-0400,2020-04-01
2,2020-04-08T18:38:48.360-0400,2020-04-08
3,2020-04-25T22:01:19.829-0400,2020-04-25
4,2020-03-10T13:20:56.633-0400,2020-03-10
...,...,...
199967,2020-04-12T14:35:44.784-0400,2020-04-12
199968,2020-04-03T17:32:53.035-0400,2020-04-03
199969,2020-04-01T22:51:08.898-0400,2020-04-01
199970,2020-04-08T11:32:27.563-0400,2020-04-08


In [5]:
# Count rows per value of the third '/'-separated segment of `platform`.
comp_data['platform'].str.split('/').str[2].value_counts()

android    124781
desktop     35892
mobile      25092
ios         14207
Name: platform, dtype: int64

In [6]:
# Count how many rows have a `warranty` text whose first whitespace-separated
# word, upper-cased, is exactly 'SIN' (True) versus everything else (False).
comp_data['warranty'].str.split().str[0].str.upper().eq('SIN').value_counts()

False    176190
True      23782
Name: warranty, dtype: int64

In [7]:
# counts_ult = comp_data['full_name'].str.split(' -> ').str[-1].value_counts()
# counts_pri = comp_data['full_name'].str.split(' -> ').str[0].value_counts()
# tal vez dejar la más amplia como OHE y la otra hacer algo de word2vec?

In [8]:
# Drop a fixed set of columns in a single call.
# NOTE: `title` is deliberately not in this list; the open question in the
# original notebook was whether to feed it to Word2Vec, which a later cell does.
unused_columns = [
    'benefit',
    'user_id',
    'uid',
    'main_picture',
    'category_id',
    'domain_id',
    'deal_print_id',
    'etl_version',
    'product_id',
    'site_id',
    'item_id',
    'date',
    'accepts_mercadopago',
]
comp_data.drop(columns=unused_columns, inplace=True)

In [9]:
# Reduce `platform` to its third '/'-separated segment so that desktop, ios,
# android and mobile can be told apart (presumably '/web/mobile' is the mobile
# browser and '/mobile/ios' the app -- assumption carried over from the original note).
# NOTE: not idempotent -- running this twice leaves NaN, because the reduced
# value no longer has a third segment.
platform_segments = comp_data['platform'].str.split('/')
comp_data['platform'] = platform_segments.str.get(2)

In [10]:
# Split the ' -> '-separated category path once, then keep its first and last levels.
category_path = comp_data['full_name'].str.split(' -> ')
comp_data['category_first'] = category_path.str.get(0)
comp_data['category_last'] = category_path.str.get(-1)

# Remove the original path column.
comp_data.drop(columns='full_name', inplace=True)

In [11]:
# Turn the free-text warranty into a three-state flag:
#   False -> the first word, upper-cased, is 'SIN'
#   True  -> any other text
#   NaN   -> the warranty value is missing
# There is intentionally no trailing `.astype(bool)`: bool(NaN) is True, so that
# cast silently turned every missing warranty into True and lost the NaN state.
has_warranty = ~comp_data['warranty'].str.split().str[0].str.upper().eq('SIN')
comp_data['warranty'] = has_warranty.where(comp_data['warranty'].notna())

In [12]:
# Discount as a percentage of the original price; the integer cast drops the
# fractional part.
price_cut = comp_data['original_price'] - comp_data['price']
discount = ((price_cut / comp_data['original_price']) * 100).astype(int)
comp_data['discount_%'] = discount

# Remove the original price column.
comp_data.drop(columns='original_price', inplace=True)

In [13]:
# Parse every row's tag string exactly once.
# Assumes each value is a bracketed, ', '-separated list such as "[a, b]"
# (this is what the slicing and split rely on) -- TODO confirm against the data.
tag_lists = comp_data['tags'].map(lambda raw_tags: raw_tags[1:-1].split(', '))

# Distinct tags in order of first appearance. This avoids the quadratic
# "append if not already in list" scan and no longer shadows the builtin `list`.
unique_tags = tag_lists.explode().unique().tolist()

# One boolean column per tag, using exact membership in the row's parsed tags.
# A substring test on the raw string would also fire when a tag is merely
# contained in a longer tag name. An empty tag list parses to [''], so a column
# named '' (if present) now marks the rows that have no tags.
tag_sets = tag_lists.map(set)
for tag in unique_tags:
    comp_data[tag] = tag_sets.map(lambda row_tags, tag=tag: tag in row_tags)

comp_data.drop('tags', inplace=True, axis=1)

In [14]:
# Derive calendar and clock features from the print timestamp.
timestamps = pd.to_datetime(comp_data['print_server_timestamp'])

# Output column name -> attribute of the `.dt` accessor it is read from.
time_features = {
    'month': 'month',
    'day': 'day',
    'day_of_week': 'dayofweek',
    'hour': 'hour',
    'minute': 'minute',
    'second': 'second',
}
for column, attribute in time_features.items():
    comp_data[column] = getattr(timestamps.dt, attribute)

# Remove the raw timestamp column.
comp_data.drop(columns='print_server_timestamp', inplace=True)

### Word2Vec

In [15]:
def tokenizer(raw_text):
    """
    Split raw text into lower-cased word tokens, grouped by sentence.

    Tokens whose first character is not an ASCII letter (A-Z / a-z) are
    discarded, e.g. tokens starting with a digit, punctuation or an accented
    letter.

    Args:
        raw_text (str): Unprocessed text.

    Returns:
        list: One list per sentence, each holding that sentence's kept tokens
            in lower case.
    """
    ascii_letter = re.compile("[A-Za-z]")
    tokenized = []
    for sentence in sent_tokenize(raw_text):
        words = []
        for token in word_tokenize(sentence):
            # Only the first character decides whether the token is kept.
            if ascii_letter.search(token[0]):
                words.append(token.lower())
        tokenized.append(words)
    return tokenized

def average_vectors(title_tokens, model, stopwords=None):
    """
    Average the word vectors of all usable tokens of a tokenized text.

    A token is usable when it is present in ``model.wv`` and, if ``stopwords``
    is given, is not one of the stopwords.

    Args:
        title_tokens (list): List of sentences, each a list of tokens.
        model (gensim.models.Word2Vec): Model whose ``wv`` supplies the vectors.
        stopwords (set, optional): Tokens to leave out. Defaults to None
            (nothing is left out).

    Returns:
        numpy.ndarray: Element-wise mean of the usable tokens' vectors, or a
            zero vector of length ``model.wv.vector_size`` when no token is usable.
    """
    keyed_vectors = model.wv
    excluded = stopwords if stopwords is not None else ()
    usable = [
        word
        for sentence in title_tokens
        for word in sentence
        if word in keyed_vectors and word not in excluded
    ]
    if not usable:
        return np.zeros(keyed_vectors.vector_size)
    return np.array([keyed_vectors.get_vector(word) for word in usable]).mean(0)


#### Para title

In [16]:
# Tokenize every title into a list of sentences (each a list of words).
comp_data['title_tokens'] = comp_data['title'].map(tokenizer)

nltk.download('stopwords')
STOP_WORDS_SP = set(stopwords.words('spanish'))

# Flatten the per-title sentence lists into one training corpus. It is built
# once and reused for both the vocabulary scan and the training pass.
title_corpus = [
    sentence
    for sentences in comp_data['title_tokens'].values
    for sentence in sentences
]

# Word2Vec model (skip-gram, since sg=1).
# `seed` is the notebook-level seed, which was previously defined but never
# used. Per the gensim docs, fully deterministic runs additionally need
# workers=1 and a fixed PYTHONHASHSEED.
w2v_title = gensim.models.Word2Vec(vector_size=300,
                                   window=3,
                                   min_count=5,
                                   negative=15,
                                   sample=0.01,
                                   workers=8,
                                   sg=1,
                                   seed=seed)

# Build the vocabulary from the corpus.
w2v_title.build_vocab(title_corpus, progress_per=10000)

# Train the model.
w2v_title.train(title_corpus,
                total_examples=w2v_title.corpus_count,
                epochs=30, report_delay=1)

# One averaged vector per title, expanded into one column per dimension.
# The frame reuses comp_data's index so the later column-wise concat aligns
# row by row, and the column count follows the model instead of a literal 300.
title_embs = comp_data['title_tokens'].map(lambda x: average_vectors(x, w2v_title, STOP_WORDS_SP))
embedding_title_columns = pd.DataFrame(
    title_embs.tolist(),
    index=comp_data.index,
    columns=[f'title_emb_{i}' for i in range(w2v_title.wv.vector_size)],
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaston\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Para category last

In [17]:
# Tokenize every last-level category name into a list of sentences.
comp_data['category_last_tokens'] = comp_data['category_last'].map(tokenizer)

nltk.download('stopwords')
STOP_WORDS_SP = set(stopwords.words('spanish'))

# Flatten the per-row sentence lists into one training corpus. It is built
# once and reused for both the vocabulary scan and the training pass.
category_corpus = [
    sentence
    for sentences in comp_data['category_last_tokens'].values
    for sentence in sentences
]

# Word2Vec model (skip-gram, since sg=1).
# `seed` is the notebook-level seed, which was previously defined but never
# used. Per the gensim docs, fully deterministic runs additionally need
# workers=1 and a fixed PYTHONHASHSEED.
w2v_category = gensim.models.Word2Vec(vector_size=300,
                                      window=3,
                                      min_count=5,
                                      negative=15,
                                      sample=0.01,
                                      workers=8,
                                      sg=1,
                                      seed=seed)

# Build the vocabulary from the corpus.
w2v_category.build_vocab(category_corpus, progress_per=10000)

# Train the model. total_examples must be this model's own corpus count; the
# previous code passed the title model's count (copy-paste slip).
w2v_category.train(category_corpus,
                   total_examples=w2v_category.corpus_count,
                   epochs=30, report_delay=1)

# One averaged vector per category, expanded into one column per dimension.
# The frame is built from category_embs; the previous code reused title_embs,
# so the "category" columns were a copy of the title embeddings.
# It reuses comp_data's index so the later column-wise concat aligns row by row.
category_embs = comp_data['category_last_tokens'].map(lambda x: average_vectors(x, w2v_category, STOP_WORDS_SP))
embedding_category_columns = pd.DataFrame(
    category_embs.tolist(),
    index=comp_data.index,
    columns=[f'category_emb_{i}' for i in range(w2v_category.wv.vector_size)],
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaston\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Remove the raw text and token columns, then append both embedding frames
# column-wise.
text_columns = ['title_tokens', 'category_last_tokens', 'title', 'category_last']
comp_data.drop(columns=text_columns, inplace=True)
comp_data = pd.concat(
    [comp_data, embedding_title_columns, embedding_category_columns],
    axis=1,
)

In [19]:
# One-hot encode the listed columns: append the dummy columns, then remove the
# original columns.
cols_to_encode = ['category_first', 'listing_type_id', 'logistic_type', 'platform']
comp_data_encoded = pd.get_dummies(comp_data[cols_to_encode])
comp_data = pd.concat([comp_data, comp_data_encoded], axis=1).drop(columns=cols_to_encode)

In [20]:
# Before model training, turn boolean values into integers. A value mapping is
# used instead of a dtype cast so that NaNs do not cause problems.
bool_to_int = {False: 0, True: 1}
comp_data = comp_data.replace(bool_to_int)

In [21]:
# Write the processed frame to CSV. No `index` argument is passed, so the
# pandas default applies and the row index is written as well.
comp_data.to_csv("data/2609_data_w2v.csv")