In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import spacy

from os import path
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Directory that holds the input CSV files and receives the tokenized parquet outputs.
DATA_DIR = "../data/meli"

In [None]:
# spaCy pipelines used by process_title: nlp_pt handles rows whose language is
# "portuguese", nlp_es handles every other row.
# NOTE(review): "es" and "pt" look like spaCy v2 shortcut-link names; newer spaCy
# releases presumably require the full model package names -- confirm against
# the installed spaCy version.
nlp_es = spacy.load("es")
nlp_pt = spacy.load("pt")

In [None]:
# Read the raw train and test tables from DATA_DIR (the train file is gzip-compressed).
train_csv_path = path.join(DATA_DIR, "train.csv.gz")
test_csv_path = path.join(DATA_DIR, "test.csv")

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Lowercase every training title and collapse each run of whitespace into a
# single space.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave whitespace untouched.
# The raw string avoids the invalid "\s" escape-sequence warning.
train_data["normalized_title"] = (
    train_data["title"]
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

In [None]:
# Lowercase every test title and collapse each run of whitespace into a
# single space.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave whitespace untouched.
# The raw string avoids the invalid "\s" escape-sequence warning.
test_data["normalized_title"] = (
    test_data["title"]
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

In [None]:
def process_title(row):
    """Tokenize and POS-tag the normalized title of one row.

    Parameters
    ----------
    row
        Object exposing ``language`` and ``normalized_title`` attributes
        (a DataFrame row when used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    list of tuple
        One ``(token text, coarse POS tag)`` pair per token, in document order.
    """
    # Portuguese rows use the Portuguese pipeline; everything else falls back
    # to the Spanish one.
    pipeline = nlp_pt if row.language == "portuguese" else nlp_es

    # The parser and NER components are skipped; only tokens and POS tags are read.
    parsed = pipeline(row.normalized_title, disable=["parser", "ner"])

    return [(token.text, token.pos_) for token in parsed]

In [None]:
def get_list_values(series, column):
    """Lazily project one position out of every element of each record.

    Parameters
    ----------
    series
        Iterable of records, each record being an iterable of indexable items
        (e.g. lists of ``(text, pos)`` tuples).
    column
        Index (or key) extracted from every item.

    Yields
    ------
    list
        For each record, the list of ``item[column]`` values in order.
    """
    yield from ([entry[column] for entry in record] for record in series)

In [None]:
# Tokenize every training title: "tokens" holds the (text, POS) pairs returned
# by process_title for each row.
train_data["tokens"] = train_data.apply(process_title, axis=1)

# Split the pairs into two parallel list columns: token texts and POS tags.
train_data["words"] = list(get_list_values(train_data["tokens"], 0))
train_data["pos"] = list(get_list_values(train_data["tokens"], 1))

In [None]:
# Persist the tokenized training data.
# "split" is only created by the split-assignment cell further down, so on a
# fresh Restart & Run All it does not exist yet; selecting it unconditionally
# raises KeyError. Export only the requested columns that are present: the
# output is unchanged when this cell is re-run after the split is computed.
train_export_columns = ["title", "label_quality", "language", "words", "pos", "split", "category"]
available_export_columns = [
    column for column in train_export_columns if column in train_data.columns
]

train_data[available_export_columns].to_parquet(
    DATA_DIR + "/train_tokenized.parquet", index=None
)

In [None]:
# Tokenize every test title: "tokens" holds the (text, POS) pairs returned
# by process_title for each row.
test_data["tokens"] = test_data.apply(process_title, axis=1)

# Split the pairs into two parallel list columns: token texts and POS tags.
test_data["words"] = list(get_list_values(test_data["tokens"], 0))
test_data["pos"] = list(get_list_values(test_data["tokens"], 1))

In [None]:
# Persist the tokenized test data, keeping only the columns listed here.
test_output_path = DATA_DIR + "/test_tokenized.parquet"
test_export = test_data[["id", "title", "language", "words", "pos"]]
test_export.to_parquet(test_output_path, index=None)

In [None]:
# Row labels of every example whose label quality is "reliable".
is_reliable = train_data["label_quality"] == "reliable"
reliable_indices = train_data.index[is_reliable]

# Categories with at least 5 reliable examples.
# NOTE(review): the threshold presumably exists so the stratified split below
# has enough examples per class -- confirm.
reliable_category_counts = train_data.loc[reliable_indices, "category"].value_counts()
valid_reliable_categories = set(
    reliable_category_counts.index[reliable_category_counts >= 5]
)

# Reliable rows restricted to those categories.
in_valid_category = train_data["category"].isin(valid_reliable_categories)
valid_reliable_indices = train_data.index[is_reliable & in_valid_category]

# Row labels of every example whose label quality is "unreliable".
unreliable_indices = train_data.index[train_data["label_quality"] == "unreliable"]

In [None]:
# Stratified 95/5 split (by category) of the reliable rows kept above.
# The returned arrays are POSITIONS inside the subset, not row labels.
reliable_subset = train_data.loc[valid_reliable_indices]
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=42)
train_index_reliable, dev_index_reliable = next(
    sss.split(reliable_subset, reliable_subset["category"])
)

# Same split, with a fresh splitter and the same seed, for the unreliable rows.
unreliable_subset = train_data.loc[unreliable_indices]
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=42)
train_index_unreliable, dev_index_unreliable = next(
    sss.split(unreliable_subset, unreliable_subset["category"])
)

In [None]:
# Map the positional split results back to train_data row labels and merge the
# reliable and unreliable parts.
# The label Index objects are indexed directly instead of going through
# train_data.loc[...].iloc[...]: that form copied the whole frame four times
# just to read its index. The result is identical because train_data keeps the
# unique default index produced by read_csv.
train_index = np.hstack([
    valid_reliable_indices[train_index_reliable].values,
    unreliable_indices[train_index_unreliable].values,
])

dev_index = np.hstack([
    valid_reliable_indices[dev_index_reliable].values,
    unreliable_indices[dev_index_unreliable].values,
])

In [None]:
# Tag every row with the split it belongs to.
train_data.loc[train_index, "split"] = "train"
train_data.loc[dev_index, "split"] = "dev"

# Rows in neither index array (reliable rows whose category has fewer than 5
# reliable examples, and rows with any other label quality) go to dev.
# Assign the result back rather than calling fillna(inplace=True) on
# train_data.split: that chained in-place call acts on a temporary Series,
# warns on recent pandas and has no effect under Copy-on-Write.
train_data["split"] = train_data["split"].fillna("dev")

In [None]:
# Number of rows per category over the whole training table.
train_data["category"].value_counts()

In [None]:
# Number of rows per category inside the dev split.
train_data.loc[train_data["split"] == "dev", "category"].value_counts()

In [None]:
# Size of the dev split broken down by language and label quality.
dev_rows = train_data[train_data["split"] == "dev"]
dev_rows.groupby(["language", "label_quality"]).size()

In [None]:
# Write every column of train_data (now including "split") to
# train_tokenized.parquet in DATA_DIR. This is the same file the earlier
# train export cell targets, so that file is replaced.
final_train_path = path.join(DATA_DIR, "./train_tokenized.parquet")
train_data.to_parquet(final_train_path, index=None)