In [1]:
import pandas as pd
import tensorflow as tf

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [2]:
# Base directory (relative path) under which the meli/ parquet inputs are read
# and the derived parquet/CSV outputs are written.
DATA_DIR = "../data"

In [3]:
# Read the train_reliable parquet file and preview its first rows.
train_reliable_path = DATA_DIR + "/meli/train_reliable.parquet"
train_df = pd.read_parquet(train_reliable_path)
train_df.head()

Unnamed: 0,title,label_quality,language,words,pos,split,category
16,Projeto Unidade Hidraulica 3000 Psi,reliable,portuguese,"[projeto, unidade, hidraulica, 3000, psi]","[NOUN, NOUN, ADJ, NUM, NOUN]",train,AIR_COMPRESSORS
25,Tapete Capacho 120x60 Churrasqueira + Frete Gr...,reliable,portuguese,"[tapete, capacho, 120x60, churrasqueira, +, fr...","[NOUN, VERB, NUM, ADJ, PROPN, ADJ, ADJ]",train,CARPETS
57,Camiseta Raglan Crepúsculo Jealous Baby Look,reliable,portuguese,"[camiseta, raglan, crepúsculo, jealous, baby, ...","[VERB, VERB, ADJ, NOUN, ADJ, NOUN]",train,T_SHIRTS
82,Unidade De Dvd Gravador Com Defeito Apenas Par...,reliable,portuguese,"[unidade, de, dvd, gravador, com, defeito, ape...","[NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADV, ADP, NOUN]",train,DVD_RECORDERS
99,Fan Dell R320 / R420 0hr6c0 - 24h,reliable,portuguese,"[fan, dell, r320, /, r420, 0hr6c0, -, 24h]","[ADV, VERB, NOUN, PUNCT, NOUN, NUM, PUNCT, NOUN]",train,DESKTOP_COMPUTER_COOLERS_AND_FANS


In [4]:
# Rebuild a single space-separated title string from each row's token sequence.
train_df["normalized_title"] = train_df["words"].map(" ".join)

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris
# Toy dataset bundled with scikit-learn.
# NOTE(review): in the visible cells it is only consumed by the randomized-search
# experiment, not by the title-classification pipeline — confirm it is still wanted.
iris = load_iris()

In [11]:
# Candidate LinearSVC hyper-parameters. Single-element lists ("dual",
# "random_state") pin that value instead of searching over it.
param_grid = {
    "loss": ["hinge", "squared_hinge"],
    "dual": [True],
    "C": [2.0, 1.0, 0.5, 0.25],
    "class_weight": [None, "balanced"],
    "max_iter": [500, 1000, 1500, 2000],
    "random_state": [42]
}

# Sample 10 combinations from the grid above and score each one by balanced
# accuracy. random_state is fixed on the search itself so that the same 10
# candidates are drawn on every run; without it the sampled candidates (and
# therefore the selected best estimator) can differ between executions.
search = RandomizedSearchCV(
    LinearSVC(),
    param_grid,
    n_iter=10,
    scoring="balanced_accuracy",
    random_state=42,
)
search.fit(iris["data"], iris["target"])



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                       fit_intercept=True, intercept_scaling=1,
                                       loss='squared_hinge', max_iter=1000,
                                       multi_class='ovr', penalty='l2',
                                       random_state=None, tol=0.0001,
                                       verbose=0),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'C': [2.0, 1.0, 0.5, 0.25],
                                        'class_weight': [None, 'balanced'],
                                        'dual': [True],
                                        'loss': ['hinge', 'squared_hinge'],
                                        'max_iter': [500, 1000, 1500, 2000],
                                        'random_state': [42]},
                   pre_dispatch='2*n_jobs

In [5]:
# Write the rows that are in the "train" split and labelled "unreliable" to their
# own parquet file.
# NOTE(review): in the visible cells train_df is read from train_reliable.parquet,
# and this cell's execution count (5) is out of order with its neighbours —
# confirm which frame this filter was meant to run on before re-executing it,
# since it overwrites train_unreliable.parquet.
is_train_split = train_df.split == "train"
is_unreliable = train_df.label_quality == "unreliable"
train_df[is_train_split & is_unreliable].to_parquet(
    DATA_DIR + "/meli/train_unreliable.parquet", index=None)

In [13]:
# (Re)load the train, dev and test frames from parquet; train_df replaces any
# frame of the same name created by earlier cells.
train_df, dev_df, test_df = (
    pd.read_parquet(DATA_DIR + relative_path)
    for relative_path in (
        "/meli/train_reliable.parquet",
        "/meli/dev.parquet",
        "/meli/test.parquet",
    )
)

In [None]:
# Encode category names as integer class ids in a new "target" column.
# The encoder is fitted on the categories of BOTH frames: fitting on dev alone
# makes transform() raise ValueError for any category that appears only in train.
# LabelEncoder keeps its classes sorted, so whenever dev already contains every
# category the resulting ids are the same as with a dev-only fit.
lbl_enc = LabelEncoder()
lbl_enc.fit(pd.concat([dev_df.category, train_df.category]))
dev_df["target"] = lbl_enc.transform(dev_df.category)
train_df["target"] = lbl_enc.transform(train_df.category)

In [None]:
# Per-language subsets of the train and dev frames.
# .copy() makes each subset an independent frame: later cells add a
# "predictions" column to these objects, and assigning a column to a filtered
# slice of another frame triggers pandas' SettingWithCopyWarning.
es_train_df = train_df[train_df.language == "spanish"].copy()
pt_train_df = train_df[train_df.language == "portuguese"].copy()

es_dev_df = dev_df[dev_df.language == "spanish"].copy()
pt_dev_df = dev_df[dev_df.language == "portuguese"].copy()

In [None]:
# Per-language subsets of the test frame.
# .copy() makes each subset an independent frame: later cells add "predictions"
# and "category" columns to these objects, and assigning a column to a filtered
# slice of another frame triggers pandas' SettingWithCopyWarning.
es_test_df = test_df[test_df.language == "spanish"].copy()
pt_test_df = test_df[test_df.language == "portuguese"].copy()

# Spanish

In [None]:
# Spanish stop-word list from the NLTK corpus, as a set.
# NOTE(review): es_sw is not referenced by any other visible cell (the
# CountVectorizer in this section is not given stop_words) — confirm whether
# stop-word filtering was meant to be applied.
es_sw = set(stopwords.words("spanish"))

def token_extractor(tokens):
    """Return the tokens of one document as a plain Python list.

    Passed to ``CountVectorizer`` as its ``analyzer`` callable, so it is called
    once per document and its return value is the document's feature list.

    Args:
        tokens: Token sequence of a single title. Objects exposing ``tolist()``
            (e.g. NumPy arrays — presumably what the parquet-loaded ``words``
            column holds; verify) are converted with it; any other iterable
            such as a list or tuple is materialised with ``list()``.

    Returns:
        list: The tokens in their original order.
    """
    # Array-likes: tolist() also converts NumPy scalars to native Python objects.
    if hasattr(tokens, "tolist"):
        return tokens.tolist()
    # Plain sequences previously failed with AttributeError; accept them as-is.
    return list(tokens)

# Bag-of-words counts for the Spanish titles.
# Because `analyzer` is a callable, CountVectorizer skips its own preprocessing
# and n-gram generation: the `strip_accents="unicode"` and `ngram_range=(1, 2)`
# arguments that used to be passed here had no effect in that mode. They are
# omitted so the call states what is actually computed — counts of the
# pre-tokenised words, keeping terms seen in at least 2 documents and at most
# the 30000 most frequent ones.
# NOTE(review): if accent stripping or bigrams are wanted, they have to be
# produced inside the analyzer callable.
es_count = CountVectorizer(min_df=2, analyzer=token_extractor, max_features=30000)

# The vocabulary is learned from the train and dev titles together.
es_count.fit(list(es_train_df.words) + list(es_dev_df.words))
es_train_cv = es_count.transform(es_train_df.words)
es_dev_cv = es_count.transform(es_dev_df.words)

In [None]:
%%time
# Hinge-loss linear classifier trained with SGD on the Spanish count features.
es_clf = SGDClassifier(
    loss="hinge",
    max_iter=1500,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
es_clf.fit(es_train_cv, es_train_df["target"])

In [None]:
# Predict class ids for the Spanish train and dev features and keep them next
# to the true targets in a "predictions" column.
es_train_pred = es_clf.predict(es_train_cv)
es_dev_pred = es_clf.predict(es_dev_cv)
es_train_df["predictions"] = es_train_pred
es_dev_df["predictions"] = es_dev_pred

In [None]:
# Balanced accuracy on the Spanish train frame first, then on the dev frame.
for scored_df in (es_train_df, es_dev_df):
    print(balanced_accuracy_score(scored_df.target, scored_df.predictions))

In [None]:
# Spanish dev balanced accuracy restricted to rows whose label_quality is "reliable".
es_dev_reliable = es_dev_df[es_dev_df.label_quality == "reliable"]
balanced_accuracy_score(es_dev_reliable.target, es_dev_reliable.predictions)

In [None]:
# Vectorise the Spanish test titles, predict their class ids and map the ids
# back to category names with the fitted label encoder.
es_test_cv = es_count.transform(es_test_df.words)
es_test_pred = es_clf.predict(es_test_cv)
es_test_df["predictions"] = es_test_pred
es_test_df["category"] = lbl_enc.inverse_transform(es_test_pred)
es_test_df.head()

# Portuguese

In [None]:
# Portuguese stop-word list from the NLTK corpus, as a set. This previously
# loaded the "spanish" list, which is the wrong language for this section.
# NOTE(review): pt_sw is not referenced by any other visible cell — confirm
# whether stop-word filtering was meant to be applied to the vectorizer.
pt_sw = set(stopwords.words("portuguese"))

# NOTE(review): this cell redefines the `token_extractor` already defined in the
# Spanish section; consider keeping a single shared definition so the two
# cannot drift apart.
def token_extractor(tokens):
    """Return the tokens of one document as a plain Python list.

    Passed to ``CountVectorizer`` as its ``analyzer`` callable, so it is called
    once per document and its return value is the document's feature list.

    Args:
        tokens: Token sequence of a single title. Objects exposing ``tolist()``
            (e.g. NumPy arrays — presumably what the parquet-loaded ``words``
            column holds; verify) are converted with it; any other iterable
            such as a list or tuple is materialised with ``list()``.

    Returns:
        list: The tokens in their original order.
    """
    # Array-likes: tolist() also converts NumPy scalars to native Python objects.
    if hasattr(tokens, "tolist"):
        return tokens.tolist()
    # Plain sequences previously failed with AttributeError; accept them as-is.
    return list(tokens)

# Bag-of-words counts for the Portuguese titles.
# Because `analyzer` is a callable, CountVectorizer skips its own preprocessing
# and n-gram generation: the `strip_accents="unicode"` and `ngram_range=(1, 2)`
# arguments that used to be passed here had no effect in that mode. They are
# omitted so the call states what is actually computed — counts of the
# pre-tokenised words, keeping terms seen in at least 2 documents and at most
# the 20000 most frequent ones.
# NOTE(review): if accent stripping or bigrams are wanted, they have to be
# produced inside the analyzer callable.
pt_count = CountVectorizer(min_df=2, analyzer=token_extractor, max_features=20000)

# The vocabulary is learned from the train and dev titles together.
pt_count.fit(list(pt_train_df.words) + list(pt_dev_df.words))
pt_train_cv = pt_count.transform(pt_train_df.words)
pt_dev_cv = pt_count.transform(pt_dev_df.words)

In [None]:
%%time
# Linear SVM (library defaults apart from the seed and verbose solver output)
# fitted on the Portuguese count features.
pt_clf = LinearSVC(random_state=42, verbose=10)
pt_clf.fit(pt_train_cv, pt_train_df["target"])

In [None]:
# Predict class ids for the Portuguese train and dev features and keep them
# next to the true targets in a "predictions" column.
pt_train_pred = pt_clf.predict(pt_train_cv)
pt_dev_pred = pt_clf.predict(pt_dev_cv)
pt_train_df["predictions"] = pt_train_pred
pt_dev_df["predictions"] = pt_dev_pred

In [None]:
# Balanced accuracy on the Portuguese train frame first, then on the dev frame.
for scored_df in (pt_train_df, pt_dev_df):
    print(balanced_accuracy_score(scored_df.target, scored_df.predictions))

In [None]:
# Portuguese dev balanced accuracy restricted to rows whose label_quality is "reliable".
pt_dev_reliable = pt_dev_df[pt_dev_df.label_quality == "reliable"]
balanced_accuracy_score(pt_dev_reliable.target, pt_dev_reliable.predictions)

In [None]:
# Vectorise the Portuguese test titles, predict their class ids and map the ids
# back to category names with the fitted label encoder.
pt_test_cv = pt_count.transform(pt_test_df.words)
pt_test_pred = pt_clf.predict(pt_test_cv)
pt_test_df["predictions"] = pt_test_pred
pt_test_df["category"] = lbl_enc.inverse_transform(pt_test_pred)
pt_test_df.head()

# Final results

In [None]:
# Stack the Spanish and Portuguese test predictions, keep the id/title/category
# columns and order the rows by id.
submission = (
    pd.concat([es_test_df, pt_test_df])
    .loc[:, ["id", "title", "category"]]
    .sort_values("id")
)
submission.head()

In [None]:
# Write the id/category pairs to the submission CSV, without the index column.
submission_path = DATA_DIR + "/meli/submission_1.csv"
submission.loc[:, ["id", "category"]].to_csv(submission_path, index=False)