In [None]:
import pandas as pd
import csv
import gzip

from datetime import datetime
from gensim.corpora import Dictionary

from tqdm import tqdm_notebook

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import balanced_accuracy_score

In [None]:
# Directory (relative to the working directory) holding the tokenized parquet
# files that the loading cells read.
DATA_DIR = "../data/meli"

In [None]:
# Read the tokenized training parquet, printing a timestamp immediately before
# and after the read so the load time can be read off the cell output.
train_path = DATA_DIR + "/train_tokenized.parquet"

print(f"{datetime.now()}")
dataset_train = pd.read_parquet(train_path)
print(f"{datetime.now()}")

# Preview the first rows as the cell's rich output.
dataset_train.head()

In [None]:
# Partition the training frame by language (Spanish first, then Portuguese).
es_df, pt_df = (
    dataset_train.loc[dataset_train["language"] == lang]
    for lang in ("spanish", "portuguese")
)

# Within each language, separate rows by the value of the `split` column.
es_df_train, es_df_dev = (
    es_df.loc[es_df["split"] == part] for part in ("train", "dev")
)

pt_df_train, pt_df_dev = (
    pt_df.loc[pt_df["split"] == part] for part in ("train", "dev")
)

In [None]:
# Read the tokenized test parquet, bracketing the read with timestamps so the
# load time is visible in the cell output.
test_path = DATA_DIR + "/test_tokenized.parquet"

print(f"{datetime.now()}")
dataset_test = pd.read_parquet(test_path)
print(f"{datetime.now()}")

# Preview the first rows as the cell's rich output.
dataset_test.head()

In [None]:
# Partition the test frame by language, mirroring the training partition.
es_df_test = dataset_test.loc[dataset_test["language"] == "spanish"]
pt_df_test = dataset_test.loc[dataset_test["language"] == "portuguese"]

In [None]:
# TF-IDF features for the Spanish training rows.
# The callable analyzer replaces scikit-learn's own preprocessing/tokenization:
# each `words` entry is converted with `.tolist()` and used as the token list
# (presumably a numpy array of pre-tokenized words — confirm against the parquet).
es_vec = TfidfVectorizer(
    input="content",
    analyzer=lambda tokens: tokens.tolist(),
    max_features=20000,  # cap on vocabulary size
    min_df=2,            # drop tokens seen in fewer than two documents
)
es_df_train_tfidf = es_vec.fit_transform(es_df_train["words"])

In [None]:
# Project the Spanish dev rows onto the vocabulary/IDF weights fitted on train.
es_df_dev_tfidf = es_vec.transform(es_df_dev["words"])

In [None]:
# Linear classifier trained with SGD and hinge loss on the Spanish TF-IDF
# features, then scored on the dev split with balanced accuracy.
es_model = SGDClassifier(loss="hinge", random_state=42, n_jobs=-1, verbose=10)
es_model.fit(es_df_train_tfidf, es_df_train["category"])

es_dev_pred = es_model.predict(es_df_dev_tfidf)
print(balanced_accuracy_score(es_df_dev["category"], es_dev_pred))

In [None]:
# NOTE(review): this cell repeats the SGD/hinge training cell directly above it
# with the same features, labels and random_state — presumably left over from
# an earlier experiment; confirm whether it can be dropped.
es_model = SGDClassifier(
    n_jobs=-1,
    loss="hinge",
    verbose=10,
    random_state=42,
)
es_model.fit(es_df_train_tfidf, es_df_train["category"])

es_dev_score = balanced_accuracy_score(
    es_df_dev["category"], es_model.predict(es_df_dev_tfidf)
)
print(es_dev_score)

In [None]:
# Print the Spanish dev balanced accuracy of the current `es_model`.
# NOTE(review): the training cells already print this same score — confirm
# whether this standalone cell is still needed.
print(
    balanced_accuracy_score(
        y_true=es_df_dev["category"],
        y_pred=es_model.predict(es_df_dev_tfidf),
    )
)

In [None]:
# Bag-of-words (raw count) features for the Spanish training rows, using the
# same vocabulary limits as the TF-IDF vectorizer.
# The callable analyzer bypasses scikit-learn's tokenization and uses each
# `words` entry (converted via `.tolist()`) directly as the token list.
es_vec_1 = CountVectorizer(
    input="content",
    analyzer=lambda tokens: tokens.tolist(),
    max_features=20000,  # cap on vocabulary size
    min_df=2,            # drop tokens seen in fewer than two documents
)
es_df_train_bow = es_vec_1.fit_transform(es_df_train["words"])

In [None]:
# Count-vectorize the Spanish dev rows with the vocabulary fitted on train.
es_df_dev_bow = es_vec_1.transform(es_df_dev["words"])

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes baseline on the Spanish bag-of-words counts.
model = MultinomialNB()
model.fit(es_df_train_bow, es_df_train.category)

# Score on the dev split with the same metric as the SGD cells so the
# Naive Bayes baseline can be compared against them.
print(balanced_accuracy_score(es_df_dev.category, model.predict(es_df_dev_bow)))

# Portuguese

In [None]:
# TF-IDF features for the Portuguese training rows, configured like the
# Spanish vectorizer.
# The callable analyzer bypasses scikit-learn's tokenization and uses each
# `words` entry (converted via `.tolist()`) directly as the token list.
pt_vec = TfidfVectorizer(
    input="content",
    analyzer=lambda tokens: tokens.tolist(),
    max_features=20000,  # cap on vocabulary size
    min_df=2,            # drop tokens seen in fewer than two documents
)
pt_df_train_tfidf = pt_vec.fit_transform(pt_df_train["words"])

In [None]:
# Project the Portuguese dev rows onto the vocabulary/IDF weights fitted on train.
pt_df_dev_tfidf = pt_vec.transform(pt_df_dev["words"])

In [None]:
# Logistic-regression classifier trained with SGD on the Portuguese TF-IDF features.
# The loss was previously spelled "log"; that alias was deprecated in
# scikit-learn 1.1 and removed in 1.3. "log_loss" names the same loss and
# requires scikit-learn >= 1.1.
pt_model = SGDClassifier(n_jobs=-1, loss="log_loss", verbose=10, random_state=42)

pt_model.fit(pt_df_train_tfidf, pt_df_train.category)
# Balanced accuracy on the training rows first, then on the dev rows; the gap
# between the two indicates how much the model over-fits.
print(balanced_accuracy_score(pt_df_train.category, pt_model.predict(pt_df_train_tfidf)))
print(balanced_accuracy_score(pt_df_dev.category, pt_model.predict(pt_df_dev_tfidf)))