In [35]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import uuid

from joblib import dump, load

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import balanced_accuracy_score

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [36]:
# Snowball stemmers, one per language, built once at module level so the
# tokenizer functions below reuse them instead of re-creating one per document.
spanish_stemmer, portuguese_stemmer = (
    SnowballStemmer(language) for language in ('spanish', 'portuguese')
)

def spanish_stemmer_tokenizer(doc):
    """Split ``doc`` into word tokens and reduce each one to its Spanish stem.

    Tokenization is done with NLTK's ``word_tokenize``; stemming uses the
    module-level ``spanish_stemmer``.

    Args:
        doc: Text to tokenize.

    Returns:
        A list with one stemmed token per token produced by ``word_tokenize``,
        in the original order.
    """
    # NOTE(review): this function is not called directly in this notebook;
    # presumably the joblib-loaded Spanish vectorizer references it by name,
    # in which case its name and behaviour must stay as they were at training
    # time -- confirm against the training notebook.
    stems = []
    for word in word_tokenize(doc):
        stems.append(spanish_stemmer.stem(word))
    return stems

def portuguese_stemmer_tokenizer(doc):
    """Split ``doc`` into word tokens and reduce each one to its Portuguese stem.

    Tokenization is done with NLTK's ``word_tokenize``; stemming uses the
    module-level ``portuguese_stemmer``.

    Args:
        doc: Text to tokenize.

    Returns:
        A list with one stemmed token per token produced by ``word_tokenize``,
        in the original order.
    """
    # NOTE(review): this function is not called directly in this notebook;
    # presumably the joblib-loaded Portuguese vectorizer references it by name,
    # in which case its name and behaviour must stay as they were at training
    # time -- confirm against the training notebook.
    stems = []
    for word in word_tokenize(doc):
        stems.append(portuguese_stemmer.stem(word))
    return stems


def my_preprocessor(doc):
    """Strip ASCII digits and punctuation from ``doc`` and lowercase the rest.

    Args:
        doc: Text to clean.

    Returns:
        ``doc`` with every character matched by the class below removed, then
        lowercased. Whitespace and non-ASCII letters are left in place.
    """
    # The class is assembled from the raw ``string.punctuation`` text, so the
    # backslash it contains escapes the ``]`` that follows it. As a result a
    # literal backslash is NOT part of the class and survives preprocessing.
    # NOTE(review): left as-is on purpose -- changing what gets stripped could
    # diverge from the preprocessing the persisted vectorizers were fitted
    # with; confirm before "fixing".
    unwanted_chars = '[0-9' + string.punctuation + ']'
    cleaned = re.sub(unwanted_chars, '', doc)
    return cleaned.lower()

In [37]:
# Read the persisted vectorizer/classifier pair for each language.
# joblib files are pickle-based: loading one can execute arbitrary code, so
# only load artifacts that come from a trusted source.
spanish_vectorizer, spanish_classifier = (
    load(artifact_path)
    for artifact_path in (
        'models/spanish_vectorizer_ab7462a2182a438694b2dc8bf9989dcd.joblib',
        'models/spanish_classifier_ab7462a2182a438694b2dc8bf9989dcd.joblib',
    )
)

# NOTE(review): unlike the Spanish pair, the Portuguese vectorizer and
# classifier file names carry different ids -- confirm that this classifier was
# actually fitted on features produced by this vectorizer.
portuguese_vectorizer, portuguese_classifier = (
    load(artifact_path)
    for artifact_path in (
        'models/portuguesevectorizer_6786acbe757d4d1581bf2afccf7a803c.joblib',
        'models/portugueseclassifier_9f0906a94b4545aa8ebe4aa7c277ae2f_350.joblib',
    )
)

In [38]:
# Load the test set and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='data/test.csv')
test.head(n=5)

Unnamed: 0,id,title,language
0,0,Kit Maternidade Bolsa-mala Baby/bebe Vinho Men...,portuguese
1,1,Trocador De Fraldas Fisher Price Feminino Rosa...,portuguese
2,2,Motor Ventoinha - Fiat Idea / Palio 1.8 - A 04...,portuguese
3,3,Amortecedor Mola Batente D Dir New Civic 14 - ...,portuguese
4,4,Cadeirinha De Carro Bebê Princesa Princess 9 A...,portuguese


In [39]:
# Split the test rows by their `language` value so each subset can be handled
# separately. Rows whose language is neither 'portuguese' nor 'spanish' end up
# in neither subset.
test_portuguese = test.loc[test['language'] == 'portuguese']
test_spanish = test.loc[test['language'] == 'spanish']

In [40]:
# Turn the raw titles into feature matrices with the already-fitted
# vectorizers (transform only -- nothing is re-fitted on the test data).
test_features_spanish = spanish_vectorizer.transform(
    test_spanish.loc[:, 'title']
)
test_features_portuguese = portuguese_vectorizer.transform(
    test_portuguese.loc[:, 'title']
)

In [41]:
# Predict a category for every title with the classifier of its language.
predictions_spanish = spanish_classifier.predict(test_features_spanish)
predictions_portuguese = portuguese_classifier.predict(test_features_portuguese)

# Key each prediction by the dataset's own `id` column. The positional
# DataFrame index only matches `id` by coincidence (when the file happens to
# have id == row number), so relying on it could silently mislabel rows.
# `.to_numpy()` drops the pandas index so values pair up with the predictions
# purely by position. `set_index` is chained instead of using `inplace=True`,
# which keeps this cell safe to re-run.
df_spanish = pd.DataFrame(
    data={'id': test_spanish['id'].to_numpy(), 'category': predictions_spanish}
).set_index('id')
df_portuguese = pd.DataFrame(
    data={'id': test_portuguese['id'].to_numpy(), 'category': predictions_portuguese}
).set_index('id')
df_spanish.head()


Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
9,HARD_DRIVES_AND_SSDS
10,FOOD_SLICERS
14,KITCHEN_TOWELS
15,MUSICAL_KEYBOARDS
19,TURNTABLES


In [42]:
# Preview the first five Portuguese predictions (indexed by `id`).
df_portuguese.head(n=5)

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
0,DIAPER_BAGS
1,BABY_PLAYARDS
2,ENGINE_COOLING_FAN_SHROUDS
3,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4,BABY_CAR_SEATS


In [43]:
# Combine both languages into a single frame ordered by `id`.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and yields the same rows in the same
# order. Sorting is chained rather than done in place so re-running the cell
# always rebuilds `df` from its inputs.
df = pd.concat([df_spanish, df_portuguese]).sort_index()
df.head()

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
0,DIAPER_BAGS
1,BABY_PLAYARDS
2,ENGINE_COOLING_FAN_SHROUDS
3,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4,BABY_CAR_SEATS


In [44]:
# Write the combined predictions to disk. The frame's index is written as the
# first CSV column (pandas' default), followed by `category`.
df.to_csv(path_or_buf='output/submission_2.csv')