In [1]:
import os
import re
import string
import uuid
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from joblib import dump, load
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK resources used below (already up to date when present locally).
nltk.download('punkt')  # tokenizer data needed for tokenization
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
train_spanish = train[train.language == 'spanish']
train_portuguese = train[train.language == 'portuguese']

In [4]:
# Keep only the rows whose label_quality is 'reliable', separately for each language.
train_spanish_reliable = train_spanish.loc[train_spanish['label_quality'] == 'reliable']
train_portuguese_reliable = train_portuguese.loc[train_portuguese['label_quality'] == 'reliable']

In [5]:
def filter_small_categories(df):
    """Return the rows of ``df`` whose category has more than one non-null title.

    Titles are counted per ``category`` value (missing titles are not counted),
    and only rows belonging to a category with a count above 1 are kept.
    The input frame is not modified.
    """
    titles_per_category = df.groupby(['category'])['title'].count()
    kept_categories = titles_per_category.index[titles_per_category > 1]
    return df[df['category'].isin(kept_categories)]

In [13]:
# Characters stripped from every document: ASCII digits, a few Latin-1 symbols and all
# ASCII punctuation. The punctuation is passed through re.escape so that `\`, `]`, `^`
# and `-` are literal members of the character class; unescaped, `\]` was read as an
# escaped `]` and the backslash itself was never removed.
_STRIP_CHARS_RE = re.compile('[0-9¡¨ª®°´·º»½¿ø' + re.escape(string.punctuation) + ']')


def my_preprocessor(doc):
    """Remove digits, punctuation and selected symbols from ``doc`` and lowercase it.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        ``doc`` with every character matched by ``_STRIP_CHARS_RE`` deleted
        (whitespace is left untouched) and the remainder lowercased.
    """
    return _STRIP_CHARS_RE.sub('', doc).lower()

# Module-level Snowball stemmers, one per language, shared by the tokenizer functions
# and by train() when it stems the stop-word lists.
spanish_stemmer = SnowballStemmer('spanish')
portuguese_stemmer = SnowballStemmer('portuguese')

def spanish_stemmer_tokenizer(doc):
    """Split ``doc`` with ``word_tokenize`` and stem every token with ``spanish_stemmer``.

    Returns the stems as a list, in token order.
    """
    return list(map(spanish_stemmer.stem, word_tokenize(doc)))

def portuguese_stemmer_tokenizer(doc):
    """Split ``doc`` with ``word_tokenize`` and stem every token with ``portuguese_stemmer``.

    Returns the stems as a list, in token order.
    """
    return list(map(portuguese_stemmer.stem, word_tokenize(doc)))

In [14]:
def train(titles, labels, language='spanish', max_features=8000, min_df=3, max_df=0.7,
          epochs=400, batches=200, checkpoint_every=50):
    """Fit a bag-of-words vectorizer and a MultinomialNB classifier on titles.

    The data is split 90/10 (stratified on ``labels``, ``random_state=42``). A
    ``CountVectorizer`` is fitted on the training titles and saved, then a
    ``MultinomialNB`` is updated with ``partial_fit`` over ``batches`` slices of
    the training matrix for ``epochs`` passes. Balanced accuracy on the held-out
    split is printed at each checkpoint and at the end.

    Artifacts are written with joblib under ``models/`` (vectorizer and final
    classifier) and ``models/checkpoint/`` (intermediate classifiers); the file
    names contain ``language`` and a random hex run id.

    Parameters
    ----------
    titles, labels : array-like (called with pandas Series in this notebook)
        Documents and their categories, aligned element by element.
    language : {'spanish', 'portuguese'}
        Selects the stemmer, the tokenizer and the NLTK stop-word list.
    max_features, min_df, max_df
        Forwarded to ``CountVectorizer``.
    epochs : int
        Number of passes over the training batches (must be >= 1).
    batches : int
        Number of slices the training matrix is cut into per pass (must be >= 1).
    checkpoint_every : int
        Evaluate and save a checkpoint whenever ``epoch % checkpoint_every == 0``;
        a falsy value disables checkpoints.

    Raises
    ------
    ValueError
        If ``language`` is not supported, or ``epochs``/``batches`` is below 1.
    """
    # Dispatch table instead of two copy-pasted branches; it references the module-level
    # tokenizer functions so the fitted vectorizer keeps a tokenizer that joblib can
    # serialize by name.
    language_tools = {
        'spanish': (spanish_stemmer, spanish_stemmer_tokenizer),
        'portuguese': (portuguese_stemmer, portuguese_stemmer_tokenizer),
    }
    if language not in language_tools:
        # Previously every non-Spanish language silently got the Portuguese stemmer.
        raise ValueError('Unsupported language %r; expected one of %s'
                         % (language, sorted(language_tools)))
    if epochs < 1 or batches < 1:
        raise ValueError('epochs and batches must both be >= 1')
    stemmer, tokenizer = language_tools[language]

    titles_train, titles_test, y_train, y_test = train_test_split(
        titles, labels, test_size=0.1, random_state=42, stratify=labels)
    print(titles_train.shape, titles_test.shape, y_train.shape, y_test.shape)

    run_id = uuid.uuid4().hex

    # Create the output directories up front so a missing folder cannot abort the run
    # after the expensive fitting work has already been done.
    os.makedirs('models/checkpoint', exist_ok=True)

    # The stop words are stemmed with the same stemmer the tokenizer applies.
    stop_words = [stemmer.stem(word) for word in stopwords.words(language)]
    # NOTE(review): scikit-learn documents that a custom `preprocessor` overrides the
    # strip_accents/lowercase stage, so strip_accents='unicode' probably has no effect
    # here — confirm before relying on it.
    vectorizer = CountVectorizer(max_features=max_features, max_df=max_df, min_df=min_df,
                                 strip_accents='unicode', stop_words=stop_words,
                                 tokenizer=tokenizer, preprocessor=my_preprocessor)

    vectorizer.fit(titles_train)
    dump(vectorizer, 'models/' + language + '_vectorizer_' + run_id + '.joblib')
    print('Saved vectorizer')

    # get_feature_names() was removed in newer scikit-learn releases; prefer the
    # replacement when it exists and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tokens = vectorizer.get_feature_names_out().tolist()
    else:
        tokens = list(vectorizer.get_feature_names())
    # Show a preview only; printing the full vocabulary floods the notebook output.
    print(len(tokens), 'features, first 100:', tokens[:100])

    X_train = vectorizer.transform(titles_train)
    X_test = vectorizer.transform(titles_test)

    # Plain array so that [start:end] is unambiguously positional and lines up with the
    # rows of X_train (the split leaves a pandas Series with a shuffled index).
    y_train_values = np.asarray(y_train)
    samples = X_train.shape[0]

    classifier = MultinomialNB()
    categories = pd.unique(y_train_values)

    # NOTE(review): MultinomialNB.partial_fit adds to running counts, so repeated passes
    # over the same rows look equivalent to shrinking the smoothing `alpha` by the number
    # of epochs — confirm, and consider a single fit with a tuned alpha instead.
    for epoch in range(epochs):
        for batch in range(batches):
            start = (samples * batch) // batches
            end = (samples * (batch + 1)) // batches
            if start == end:
                # Happens when batches > samples; partial_fit rejects empty input.
                continue
            classifier.partial_fit(X_train[start:end], y_train_values[start:end],
                                   classes=categories)
        if checkpoint_every and epoch % checkpoint_every == 0:
            y_predicted = classifier.predict(X_test)
            score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)
            print('finished epoch: ', epoch, 'with score:', score)
            # File name now has the underscore after the language, matching the
            # vectorizer and final classifier names.
            dump(classifier, 'models/checkpoint/' + language + '_classifier_' + run_id
                 + '_' + str(epoch) + '.joblib')

    y_predicted = classifier.predict(X_test)
    score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)
    print('Final score: ', score)

    dump(classifier, 'models/' + language + '_classifier_' + run_id + '.joblib')

# Portuguese

In [15]:
# filtered = filter_small_categories(train_portuguese_reliable)
# train(filtered['title'], filtered['category'], language='portuguese')

# Spanish

In [16]:
# Spanish run: drop categories with a single title before the stratified split in train().
filtered = filter_small_categories(train_spanish_reliable)
train(titles=filtered['title'], labels=filtered['category'], language='spanish')

(441810,) (49091,) (441810,) (49091,)


  'stop_words.' % sorted(inconsistent))


Saved vectorizer
['aa', 'aaa', 'ab', 'abaj', 'abb', 'abc', 'abdom', 'abdominal', 'abec', 'abercrombi', 'abershop', 'abertur', 'abiert', 'ableton', 'abrazader', 'abre', 'abrig', 'abril', 'abrir', 'abroj', 'abs', 'absolut', 'abu', 'ac', 'acab', 'acaci', 'academy', 'acamp', 'acanal', 'acapulc', 'acc', 'accent', 'acces', 'accesori', 'access', 'accion', 'accord', 'acdc', 'acdelc', 'ace', 'aceit', 'acept', 'acer', 'acerbis', 'acet', 'acetat', 'acid', 'acme', 'acne', 'acod', 'acolch', 'acondicion', 'acopl', 'acord', 'acordeon', 'acordon', 'acqu', 'acquapiu', 'acril', 'act', 'action', 'activ', 'actuador', 'actualiz', 'acuarel', 'acuari', 'acuat', 'acust', 'acusticaelectric', 'ad', 'adapt', 'adat', 'addict', 'addon', 'adducci', 'adel', 'adelgaz', 'adhes', 'adicional', 'adid', 'adizer', 'adm', 'admiral', 'admision', 'adn', 'adolescent', 'adopcion', 'adorn', 'adp', 'adult', 'advanc', 'advantag', 'advantix', 'adventur', 'advocat', 'ae', 'aer', 'aere', 'aeroccin', 'aerograf', 'af', 'afa', 'afeit', 

finished epoch:  0 with score: 0.4410517953478592




finished epoch:  50 with score: 0.7167710107871551




finished epoch:  100 with score: 0.7302780177100964




finished epoch:  150 with score: 0.7345756175982333




finished epoch:  200 with score: 0.7374886347162098




finished epoch:  250 with score: 0.7388537833524548




finished epoch:  300 with score: 0.7395714410624257




finished epoch:  350 with score: 0.7403055433794286
Final score:  0.7411976797722051


