In [1]:
import os
import re
import string
import time
import uuid
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from joblib import dump, load
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from unidecode import unidecode

# Fetch the NLTK resources this notebook relies on (no-op when already present,
# as the captured output below shows).
# 'punkt': tokenizer data; presumably what word_tokenize needs -- confirm against the installed nltk version.
nltk.download('punkt') ## algorithm for tokenization
# 'stopwords': per-language stop-word lists, read via stopwords.words(language) when building the vectorizer.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the raw training set and attach a per-row sample weight:
# rows labelled 'reliable' count fully, every other row is down-weighted.
train = pd.read_csv('data/train.csv')
unreliable_weight = 0.4
train['weight'] = np.where(train['label_quality'] == 'reliable', 1., unreliable_weight)

In [3]:
# One frame per language; each language gets its own vectorizer and classifier later on.
train_spanish = train.loc[train['language'] == 'spanish']
train_portuguese = train.loc[train['language'] == 'portuguese']

In [4]:
# Subsets restricted to rows whose label_quality is 'reliable'.
train_spanish_reliable = train_spanish.loc[train_spanish['label_quality'] == 'reliable']
train_portuguese_reliable = train_portuguese.loc[train_portuguese['label_quality'] == 'reliable']

In [5]:
def filter_small_categories(df):
    """Keep only the rows whose category has more than one non-null title.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'category' and 'title'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index preserved) belonging to categories
        with at least two non-null titles.
    """
    # count() ignores missing titles, so a category with a single real title
    # plus null ones is still considered too small.
    titles_per_category = df.groupby(['category'])['title'].count()
    large_enough = titles_per_category[titles_per_category > 1].index
    return df[df['category'].isin(large_enough)]

In [6]:
def my_preprocessor(doc):
    """Normalise a raw title before tokenization.

    The text is transliterated with ``unidecode``, then digits, ASCII
    punctuation and a handful of Latin-1 symbols are deleted (not replaced by
    a space), and the result is lower-cased.
    """
    transliterated = unidecode(doc)
    unwanted_chars = '[0-9¡¨ª®°´·º»½¿' + string.punctuation + ']'
    cleaned = re.sub(unwanted_chars, '', transliterated)
    return cleaned.lower()

# Module-level Snowball stemmers, one per language handled in this notebook.
# They are built once here and reused by the tokenizer functions and by the
# stop-word stemming inside train().
spanish_stemmer = SnowballStemmer('spanish')
portuguese_stemmer = SnowballStemmer('portuguese')

def spanish_stemmer_tokenizer(doc):
    """Tokenize ``doc`` with ``word_tokenize`` and return the Spanish stem of every token as a list."""
    return list(map(spanish_stemmer.stem, word_tokenize(doc)))

def portuguese_stemmer_tokenizer(doc):
    """Tokenize ``doc`` with ``word_tokenize`` and return the Portuguese stem of every token as a list."""
    return list(map(portuguese_stemmer.stem, word_tokenize(doc)))

In [7]:
def elapsed_time(start, message):
    """Print ``message`` followed by the seconds elapsed since ``start``.

    ``start`` is a timestamp previously obtained from ``time.time()``.
    Nothing is returned.
    """
    print(message + ': ', time.time() - start)

In [8]:
def train(titles, labels, weights, language='spanish', batches=200, epochs=400, max_features=8000,
          min_df=3, max_df=0.7, test_size=0.1):
    """Fit a bag-of-words vectorizer and a MultinomialNB classifier for one language.

    The data is split (stratified on ``labels``, fixed ``random_state=42``),
    a ``CountVectorizer`` is fitted on the training titles, and the classifier
    is trained with ``partial_fit`` over ``batches`` contiguous slices of the
    training matrix, repeated ``epochs`` times. Every 5th epoch the weighted
    balanced accuracy on the held-out split is printed and a checkpoint is
    written.

    Parameters
    ----------
    titles, labels, weights : pandas.Series
        Row-aligned texts, target categories and sample weights. Every label
        needs at least two rows because the split is stratified.
    language : str
        'spanish' selects the Spanish stemmer/tokenizer; any other value
        selects the Portuguese ones. The value is also passed to
        ``stopwords.words`` and used in the output file names.
    batches : int
        Number of slices the training matrix is cut into per epoch.
    epochs : int
        Number of passes over all batches.
    max_features, min_df, max_df
        Forwarded unchanged to ``CountVectorizer``.
    test_size : float
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are printed; artifacts are written under ``models/``:
        ``<language>_vectorizer_<run_id>.joblib``,
        ``<language>_classifier_<run_id>.joblib`` and
        ``checkpoint/<language>_classifier_<run_id>_<epoch>.joblib``.
    """
    # Create the output folders before any expensive work: a missing directory
    # would otherwise only surface at the first dump(), after the vectorizer
    # has already been fitted.
    os.makedirs('models/checkpoint', exist_ok=True)

    titles_train, titles_test, y_train, y_test, w_train, w_test = train_test_split(
        titles, labels, weights, test_size=test_size, random_state=42, stratify=labels)
    print(titles_train.shape, titles_test.shape, y_train.shape, y_test.shape)

    run_id = uuid.uuid4().hex
    print('Run id: ', run_id)

    # Anything that is not 'spanish' falls back to the Portuguese tooling.
    if language == 'spanish':
        stemmer, tokenizer = spanish_stemmer, spanish_stemmer_tokenizer
    else:
        stemmer, tokenizer = portuguese_stemmer, portuguese_stemmer_tokenizer

    # Stop words go through the same transliterate + stem steps as the tokens
    # so that they are compared in the same form.
    stop_words = [stemmer.stem(unidecode(word)) for word in stopwords.words(language)]
    vectorizer = CountVectorizer(max_features=max_features, max_df=max_df, min_df=min_df,
                                 strip_accents='unicode', stop_words=stop_words,
                                 tokenizer=tokenizer, preprocessor=my_preprocessor)

    start_time = time.time()
    vectorizer.fit(titles_train)
    elapsed_time(start_time, 'Fit vectorizer')

    dump(vectorizer, 'models/' + language + '_vectorizer_' + run_id + '.joblib')

    # The 'Word2Vec' labels below are kept for log continuity; the step being
    # timed is the CountVectorizer transform.
    start_time = time.time()
    X_train = vectorizer.transform(titles_train)
    elapsed_time(start_time, 'Word2Vec X_train')

    start_time = time.time()
    X_test = vectorizer.transform(titles_test)
    elapsed_time(start_time, 'Word2Vec X_test')

    samples = X_train.shape[0]
    print('samples per batch: ', samples//batches)

    classifier = MultinomialNB()

    # partial_fit needs the full label set on its first call.
    categories = y_train.unique()

    # NOTE(review): every epoch feeds the same batches to partial_fit again;
    # confirm that repeating the data is the intended way to train MultinomialNB.
    for epoch in range(epochs):

        for batch in range(batches):
            start = (samples * batch)//batches
            end = (samples * (batch + 1))//batches
            if start == end:
                # More batches than samples: nothing to fit in this slice.
                continue
            start_time = time.time()
            # .iloc makes the positional slicing explicit: after the shuffled
            # split the Series index no longer matches the row positions.
            classifier.partial_fit(X_train[start:end], y_train.iloc[start:end],
                                   sample_weight=w_train.iloc[start:end], classes=categories)
            if batch == 0:
                elapsed_time(start_time, 'Batch')
        if epoch % 5 == 0:
            y_predicted = classifier.predict(X_test)
            score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted, sample_weight=w_test)
            print('finished epoch: ', epoch, 'with score:', score)
            dump(classifier, 'models/checkpoint/' + language + '_classifier_' + run_id + '_' + str(epoch) + '.joblib')

    y_predicted = classifier.predict(X_test)
    score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted, sample_weight=w_test)
    print('Final score: ', score)

    dump(classifier, 'models/' + language + '_classifier_' + run_id + '.joblib')

# Portuguese

In [9]:
# Categories with fewer than two titles are dropped first (see filter_small_categories).
filtered = filter_small_categories(train_portuguese)

train(
    titles=filtered['title'],
    labels=filtered['category'],
    weights=filtered['weight'],
    language='portuguese',
    batches=800,
    epochs=60,
    max_features=20000,
    test_size=0.002,
)

(9980000,) (20000,) (9980000,) (20000,)
Run id:  cb7e1220a901413aa002e26667cf29e6


  'stop_words.' % sorted(inconsistent))


Fit vectorizer:  2419.16743850708
Word2Vec X_train:  2273.612967967987
Word2Vec X_test:  4.434809923171997
samples per batch:  12475
Batch:  1.2439532279968262
finished epoch:  0 with score: 0.7169444564409991
Batch:  1.057809591293335
Batch:  1.0532116889953613
Batch:  1.0431883335113525
Batch:  1.0373973846435547
Batch:  1.056246042251587




finished epoch:  5 with score: 0.7905628376022475
Batch:  1.2663767337799072
Batch:  1.0384061336517334
Batch:  1.0395982265472412
Batch:  1.0424039363861084
Batch:  1.0501832962036133




finished epoch:  10 with score: 0.8036676947116833
Batch:  1.4243512153625488
Batch:  1.0485424995422363
Batch:  1.050374984741211
Batch:  1.0402331352233887
Batch:  1.0450329780578613




finished epoch:  15 with score: 0.8084125185388787
Batch:  1.163055419921875
Batch:  1.0377395153045654
Batch:  1.042466402053833
Batch:  1.0449936389923096
Batch:  1.0508453845977783




finished epoch:  20 with score: 0.8113600272390719
Batch:  1.2137837409973145
Batch:  1.0526952743530273
Batch:  1.0332963466644287
Batch:  1.048144817352295
Batch:  1.0456194877624512




finished epoch:  25 with score: 0.8125315743454791
Batch:  1.2368619441986084
Batch:  1.0544018745422363
Batch:  1.033679723739624
Batch:  1.0495307445526123
Batch:  1.0544626712799072




finished epoch:  30 with score: 0.8133931204622632
Batch:  1.2157008647918701
Batch:  1.0461325645446777
Batch:  1.038977861404419
Batch:  1.0388455390930176
Batch:  1.0452547073364258




finished epoch:  35 with score: 0.8144510150840255
Batch:  1.2010583877563477
Batch:  1.051405668258667
Batch:  1.0573959350585938
Batch:  1.050283670425415
Batch:  1.0396780967712402




finished epoch:  40 with score: 0.8153922459781121
Batch:  1.189058780670166
Batch:  1.040757417678833
Batch:  1.0504093170166016
Batch:  1.045663595199585
Batch:  1.0397896766662598




finished epoch:  45 with score: 0.8158434670174595
Batch:  1.2622241973876953
Batch:  1.0428385734558105
Batch:  1.043147325515747
Batch:  1.0564706325531006
Batch:  1.3774182796478271




finished epoch:  50 with score: 0.8169656068289127
Batch:  1.697082757949829
Batch:  1.0552968978881836
Batch:  1.0593597888946533
Batch:  1.0440232753753662
Batch:  1.061199426651001




finished epoch:  55 with score: 0.8171818491741308
Batch:  1.620112657546997
Batch:  1.0542919635772705
Batch:  1.0568559169769287
Batch:  1.0579240322113037




Final score:  0.8171494065095651


# Spanish

In [10]:
# Categories with fewer than two titles are dropped first (see filter_small_categories).
filtered = filter_small_categories(train_spanish)

train(
    titles=filtered['title'],
    labels=filtered['category'],
    weights=filtered['weight'],
    language='spanish',
    batches=800,
    epochs=60,
    max_features=20000,
    test_size=0.002,
)

(9980000,) (20000,) (9980000,) (20000,)
Run id:  0495c1663b1442518c1282fa490a8be2


  'stop_words.' % sorted(inconsistent))


Fit vectorizer:  2160.6638283729553
Word2Vec X_train:  2164.277093410492
Word2Vec X_test:  4.3777549266815186
samples per batch:  12475
Batch:  1.604860782623291
finished epoch:  0 with score: 0.6894358133677794
Batch:  1.6023211479187012
Batch:  1.0592050552368164
Batch:  1.0514397621154785
Batch:  1.0554003715515137
Batch:  1.0685057640075684
finished epoch:  5 with score: 0.7703840962260698
Batch:  1.3756449222564697
Batch:  1.0657825469970703
Batch:  1.0580158233642578
Batch:  1.0552031993865967
Batch:  1.0449573993682861
finished epoch:  10 with score: 0.7873343422612727
Batch:  1.2690224647521973
Batch:  1.0672268867492676
Batch:  1.0845885276794434
Batch:  1.0503854751586914
Batch:  1.0615041255950928




finished epoch:  15 with score: 0.7934842372674912
Batch:  1.2656989097595215
Batch:  1.0597023963928223
Batch:  1.0645697116851807
Batch:  1.0622332096099854
Batch:  1.0587666034698486




finished epoch:  20 with score: 0.7969632281524075
Batch:  1.338318109512329
Batch:  1.0685498714447021
Batch:  1.0404064655303955
Batch:  1.0657014846801758
Batch:  1.0575990676879883




finished epoch:  25 with score: 0.7984228690362879
Batch:  1.2653555870056152
Batch:  1.064096450805664
Batch:  1.0597388744354248
Batch:  1.0601775646209717
Batch:  1.0715968608856201




finished epoch:  30 with score: 0.7993922764111777
Batch:  1.3590261936187744
Batch:  1.050959587097168
Batch:  1.0648865699768066
Batch:  1.0602929592132568
Batch:  1.0490484237670898




finished epoch:  35 with score: 0.7991020767525996
Batch:  1.3765270709991455
Batch:  1.0627126693725586
Batch:  1.0554227828979492
Batch:  1.0449397563934326
Batch:  1.0544843673706055




finished epoch:  40 with score: 0.7991458023737465
Batch:  1.4795165061950684
Batch:  1.045588493347168
Batch:  1.04170560836792
Batch:  1.042811632156372
Batch:  1.0482923984527588




finished epoch:  45 with score: 0.7996675192198194
Batch:  1.2495479583740234
Batch:  1.042975902557373
Batch:  1.0727779865264893
Batch:  1.0554697513580322
Batch:  1.0508873462677002




finished epoch:  50 with score: 0.8002197057603094
Batch:  1.380624771118164
Batch:  1.0443518161773682
Batch:  1.082435131072998
Batch:  1.053795337677002
Batch:  1.0418567657470703




finished epoch:  55 with score: 0.8003359171533589
Batch:  1.2525625228881836
Batch:  1.0906317234039307
Batch:  1.0672686100006104
Batch:  1.0749897956848145




Final score:  0.8003615999313354
