In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import uuid
import time

from joblib import dump, load

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import balanced_accuracy_score

from collections import defaultdict

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Fetch the NLTK data packages this notebook relies on.
nltk.download('punkt') # Punkt tokenizer data (tokenization algorithm)
nltk.download('stopwords') # stop-word lists read through nltk.corpus.stopwords

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the training set and attach a per-row sample weight: rows whose
# label_quality is 'reliable' count fully, every other row is down-weighted.
# NOTE(review): the name `train` is rebound by `def train(...)` further down,
# so cells that read this DataFrame must run before that definition.
unreliable_weight = 0.4
train = pd.read_csv('data/train.csv')
train['weight'] = np.where(train['label_quality'] == 'reliable', 1., unreliable_weight)

In [3]:
# One frame per language, selected with a boolean mask on the `language` column.
train_spanish = train.loc[train['language'] == 'spanish']
train_portuguese = train.loc[train['language'] == 'portuguese']

In [4]:
# Per-language subsets restricted to rows whose label_quality is 'reliable'.
train_spanish_reliable = train_spanish.loc[train_spanish['label_quality'] == 'reliable']
train_portuguese_reliable = train_portuguese.loc[train_portuguese['label_quality'] == 'reliable']

In [5]:
def filter_small_categories(df):
    """Return the rows of ``df`` whose category has more than one titled row.

    Rows are grouped by ``category`` and the non-null ``title`` values are
    counted per group; every row belonging to a category with a count
    greater than one is kept, in its original order.
    """
    titles_per_category = df.groupby(['category'])['title'].count()
    kept_categories = titles_per_category.index[titles_per_category > 1]
    return df[df['category'].isin(kept_categories)]

In [6]:
# Characters removed from every document: ASCII digits, a handful of Latin-1
# symbols and all ASCII punctuation.  The punctuation is run through re.escape
# so that ']', '\', '^' and '-' are literal members of the character class;
# concatenating it raw turned '\]' into an escaped bracket, which left
# backslashes in the text.
_STRIP_CHARS = re.compile('[0-9¡¨ª®°´·º»½¿ø' + re.escape(string.punctuation) + ']')


def my_preprocessor(doc):
    """Strip digits, punctuation and selected symbols from ``doc`` and lowercase it.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        ``doc`` with every character matched by ``_STRIP_CHARS`` removed,
        converted to lower case.  Whitespace is left untouched.
    """
    return _STRIP_CHARS.sub('', doc).lower()

# One Snowball stemmer per supported language, built once and shared by the
# tokenizers and by the stop-word stemming done at training time.
spanish_stemmer, portuguese_stemmer = (
    SnowballStemmer(language) for language in ('spanish', 'portuguese')
)

def spanish_stemmer_tokenizer(doc):
    """Split ``doc`` with ``word_tokenize`` and return the Spanish stem of each token."""
    return list(map(spanish_stemmer.stem, word_tokenize(doc)))

def portuguese_stemmer_tokenizer(doc):
    """Split ``doc`` with ``word_tokenize`` and return the Portuguese stem of each token."""
    return list(map(portuguese_stemmer.stem, word_tokenize(doc)))

In [7]:
def elapsed_time(start, message):
    """Print ``message`` followed by the seconds elapsed since ``start``.

    ``start`` is a timestamp previously obtained from ``time.time()``.
    """
    seconds = time.time() - start
    print(message + ': ', seconds)

In [8]:
def train(titles, labels, weights, language='spanish', batches=200, epochs=400, max_features=8000,
          min_df=3, max_df=0.7, test_size=0.1):
    """Vectorize titles, train a MultinomialNB classifier in mini-batches and save both.

    The data is split into train/test parts (stratified on ``labels``,
    ``random_state=42``).  A ``CountVectorizer`` using stemmed stop words and
    the language-specific stemming tokenizer is fitted on the training titles
    and dumped to ``models/<language>_vectorizer_<run_id>.joblib``.  The
    classifier is then fed ``batches`` consecutive slices of the training
    matrix through ``partial_fit``, repeated ``epochs`` times.  Every fifth
    epoch the weighted balanced accuracy on the test split is printed and a
    checkpoint is dumped under ``models/checkpoint/``; the final classifier is
    dumped to ``models/<language>_classifier_<run_id>.joblib``.

    Parameters
    ----------
    titles, labels, weights
        Parallel collections of documents, target categories and sample
        weights.  ``labels`` must expose ``.unique()`` after the split (for
        example a pandas Series).
    language : str
        ``'spanish'`` selects the Spanish stemmer/tokenizer; any other value
        selects the Portuguese ones.  The value is also passed to
        ``stopwords.words`` and used in the output file names.
    batches : int
        Number of slices the training matrix is cut into per epoch.
    epochs : int
        Number of passes over all batches.
    max_features, min_df, max_df
        Forwarded to ``CountVectorizer``.
    test_size
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    import os  # local import: only needed to prepare the output directories

    titles_train, titles_test, y_train, y_test, w_train, w_test = train_test_split(
        titles, labels, weights, test_size=test_size, random_state=42, stratify=labels)
    print(titles_train.shape, titles_test.shape, y_train.shape, y_test.shape)

    run_id = uuid.uuid4().hex
    print('Run id: ', run_id)

    # Create the output directories before any expensive work so that a
    # missing folder cannot abort the run after the vectorizer has been fitted.
    # Creating 'models/checkpoint/' also creates its parent 'models/'.
    os.makedirs('models/checkpoint/', exist_ok=True)

    # Pick the language-specific pieces once; everything else is shared.
    if language == 'spanish':
        stemmer, tokenizer = spanish_stemmer, spanish_stemmer_tokenizer
    else:
        stemmer, tokenizer = portuguese_stemmer, portuguese_stemmer_tokenizer

    # Stop words are stemmed so they can match the stemmed tokens.
    stop_words = [stemmer.stem(word) for word in stopwords.words(language)]

    # NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides
    # the built-in strip_accents/lowercase stage, so `strip_accents='unicode'`
    # presumably has no effect here -- confirm whether accents should be
    # stripped inside my_preprocessor instead.
    vectorizer = CountVectorizer(max_features=max_features, max_df=max_df, min_df=min_df, strip_accents='unicode',
                                 stop_words=stop_words, tokenizer=tokenizer,
                                 preprocessor=my_preprocessor)

    start_time = time.time()
    vectorizer.fit(titles_train)
    elapsed_time(start_time, 'Fit vectorizer')

    dump(vectorizer, 'models/' + language + '_vectorizer_' + run_id + '.joblib')

    # The 'Word2Vec' timing labels are kept as-is; the step being timed is
    # CountVectorizer.transform.
    start_time = time.time()
    X_train = vectorizer.transform(titles_train)
    elapsed_time(start_time, 'Word2Vec X_train')

    start_time = time.time()
    X_test = vectorizer.transform(titles_test)
    elapsed_time(start_time, 'Word2Vec X_test')

    samples = X_train.shape[0]
    print('samples per batch: ', samples//batches)

    classifier = MultinomialNB()

    categories = y_train.unique()

    # Slice plain arrays so that start:end is always positional, independent
    # of the (shuffled) index carried over from the train/test split.
    y_train_values = np.asarray(y_train)
    w_train_values = np.asarray(w_train)

    # Batch boundaries do not change between epochs; compute them once.
    batch_bounds = [((samples * batch)//batches, (samples * (batch + 1))//batches)
                    for batch in range(batches)]

    # NOTE(review): every epoch feeds the same batches to partial_fit again --
    # confirm that repeating the passes is the intended training scheme.
    for epoch in range(epochs):

        for batch, (start, end) in enumerate(batch_bounds):
            start_time = time.time()
            classifier.partial_fit(X_train[start:end], y_train_values[start:end],
                                   sample_weight=w_train_values[start:end], classes=categories)
            if batch == 0:
                # Time only the first batch of each epoch to keep the log short.
                elapsed_time(start_time, 'Batch')
        if epoch % 5 == 0:
            y_predicted = classifier.predict(X_test)
            score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted, sample_weight=w_test)
            print('finished epoch: ', epoch, 'with score:', score)
            dump(classifier, 'models/checkpoint/' + language + '_classifier_' + run_id + '_' + str(epoch) + '.joblib')

    y_predicted = classifier.predict(X_test)
    score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted, sample_weight=w_test)
    print('Final score: ', score)

    dump(classifier, 'models/' + language + '_classifier_' + run_id + '.joblib')

# Portuguese

In [9]:
# filtered = filter_small_categories(train_portuguese)

# train(filtered['title'], filtered['category'], filtered['weight'], language='portuguese', max_features = 4000,
#      batches=500, epochs=200, test_size=0.002)

# Spanish

In [10]:
# Train the Spanish model on every category that has more than one titled row.
filtered = filter_small_categories(train_spanish)

train(
    titles=filtered['title'],
    labels=filtered['category'],
    weights=filtered['weight'],
    language='spanish',
    batches=500,
    epochs=200,
    max_features=10000,
    test_size=0.002,
)

(9980000,) (20000,) (9980000,) (20000,)
Run id:  cd606050dad24c57874bdf03d6da036d


  'stop_words.' % sorted(inconsistent))


Fit vectorizer:  2110.368017911911
Word2Vec X_train:  2123.940196275711
Word2Vec X_test:  4.245254755020142
samples per batch:  19960
Batch:  1.5368986129760742
finished epoch:  0 with score: 0.6962915167066682
Batch:  1.3627755641937256
Batch:  1.1748230457305908
Batch:  1.1630909442901611
Batch:  1.2403192520141602
Batch:  1.1729037761688232
finished epoch:  5 with score: 0.7649088852394931
Batch:  1.3285276889801025
Batch:  1.167435646057129
Batch:  1.2088963985443115
Batch:  1.1531434059143066
Batch:  1.1601274013519287
finished epoch:  10 with score: 0.7756596247771834
Batch:  1.344271183013916
Batch:  1.1833646297454834
Batch:  1.2114191055297852
Batch:  1.2492904663085938
Batch:  1.1510803699493408




finished epoch:  15 with score: 0.7796035283445066
Batch:  1.3869848251342773
Batch:  1.1901185512542725
Batch:  1.189366340637207
Batch:  1.1997926235198975
Batch:  1.174189805984497




finished epoch:  20 with score: 0.7830644563643302
Batch:  1.3599433898925781
Batch:  1.1884722709655762
Batch:  1.2048149108886719
Batch:  1.2192106246948242
Batch:  1.2355983257293701




finished epoch:  25 with score: 0.783806604148177
Batch:  1.4095282554626465
Batch:  1.1902923583984375
Batch:  1.1806635856628418
Batch:  1.1831870079040527
Batch:  1.194385051727295




finished epoch:  30 with score: 0.7843472742146448
Batch:  1.3985562324523926
Batch:  1.2078783512115479
Batch:  1.1753106117248535
Batch:  1.185072660446167
Batch:  1.246666431427002




finished epoch:  35 with score: 0.7840816851383829
Batch:  1.3660006523132324
Batch:  1.161254644393921
Batch:  1.1921112537384033
Batch:  1.163806438446045
Batch:  1.1802904605865479




finished epoch:  40 with score: 0.7853025505623679
Batch:  1.2400434017181396
Batch:  1.1955256462097168
Batch:  1.2245452404022217
Batch:  1.2198357582092285
Batch:  1.1902623176574707




finished epoch:  45 with score: 0.7858406802718934
Batch:  1.3220443725585938
Batch:  1.200472354888916
Batch:  1.2056329250335693
Batch:  1.1967213153839111
Batch:  1.1684441566467285




finished epoch:  50 with score: 0.7857387713560918
Batch:  1.3323299884796143
Batch:  1.171705961227417
Batch:  1.2017884254455566
Batch:  1.179335355758667
Batch:  1.2068662643432617




finished epoch:  55 with score: 0.7857412969775341
Batch:  1.3276722431182861
Batch:  1.1634058952331543
Batch:  1.2055895328521729
Batch:  1.1834776401519775
Batch:  1.2086818218231201




finished epoch:  60 with score: 0.78544490738159
Batch:  1.3160605430603027
Batch:  1.167055368423462
Batch:  1.2017898559570312
Batch:  1.2234456539154053
Batch:  1.1887028217315674




finished epoch:  65 with score: 0.784906748169647
Batch:  1.3581159114837646
Batch:  1.2218453884124756
Batch:  1.2017719745635986
Batch:  1.195230484008789
Batch:  1.1887061595916748




finished epoch:  70 with score: 0.7847059065999087
Batch:  1.3419678211212158
Batch:  1.179741382598877
Batch:  1.1864898204803467
Batch:  1.1787676811218262
Batch:  1.1788861751556396




finished epoch:  75 with score: 0.7850297494428935
Batch:  1.2818958759307861
Batch:  1.1539642810821533
Batch:  1.1874041557312012
Batch:  1.2217772006988525
Batch:  1.1726677417755127


KeyboardInterrupt: 