In [17]:
import os
import re
import string
import time
import uuid
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from joblib import dump, load
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK resources used below; the recorded output shows both were already up to date.
nltk.download('punkt') # tokenizer models (used for tokenization)
nltk.download('stopwords') # stop-word lists read through stopwords.words(language)

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# NOTE(review): the name `train` is rebound by `def train(...)` further down, so this
# DataFrame is only reachable under this name until that cell runs.
train = pd.read_csv('data/train.csv')
unreliable_weight = 0.5
# Sample weight per row: 1.0 for rows labelled 'reliable', `unreliable_weight` for all others.
# A boolean-mask assignment replaces the per-row Python lambda.
train['weight'] = unreliable_weight
train.loc[train['label_quality'] == 'reliable', 'weight'] = 1.

In [19]:
# One frame per language value, selected from the `language` column.
train_spanish = train.loc[train['language'] == 'spanish']
train_portuguese = train.loc[train['language'] == 'portuguese']

In [20]:
# Rows whose label_quality is 'reliable', per language.
train_spanish_reliable = train_spanish.loc[train_spanish['label_quality'] == 'reliable']
train_portuguese_reliable = train_portuguese.loc[train_portuguese['label_quality'] == 'reliable']

In [21]:
def filter_small_categories(df):
    """Return the rows of ``df`` whose category has more than one non-null title.

    Categories are taken from the ``category`` column and counted through the
    ``title`` column; rows of categories with a count of 0 or 1 are dropped.
    """
    titles_per_category = df.groupby(['category'])['title'].count()
    kept_categories = titles_per_category.index[titles_per_category > 1]
    return df[df['category'].isin(kept_categories)]

In [22]:
# Characters removed from every title: ASCII digits, a handful of Latin-1 symbols and
# all of string.punctuation. The punctuation is escaped because, unescaped inside a
# character class, its backslash-bracket pair is read as an escaped ']' and the
# backslash itself is never matched.
_STRIP_CHARS_RE = re.compile('[0-9¡¨ª®°´·º»½¿ø' + re.escape(string.punctuation) + ']')


def my_preprocessor(doc):
    """Delete digits, punctuation and selected symbols from ``doc`` and lowercase it.

    Matched characters are removed outright (not replaced by a space), so
    tokens separated only by punctuation are joined together.
    """
    return _STRIP_CHARS_RE.sub('', doc).lower()

# Snowball stemmers, one per language; used by the tokenizers below and by train()
# when it stems the stop-word list.
portuguese_stemmer = SnowballStemmer('portuguese')
spanish_stemmer = SnowballStemmer('spanish')

def spanish_stemmer_tokenizer(doc):
    """Tokenize ``doc`` with ``word_tokenize`` and return the Spanish stem of each token."""
    return list(map(spanish_stemmer.stem, word_tokenize(doc)))

def portuguese_stemmer_tokenizer(doc):
    """Tokenize ``doc`` with ``word_tokenize`` and return the Portuguese stem of each token."""
    return list(map(portuguese_stemmer.stem, word_tokenize(doc)))

In [23]:
def elapsed_time(start, message):
    """Print ``message`` followed by the seconds elapsed since ``start``.

    ``start`` is a timestamp previously obtained from ``time.time()``.
    """
    seconds = time.time() - start
    print(message + ': ', seconds)

In [24]:
def train(titles, labels, weights, language='spanish', batches=200, epochs=400, max_features=8000, min_df=3, max_df=0.7):
    """Fit a CountVectorizer and a MultinomialNB classifier on titles and save both.

    The data is split 90/10 (stratified on ``labels``, fixed random_state). The
    vectorizer is fitted on the training titles only; the classifier is then
    trained with ``partial_fit`` over ``batches`` contiguous slices of the
    training matrix, repeated ``epochs`` times.

    Parameters
    ----------
    titles, labels, weights : pandas Series
        Titles, category labels and per-sample weights, aligned row by row.
    language : str
        'spanish' selects the Spanish stemmer/tokenizer; any other value selects
        the Portuguese ones. Also passed to ``stopwords.words``.
    batches : int
        Number of slices the training set is cut into per epoch.
    epochs : int
        Number of passes over all batches.
    max_features, min_df, max_df
        Forwarded unchanged to ``CountVectorizer``.

    Side effects
    ------------
    Writes with joblib (run_id is a fresh uuid4 hex string, also printed):
      * models/<language>_vectorizer_<run_id>.joblib
      * models/checkpoint/<language>classifier_<run_id>_<epoch>.joblib every 5th epoch
      * models/<language>_classifier_<run_id>.joblib
    Prints split shapes, timings and balanced-accuracy scores. Returns None.
    """
    titles_train, titles_test, y_train, y_test, w_train, w_test = train_test_split(
        titles, labels, weights, test_size=0.1, random_state=42, stratify=labels)
    print(titles_train.shape, titles_test.shape, y_train.shape, y_test.shape)

    run_id = uuid.uuid4().hex
    print('Run id: ', run_id)

    # Create the output directories before the expensive fit so that a missing
    # folder cannot fail the run after the vectorizer has already been fitted.
    os.makedirs('models/checkpoint', exist_ok=True)

    if language == 'spanish':
        stemmer, tokenizer = spanish_stemmer, spanish_stemmer_tokenizer
    else:
        stemmer, tokenizer = portuguese_stemmer, portuguese_stemmer_tokenizer

    # Stop words are stemmed so they can match the stemmed tokens the tokenizer emits.
    stop_words = [stemmer.stem(word) for word in stopwords.words(language)]
    # NOTE(review): scikit-learn documents `preprocessor` as overriding the
    # strip_accents/lowercase stage, so strip_accents='unicode' presumably has no
    # effect here -- confirm whether accents were meant to be stripped.
    vectorizer = CountVectorizer(max_features=max_features, max_df=max_df, min_df=min_df, strip_accents='unicode',
                                 stop_words=stop_words, tokenizer=tokenizer,
                                 preprocessor=my_preprocessor)

    start_time = time.time()
    vectorizer.fit(titles_train)
    elapsed_time(start_time, 'Fit vectorizer')

    dump(vectorizer, 'models/' + language + '_vectorizer_' + run_id + '.joblib')

    start_time = time.time()
    X_train = vectorizer.transform(titles_train)
    elapsed_time(start_time, 'Word2Vec X_train')

    start_time = time.time()
    X_test = vectorizer.transform(titles_test)
    elapsed_time(start_time, 'Word2Vec X_test')

    samples = X_train.shape[0]
    print('samples per batch: ', samples//batches)

    classifier = MultinomialNB()

    # partial_fit needs the full label set, since a single batch may not contain every class.
    categories = y_train.unique()

    # NOTE(review): every epoch feeds the same batches to partial_fit again --
    # confirm that repeated passes are intended for this estimator.
    for epoch in range(epochs):

        for batch in range(batches):
            start_time = time.time()
            start = (samples * batch)//batches
            end = (samples * (batch + 1))//batches
            if start == end:
                # More batches than samples: nothing to fit in this slice.
                continue
            classifier.partial_fit(X_train[start:end], y_train[start:end],
                                   sample_weight=w_train[start:end], classes=categories)
            if batch == 1:
                # Time a single representative batch per epoch.
                elapsed_time(start_time, 'Batch')
        if epoch % 5 == 0:
            y_predicted = classifier.predict(X_test)
            score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted, sample_weight=w_test)
            print('finished epoch: ', epoch, 'with score:', score)
            # NOTE(review): this name has no '_' between language and 'classifier',
            # unlike the final model file below; left unchanged in case other code loads it.
            dump(classifier, 'models/checkpoint/' + language + 'classifier_' + run_id + '_' + str(epoch) + '.joblib')

    y_predicted = classifier.predict(X_test)
    # NOTE(review): the final score is unweighted while the periodic score above
    # passes sample_weight=w_test -- confirm which one is intended.
    score = balanced_accuracy_score(y_true=y_test, y_pred=y_predicted)
    print('Final score: ', score)

    dump(classifier, 'models/' + language + '_classifier_' + run_id + '.joblib')

# Portuguese

In [25]:
# filtered = filter_small_categories(train_portuguese_reliable)
# train(filtered['title'], filtered['category'], filtered['weight'], language='portuguese')

# Spanish

In [None]:
# Drop Spanish categories that do not have more than one title, and time the filtering.
start_time = time.time()
filtered = filter_small_categories(train_spanish)
elapsed_time(start_time, 'filtering categories')

# This run caps the vocabulary at 100 features; every other train() default applies.
train(
    titles=filtered['title'],
    labels=filtered['category'],
    weights=filtered['weight'],
    language='spanish',
    max_features=100,
)

filtering categories:  3.737879514694214
(9000000,) (1000000,) (9000000,) (1000000,)
Run id:  439c8b8b558f49f79e5dd1b5c2e467b2


  'stop_words.' % sorted(inconsistent))


Fit vectorizer:  2024.0291199684143
Word2Vec X_train:  2057.8113403320312
Word2Vec X_test:  238.27207207679749
samples per batch:  45000
Batch:  4.319194793701172
