In [1]:
import os
import re

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from cleaner import TextCleanerTransformer

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load the labelled requests and drop rows with missing values. The raw frame
# is never mutated, so re-running this cell always starts from the file.
raw_train = pd.read_csv('data/train.csv')
complete_train = raw_train.dropna()

# Encode the string intent labels as integers. `le` is kept at module level
# because later cells use it to map predictions back to label names.
le = LabelEncoder()
all_train = complete_train.assign(
    Label=le.fit_transform(complete_train['Label'].values)
)

# Hold out 30% of the labelled data for evaluation; the seed is fixed so the
# split is reproducible.
train, test = train_test_split(all_train, test_size=0.3, random_state=10, shuffle=True)

# Text pipeline: clean/stem -> unigram+bigram counts -> TF-IDF -> naive Bayes.
text_clf = Pipeline([
    ('stemm', TextCleanerTransformer(
        WordPunctTokenizer(),
        SnowballStemmer("portuguese", ignore_stopwords=True),
        [
            # Regex replacements tried during experimentation; all disabled.
            # NOTE: `\b` must be written in a raw string, otherwise Python
            # turns it into a backspace character instead of a word boundary.
            #(r"\bumidade\b", "humidade"),
            #("Vamos", "vamos"),
            #("Preciso", "preciso"),
            #("Diga", "diga"),
            #("Avalie", "avalie"),
            #("Mostre", "mostre"),
            #("Reproduzir", "reproduzir"),
            #("Mostre", "mostre"),
            #("Precisa", "precisa"),
            #("Adicionar", "adicionar"),
            #("Olhe", "olhe"),
            #("Quero", "quero"),
            #("Encontre", "encontre"),
            #("Você", "você"),
        ]
    )),
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

# Hyper-parameter grid for the search (despite the name, these are not
# fit-time parameters): 3 * 2 * 2 * 2 * 2 = 48 candidates.
fit_params = {
    'tfidf__norm': [None, 'l1', 'l2'],
    'tfidf__use_idf': [False, True],
    'tfidf__smooth_idf': [False, True],
    'tfidf__sublinear_tf': [False, True],
    'clf__alpha': (1, 10)
}

# Exhaustive 3-fold search scored on accuracy, using every available core.
gcv = GridSearchCV(text_clf, fit_params,
                   scoring='accuracy',
                   cv=3, verbose=True,
                   n_jobs=-1)

gcv.fit(train['Request'], train['Label'].values)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  4.3min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('stemm', TextCleanerTransformer(lower=True, regex_list=[], remove_punct=True,
            stemmer=<nltk.stem.snowball.SnowballStemmer object at 0x7fec95858e80>,
            tokenizer=WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'tfidf__norm': [None, 'l1', 'l2'], 'tfidf__use_idf': [False, True], 'tfidf__smooth_idf': [False, True], 'tfidf__sublinear_tf': [False, True], 'clf__alpha': (1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [7]:
# Score the held-out split with the refit best pipeline from the grid search.
# Each call gets its own fresh iterator of stringified requests, because an
# iterator is exhausted after a single pass.
best_pipeline = gcv.best_estimator_

y_proba = best_pipeline.predict_proba(str(req) for req in test['Request'].values)

y_pred = best_pipeline.predict(str(req) for req in test['Request'].values)

In [8]:
# One-vs-rest ROC AUC for each intent on the held-out split.
# Uses `y_proba` from the prediction cell; the previous reference to `proba`
# was never defined in this notebook and raised NameError on a fresh kernel.
# Assumes column i of `y_proba` corresponds to encoded label i — this holds
# when every encoded class appears in the training split (TODO confirm).
for class_idx, class_name in enumerate(le.classes_):
    print(class_idx, class_name, roc_auc_score(test['Label'] == class_idx, y_proba[:, class_idx]))

0 add_to_playlist 0.9881649578748688
1 book_restaurant 0.9823485137714755
2 get_weather 0.9861301541501692
3 no_intent 0.9570106095699291
4 play_music 0.983017168542898
5 rate_book 0.9882522956173605
6 search_creative_work 0.980379996178335
7 search_screening_event 0.9891600601988456


In [23]:
# Predict intents for the unlabelled competition file with the best pipeline.
real_test = pd.read_csv('data/test.csv')

submission_model = gcv.best_estimator_

# A fresh iterator of stringified requests is built for each call, since an
# iterator can only be consumed once.
real_test_y_proba = submission_model.predict_proba(
    str(req) for req in real_test['Request'].values
)

real_test_y_pred = submission_model.predict(
    str(req) for req in real_test['Request'].values
)

# Map the integer predictions back to the original intent names.
real_test['Label'] = le.inverse_transform(real_test_y_pred)

  if diff:


In [24]:
# Create the output directory for submission files. Unlike the bare `mkdir`
# automagic, this is plain Python and does not fail when the directory already
# exists, so the notebook survives Restart & Run All.
os.makedirs('submissions', exist_ok=True)

In [32]:
# Persist the predictions as the first submission; the row index is not written.
submission_path = 'submissions/model_1.csv'
real_test.to_csv(submission_path, index=False)

In [33]:
# Read the saved file back to check what was actually written to disk.
saved_submission = pd.read_csv('submissions/model_1.csv')
saved_submission

Unnamed: 0,Request,Label
0,Quais filmes estão atualmente em cartaz no Spe...,search_screening_event
1,Faça uma lista do que está faltando.,no_intent
2,Dá-me os horários do filme para filmes exibido...,search_screening_event
3,"reservar um restaurante em Clawson, MS para um",book_restaurant
4,É sobre o tempo que os franceses aprenderam da...,no_intent
5,Deixe-me saber quando o Maiden Danced to Death...,search_screening_event
6,Eu quero enviar músicas para as listas de ann ...,add_to_playlist
7,classifique este livro 1 de 6,rate_book
8,Encontre os filmes e os horários dos filmes no...,search_screening_event
9,É difícil recordar tantos discursos quando se ...,no_intent
