# Tagging Exercise

**TODO:** The instructions and some of the steps in this notebook are still pending.

## Setup

In [1]:
from tagger import *
from sklearn.pipeline import Pipeline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Optional: uncomment to download the raw event export to the path that the
# data-loading cell below reads from (needs network access and `wget`).
#!wget http://citypolarna.se/event_data.csv -O "../data/raw/citypolarna_public_events_out.csv"

In [3]:
from tagger.dataset.cleaning import load_datasets

# Read the raw export and unpack the five values returned by load_datasets:
# train/test events, train/test tags, and `top_tags`.
(
    events_train,
    tags_train,
    events_test,
    tags_test,
    top_tags,
) = load_datasets("../data/raw/citypolarna_public_events_out.csv")

## Preprocessing

In [4]:
# Baseline text preparation; the steps run in the order listed.
# Defaults, as shown by the fitted pipeline's repr further down:
#   ExtractText  -> columns=['description'], add_time_of_day=False
#   CharacterSet -> digits=False
#   Tokenize     -> method='word_punct'
baseline_preprocessing = Pipeline(
    steps=[
        ('fields', ExtractText()),
        ('html', HTMLToText()),
        ('cset', CharacterSet(punctuation=False)),
        ('lower', Lowercase()),
        ('token', Tokenize()),
    ]
)

In [5]:
# Sanity check: run the baseline preprocessing on the first ten training
# events and display the resulting token lists.
baseline_preprocessing.fit_transform(events_train[:10])

7318    [vi, är, några, som, tänkt, fika, på, söndag, ...
9088    [hej, då, var, det, dags, för, en, bokklubbstr...
4793    [pröva, på, att, dansa, kizomba, prova, på, kl...
4553    [på, fredag, är, det, premiär, för, grand, hot...
6068    [uppdatering, dddd, dd, dd, ändrat, sista, dat...
1732    [intressekoll, inför, kvällen, trädgårn, klubb...
8802    [hej, minsta, rundan, ever, haha, men, då, får...
9183    [missa, inte, denna, intima, och, självutlämna...
6929    [någon, som, vill, med, till, hävringe, fyr, f...
4779    [obs, det, riskerar, att, bli, fullt, eller, n...
Name: description, dtype: object

In [47]:
# Custom text preparation: same stages as the baseline, but it reads both the
# title and the description fields and appends an n-gram step.
my_preprocessing = Pipeline(
    steps=[
        ('fields', ExtractText(['title', 'description'], add_time_of_day=False)),
        ('html', HTMLToText()),
        ('cset', CharacterSet(punctuation=False, digits=False)),
        ('lower', Lowercase()),
        ('token', Tokenize()),
        # NOTE(review): (1, 1) presumably means unigrams only; the preview
        # output below contains single tokens only. Confirm against NGram.
        ('ngram', NGram(1, 1)),
    ]
)

In [48]:
# Sanity check: show the full token list the custom preprocessing produces
# for the first training event.
list(my_preprocessing.fit_transform(events_train[:1]))

[['fika',
  'vi',
  'är',
  'några',
  'som',
  'tänkt',
  'fika',
  'på',
  'söndag',
  'häng',
  'på',
  'om',
  'du',
  'vill',
  'ring',
  'om',
  'du',
  'kommer',
  'efter',
  'dd',
  'så',
  'förklarar',
  'jag',
  'var',
  'vi',
  'är',
  'dddd']]

## Feature Extraction

In [49]:
# Baseline features: a single bag-of-words step with default settings
# (the fitted repr further down shows binary=False).
baseline_features = Pipeline(steps=[('bow', BagOfWords())])

In [50]:
# Custom features: a single Tfidf step. The binary bag-of-words variant is
# kept commented out as an alternative.
my_features = Pipeline(
    steps=[
        # ('bow', BagOfWords(binary=True))
        ('tfidf', Tfidf()),
    ]
)

## Classification Algorithms

In [51]:
# Baseline classifier: NaiveBayes with default settings.
baseline_classifier = Pipeline(steps=[('nb', NaiveBayes())])

In [52]:
# Custom classifier: SparseToDense followed by a MultiLayerPerceptron with a
# single 1024-unit layer, trained for 4 epochs with batch size 64.
my_classifier = Pipeline(
    steps=[
        ('dense', SparseToDense()),
        # NOTE(review): this step is keyed 'lr' although it wraps a
        # MultiLayerPerceptron. The key is left unchanged because it is part of
        # the pipeline's parameter names (e.g. clf__lr__epochs).
        ('lr', MultiLayerPerceptron(layers=[1024], epochs=4, batch_size=64)),
    ]
)

## Evaluation

In [53]:
# End-to-end baseline: preprocessing -> features -> classifier.
baseline_model = Pipeline(
    steps=[
        ('pre', baseline_preprocessing),
        ('feat', baseline_features),
        ('clf', baseline_classifier),
    ]
)

In [54]:
%%time
# Fit the complete baseline pipeline on the training events and tags.
# The %%time cell magic above must stay on the first line of the cell.
baseline_model.fit(events_train, tags_train)

CPU times: user 3.57 s, sys: 70.7 ms, total: 3.64 s
Wall time: 3.65 s


Pipeline(memory=None,
     steps=[('pre', Pipeline(memory=None,
     steps=[('fields', ExtractText(add_time_of_day=False, columns=['description'])), ('html', HTMLToText()), ('cset', CharacterSet(digits=False, punctuation=False)), ('lower', Lowercase()), ('token', Tokenize(method='word_punct'))])), ('feat', Pipeline(memory=None, steps=[('bow', BagOfWords(binary=False))])), ('clf', Pipeline(memory=None, steps=[('nb', NaiveBayes())]))])

In [55]:
# End-to-end custom model: preprocessing -> features -> classifier.
my_model = Pipeline(
    steps=[
        ('pre', my_preprocessing),
        ('feat', my_features),
        ('clf', my_classifier),
    ]
)

In [56]:
%%time
# Fit the complete custom pipeline on the training events and tags.
# Slow cell: the recorded run below took about 8 minutes of wall time.
my_model.fit(events_train, tags_train)

NB! Cleared Keras session — already trained models will break
Fitting model:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              53435392  
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 72)                73800     
Total params: 53,513,288
Trainable params: 53,511,240
Non-trainable params: 2,048
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 59min 10s, sys: 1min, total: 1h 11s
Wall time: 8min 20s


Pipeline(memory=None,
     steps=[('pre', Pipeline(memory=None,
     steps=[('fields', ExtractText(add_time_of_day=False, columns=['title', 'description'])), ('html', HTMLToText()), ('cset', CharacterSet(digits=False, punctuation=False)), ('lower', Lowercase()), ('token', Tokenize(method='word_punct')), ('ngram', NGram(n_max=...('dense', SparseToDense()), ('lr', MultiLayerPerceptron(batch_size=64, epochs=4, layers=[1024]))]))])

## Submission

In [57]:
# Submit the baseline pipeline, passing the test events and tags as the local
# data. The printed report includes Hamming loss and exact match ratio.
submit_model(
    baseline_model,
    team_name="All your base are belong to us",
    model_name="baseline",
    local_events=events_test,
    local_tags=tags_test,
)

Team 'All your base are belong to us' submitting model 'baseline':
Pipeline(memory=None,
     steps=[('pre', Pipeline(memory=None,
     steps=[('fields', ExtractText(add_time_of_day=False, columns=['description'])), ('html', HTMLToText()), ('cset', CharacterSet(digits=False, punctuation=False)), ('lower', Lowercase()), ('token', Tokenize(method='word_punct'))])), ('feat', Pipeline(memory=None, steps=[('bow', BagOfWords(binary=False))])), ('clf', Pipeline(memory=None, steps=[('nb', NaiveBayes())]))])
------------------------------------------------------------------------
Hamming loss for submission: 0.04233654876741694
Exact match ratio for submission: 0.07395498392282958


In [58]:
# Submit the custom pipeline, passing the test events and tags as the local
# data. The printed report includes Hamming loss and exact match ratio.
# NOTE(review): model_name says "1-2-gram", but my_preprocessing is configured
# with NGram(1, 1). Confirm the name matches the configuration being submitted.
submit_model(
    my_model,
    team_name="Little gray cells",
    model_name="1-2-gram",
    local_events=events_test,
    local_tags=tags_test,
)

Team 'Little gray cells' submitting model '1-2-gram':
Pipeline(memory=None,
     steps=[('pre', Pipeline(memory=None,
     steps=[('fields', ExtractText(add_time_of_day=False, columns=['title', 'description'])), ('html', HTMLToText()), ('cset', CharacterSet(digits=False, punctuation=False)), ('lower', Lowercase()), ('token', Tokenize(method='word_punct')), ('ngram', NGram(n_max=...('dense', SparseToDense()), ('lr', MultiLayerPerceptron(batch_size=64, epochs=4, layers=[1024]))]))])
------------------------------------------------------------------------
Hamming loss for submission: 0.032087352625937836
Exact match ratio for submission: 0.2127545551982851


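**TODO:** Add per-label counts next to the per-label statistics below. Without them the numbers are hard to interpret: several labels show very high accuracy together with very low recall.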

%%time
# Compute per-label metrics for the custom model and display them.
# NOTE(review): this is given events_train/tags_train, the same data my_model
# was fitted on. Unless evaluate_per_label re-fits or cross-validates
# internally, these scores are measured on training data. Confirm.
stats = evaluate_per_label(my_model, top_tags, events_train, tags_train)
stats

In [22]:
# Rank the labels from highest to lowest AUC.
stats.sort_values(by='auc', ascending=False)

Unnamed: 0_level_0,accuracy,precision,recall,f1,auc
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
innebandy,0.998258,0.890443,0.928030,0.905250,0.963475
allsång,0.996517,1.000000,0.800451,0.888753,0.900226
träna,0.995445,0.905522,0.745098,0.817344,0.872006
bio,0.969051,0.900245,0.684833,0.776810,0.839147
poker,0.996517,0.949735,0.658440,0.766839,0.829017
...,...,...,...,...,...
vandra,0.991292,0.761905,0.063152,0.109301,0.531238
foto,0.988210,0.666667,0.048611,0.090498,0.524306
konst,0.973339,0.888889,0.048083,0.090866,0.523973
livemusik,0.985531,0.833333,0.038457,0.071682,0.519093


# (Testing)

In [59]:
from tagger._evaluation.perlabel import _per_label_metrics as plm

In [60]:
# Predict with the fitted custom model on the test events; the result is used
# for the per-label metrics in the next cell.
y_pred = my_model.predict(events_test)

In [61]:
import pandas as pd
# Tabulate the per-label metrics of the test-set predictions against the
# true test tags.
pd.DataFrame(data=plm(top_tags, tags_test, y_pred))

Unnamed: 0,tag,accuracy,precision,recall,f1,auc
0,mat,0.775456,0.734475,0.800467,0.766052,0.777339
1,musik,0.832797,0.765189,0.733858,0.749196,0.808846
2,fest,0.833333,0.705426,0.580851,0.637106,0.749595
3,fika,0.851554,0.766497,0.620123,0.685585,0.776704
4,teater,0.857449,0.795666,0.562363,0.658974,0.757761
...,...,...,...,...,...,...
67,innebandy,0.998392,0.882353,0.937500,0.909091,0.968209
68,vandra,0.989282,0.333333,0.052632,0.090909,0.525774
69,poker,0.997856,1.000000,0.714286,0.833333,0.857143
70,vernissage,0.990354,1.000000,0.217391,0.357143,0.608696
