# Packages

In [1]:
import numpy as np
import pandas as pd
import string
import sys

In [2]:
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [3]:
%load_ext autoreload
%autoreload 2

sys.path.append('..')

from src.data import raw_to_tokens, tokens_to_tagged, tagged_to_tokens, tagged_to_poi_street
from src.models.scikit_classifier import ScikitClassifier

# Import Word2Vec

In [4]:
# Load the saved gensim Word2Vec model; it is passed to ScikitClassifier(word2vec=...) in section 2.
# NOTE(review): the 'id' path component presumably means Indonesian embeddings — confirm.
w2v = Word2Vec.load('../data/id/id.bin')

# Import Training Data

In [9]:
def tagged_to_iob(tagged):
    """
    Convert flat entity tags into IOB tags.

    `tagged` = list of pairs [(w1, tag1), ...] where a token outside any
    entity carries the tag 'OTHER', e.g.: OTHER, POI, POI, OTHER, STREET
    Returns a list of pairs [(w1, iob1), ...] in IOB notation, e.g.:
    O, B-POI, I-POI, O, B-STREET
    """
    result = []
    previous = None  # tag of the preceding token; nothing precedes the first one
    for word, tag in tagged:
        if tag == 'OTHER':
            label = 'O'
        else:
            # A tag continues an entity only when the token right before it
            # carried the very same tag; otherwise a new entity begins.
            prefix = "I-" if previous == tag else "B-"
            label = prefix + tag
        result.append((word, label))
        previous = tag
    return result

def iob_to_tagged(iob_tagged):
    """
    Convert IOB tags back into flat entity tags.

    `iob_tagged` = list of pairs [(w1, iob1), ...], e.g.: O, B-POI, I-POI
    Returns a list of pairs [(w1, tag1), ...] where the "B-"/"I-" prefix is
    dropped and every tag without a hyphen becomes 'OTHER', e.g.:
    OTHER, POI, POI
    """
    tagged = []
    for word, ner in iob_tagged:
        # Split on the FIRST hyphen only, so an entity name that itself
        # contains a hyphen (e.g. "B-SUB-DISTRICT") is kept whole.
        _prefix, hyphen, entity = ner.partition('-')
        tagged.append((word, entity if hyphen else 'OTHER'))
    return tagged

In [10]:
# Load the labelled addresses, indexed by their `id` column.
df = pd.read_csv('../data/train.csv').set_index('id')

# The label column is "<poi>/<street>"; split it into its two halves.
df['poi'], df['street'] = zip(*df['POI/street'].str.split('/'))

# Tokenise the raw address and both label halves with the same tokenizer.
for source_col, token_col in (
    ('raw_address', 'raw_tokens'),
    ('poi', 'poi_tokens'),
    ('street', 'street_tokens'),
):
    df[token_col] = df[source_col].apply(raw_to_tokens)

# Tag every address token using the POI / street tokens, then convert the
# flat tags to IOB notation.
df['tagged_tokens'] = df.apply(
    lambda row: tokens_to_tagged(row.raw_tokens, row.poi_tokens, row.street_tokens),
    axis=1,
)
df['iob_tokens'] = df['tagged_tokens'].apply(tagged_to_iob)

In [11]:
# Peek at a few random rows; seeded like every other sample in this notebook
# so the preview is the same on each run.
df.sample(n=5, random_state=42)

Unnamed: 0_level_0,raw_address,POI/street,poi,street,raw_tokens,poi_tokens,street_tokens,tagged_tokens,iob_tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
218317,"giri jati, pasir endah ujung berung",/giri jati,,giri jati,"[giri, jati,, pasir, endah, ujung, berung]",[],"[giri, jati]","[(giri, STREET), (jati,, OTHER), (pasir, OTHER...","[(giri, B-STREET), (jati,, O), (pasir, O), (en..."
93091,petukangan utara cile raya 17 rt 1 rw 6 12260 ...,/cile raya,,cile raya,"[petukangan, utara, cile, raya, 17, rt, 1, rw,...",[],"[cile, raya]","[(petukangan, OTHER), (utara, OTHER), (cile, S...","[(petukangan, O), (utara, O), (cile, B-STREET)..."
277299,"mata iii, 256 mekar jaya sukmajaya",/mata iii,,mata iii,"[mata, iii,, 256, mekar, jaya, sukmajaya]",[],"[mata, iii]","[(mata, STREET), (iii,, OTHER), (256, OTHER), ...","[(mata, B-STREET), (iii,, O), (256, O), (mekar..."
34261,bet jaya 46 rt 7 14 tugu utara koja,/,,,"[bet, jaya, 46, rt, 7, 14, tugu, utara, koja]",[],[],"[(bet, OTHER), (jaya, OTHER), (46, OTHER), (rt...","[(bet, O), (jaya, O), (46, O), (rt, O), (7, O)..."
12492,nerada estate blok a6 15,nerada estate/,nerada estate,,"[nerada, estate, blok, a6, 15]","[nerada, estate]",[],"[(nerada, POI), (estate, POI), (blok, OTHER), ...","[(nerada, B-POI), (estate, I-POI), (blok, O), ..."


# Train/Validation Split

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
# Display both shapes as a quick check of the split sizes.
train_df.shape, valid_df.shape

((240000, 9), (60000, 9))

# 1. Train Random Forest with IOB Tokens (Without Word Embedding)

In [5]:
%%time
# Feature dicts are vectorised by DictVectorizer and fed to a random forest.
pipeline_1 = Pipeline(steps=[
    ('vectorizer', DictVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Candidate hyper-parameters for the forest; the `classifier__` prefix routes
# each entry to the pipeline step of that name.
params_1 = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15, 20, 25],
    'classifier__min_samples_split': [2, 4, 8, 16, 32],
    'classifier__min_samples_leaf': [1, 10, 100],
}

# Randomly try 10 of the 225 combinations above on all available cores,
# then refit the best one.
search_1 = RandomizedSearchCV(
    estimator=pipeline_1,
    param_distributions=params_1,
    n_iter=10,
    n_jobs=-1,
    refit=True,
    random_state=42,
)

# No word2vec model is passed, matching this section's "without word embedding" setup.
model_1 = ScikitClassifier(search=search_1)

CPU times: user 158 µs, sys: 7 µs, total: 165 µs
Wall time: 169 µs


In [10]:
%%time
# Uses the full training split; append .sample(n=100, random_state=42) to the
# series for a quick smoke run.
# NOTE(review): presumably this featurises the data and stores it on the model,
# since fit() is later called without arguments — confirm in src/models/scikit_classifier.py.
model_1.transform(train_df['iob_tokens'])

CPU times: user 26.5 s, sys: 309 ms, total: 26.8 s
Wall time: 27 s


In [11]:
%%time
# Train the model; the recorded run took about an hour of wall time.
# NOTE(review): presumably this runs search_1 on the data prepared by transform() — confirm.
model_1.fit()
# Display the resulting estimator.
model_1.clf



CPU times: user 59min 33s, sys: 59.4 s, total: 1h 33s
Wall time: 1h 2min 35s


Pipeline(steps=[('vectorizer', DictVectorizer()),
                ('classifier',
                 RandomForestClassifier(max_depth=5, min_samples_split=16,
                                        random_state=42))])

In [12]:
# Save the model to disk so the long fit does not have to be repeated.
model_1.save_model('../model/scikit_rf_iob.joblib')

In [6]:
# Load the previously saved model from disk.
# NOTE(review): presumably this restores model_1.clf, which the cells below read — confirm.
model_1.load_model('../model/scikit_rf_iob.joblib')

In [7]:
# Ten most important features of the forest, labelled with the vectorizer's feature names.
(
    pd.DataFrame(
        {'importance': model_1.clf['classifier'].feature_importances_},
        index=model_1.clf['vectorizer'].feature_names_,
    )
    .sort_values('importance', ascending=False)
    .head(10)
)

Unnamed: 0,importance
prev-iob=I-STREET,0.023631
prev-word=raya,0.021873
word=haji,0.02059
prev-word=__START1__,0.019358
shape=ending-dot,0.016781
word=5,0.016392
word=jl.,0.016167
shape=other,0.01342
word=gg.,0.013408
prev-shape=lowercase,0.013188


In [13]:
%%time
# Evaluate on a fixed 100-row sample of the validation split (copied so new columns can be added).
pred_df = valid_df.sample(n=100, random_state=42).copy()
# Predict IOB tags from the raw tokens, then strip the IOB prefixes again.
pred_df['pred_iob'] = model_1.parallelize_predict(pred_df['raw_tokens'])
pred_df['pred_tagged'] = pred_df['pred_iob'].apply(iob_to_tagged)
# NOTE(review): tagged_to_poi_street presumably rebuilds the "<poi>/<street>" label string — confirm in src/data.
pred_df['pred_POI/street'] = pred_df['pred_tagged'].apply(tagged_to_poi_street)
# Exact-match accuracy: a row only counts when the whole predicted string equals the label.
accuracy_score(y_true = pred_df['POI/street'], y_pred = pred_df['pred_POI/street'])

CPU times: user 2.49 s, sys: 320 ms, total: 2.81 s
Wall time: 10.7 s


0.13

# 2. Train Random Forest with IOB Tokens (With Word Embedding)

In [14]:
%%time
# Feature dicts are vectorised by DictVectorizer and fed to a random forest.
pipeline_2 = Pipeline(steps=[
    ('vectorizer', DictVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Candidate hyper-parameters for the forest; the `classifier__` prefix routes
# each entry to the pipeline step of that name.
params_2 = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15, 20, 25],
    'classifier__min_samples_split': [2, 4, 8, 16, 32],
    'classifier__min_samples_leaf': [1, 10, 100],
}

# Randomly try 5 of the 225 combinations above on all available cores,
# then refit the best one.
search_2 = RandomizedSearchCV(
    estimator=pipeline_2,
    param_distributions=params_2,
    n_iter=5,
    n_jobs=-1,
    refit=True,
    random_state=42,
)

# The Word2Vec model is passed in, matching this section's "with word embedding" setup.
model_2 = ScikitClassifier(word2vec=w2v, search=search_2)

CPU times: user 127 µs, sys: 0 ns, total: 127 µs
Wall time: 131 µs


In [15]:
%%time
# NOTE(review): only a 100-row sample of the training split is used here, whereas
# section 1 passes the whole split, so the two models' scores are not directly comparable.
model_2.transform(train_df['iob_tokens'].sample(n=100, random_state=42))

CPU times: user 243 ms, sys: 25.3 ms, total: 268 ms
Wall time: 263 ms


In [16]:
%%time
# Train the model; the recorded run took under five minutes of wall time.
# NOTE(review): presumably this runs search_2 on the data prepared by transform() — confirm.
model_2.fit()
# Display the resulting estimator.
model_2.clf

CPU times: user 4min 35s, sys: 2.2 s, total: 4min 37s
Wall time: 4min 40s


Pipeline(steps=[('vectorizer', DictVectorizer()),
                ('classifier',
                 RandomForestClassifier(max_depth=25, min_samples_split=4,
                                        n_estimators=200, random_state=42))])

In [None]:
# Save the embedding-based model to disk so the fit does not have to be repeated.
model_2.save_model('../model/scikit_rf_iob_embed.joblib')

In [None]:
# Load the previously saved embedding-based model from disk.
# NOTE(review): presumably this restores model_2.clf, which the cells below read — confirm.
model_2.load_model('../model/scikit_rf_iob_embed.joblib')

In [18]:
# Ten most important features of the forest, labelled with the vectorizer's feature names.
(
    pd.DataFrame(
        {'importance': model_2.clf['classifier'].feature_importances_},
        index=model_2.clf['vectorizer'].feature_names_,
    )
    .sort_values('importance', ascending=False)
    .head(10)
)

Unnamed: 0,importance
prev-iob=O,0.159476
prev-iob=B-STREET,0.074444
prev-iob=I-STREET,0.063746
prev-iob=B-POI,0.035949
prev-iob=I-POI,0.027495
shape=ending-dot,0.005048
word-vec-111,0.004106
prev-shape=ending-dot,0.003487
word-vec-264,0.003281
prev-shape=wildcard,0.003277


In [17]:
%%time
# Evaluate on a fixed 100-row sample of the validation split (copied so new columns can be added).
pred_df = valid_df.sample(n=100, random_state=42).copy()
# Predict IOB tags from the raw tokens, then strip the IOB prefixes again.
pred_df['pred_iob'] = model_2.parallelize_predict(pred_df['raw_tokens'])
pred_df['pred_tagged'] = pred_df['pred_iob'].apply(iob_to_tagged)
# NOTE(review): tagged_to_poi_street presumably rebuilds the "<poi>/<street>" label string — confirm in src/data.
pred_df['pred_POI/street'] = pred_df['pred_tagged'].apply(tagged_to_poi_street)
# Exact-match accuracy: a row only counts when the whole predicted string equals the label.
accuracy_score(y_true = pred_df['POI/street'], y_pred = pred_df['pred_POI/street'])

CPU times: user 17.4 s, sys: 1.46 s, total: 18.8 s
Wall time: 22.1 s


0.12