# Purpose

Explore how much of the vocabulary in the train and test addresses is covered by the pre-trained word2vec model from [Kyubyong/wordvectors](https://github.com/Kyubyong/wordvectors).


# Summary

Only about 15–26% of distinct non-numeric words have a word embedding (roughly 15% in train and 26% in test). However, because the covered words occur far more frequently, about 72% of the non-numeric word occurrences in the addresses have a word embedding.

# Packages

In [1]:
import numpy as np
import pandas as pd

In [2]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# Import Word Embedding

In [3]:
# Load the pre-trained word2vec model from the local data directory.
# NOTE: `w2v` is read as a module-level global by `get_overlap` below,
# so this cell must run before any overlap statistics are computed.
w2v = Word2Vec.load('../data/id/id.bin')

# Overlap with Train Data

In [4]:
def get_overlap(raw, keyed_vectors=None):
    """Summarise how the vocabulary of ``raw`` overlaps with a word2vec vocabulary.

    The texts are tokenised with a default ``CountVectorizer``; one row is
    produced per distinct token.

    Parameters
    ----------
    raw : iterable of str
        The raw texts (e.g. a pandas Series of addresses).
    keyed_vectors : optional
        Any object supporting ``word in keyed_vectors``. When omitted, the
        notebook-level model's ``w2v.wv`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns:
        ``word``     - the token,
        ``count``    - total occurrences of the token across all texts,
        ``is_digit`` - whether the token consists of digits only,
        ``in_w2v``   - whether the token is present in the embedding vocabulary.
        Rows are in the vectorizer's feature order, indexed 0..n-1.
    """
    if keyed_vectors is None:
        # Fall back to the globally loaded model so existing calls keep working.
        keyed_vectors = w2v.wv

    vec = CountVectorizer()
    X = vec.fit_transform(raw)

    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and only fall back on older installations.
    # The names are fetched once instead of being rebuilt for every column.
    if hasattr(vec, 'get_feature_names_out'):
        words = list(vec.get_feature_names_out())
    else:
        words = vec.get_feature_names()

    # Feature names are ordered by their vocabulary index, so the default
    # RangeIndex carries the same labels as looking each word up in
    # `vec.vocabulary_` did.
    return pd.DataFrame({
        'word': words,
        # Column sums of the document-term matrix = occurrences per word.
        'count': np.asarray(X.sum(axis=0)).ravel(),
        'is_digit': [w.isdigit() for w in words],
        'in_w2v': [w in keyed_vectors for w in words],
    })

## Train Data

In [5]:
# Load the training set, indexed by the `id` column.
train_df = pd.read_csv('../data/train.csv').set_index('id')
# Preview a few rows; the fixed seed keeps the preview identical across runs.
train_df.sample(n=5, random_state=42)

Unnamed: 0_level_0,raw_address,POI/street
id,Unnamed: 1_level_1,Unnamed: 2_level_1
40423,"humm bird prof. dr. sat, no 18 rw 4 karet kuni...",humming bird/prof. dr. sat
6086,"cinangka pala v, 12 sawangan",/pala v
19821,sertajaya rusa xa 20 rt 32 8 cikarang timur,/rusa xa
18362,jl. rid rais perum pol ui beji timur beji,/jl. rid rais perum polite ui
129215,"badan pendid dan pelatihan transpo darat para,...",badan pendidikan dan pelatihan transportasi da...


In [6]:
# Per-word statistics for the train addresses.
# NOTE: `stats_df` is reassigned further down for the test data, so the cells
# that read it depend on being executed in notebook order.
stats_df = get_overlap(train_df['raw_address'])
stats_df

Unnamed: 0,word,count,is_digit,in_w2v
0,00,42,True,False
1,000,14,True,False
2,001,550,True,False
3,0011,1,True,False
4,0013,1,True,False
...,...,...,...,...
81112,zyzy,1,False,False
81113,zz,6,False,True
81114,zzam,1,False,False
81115,zzira,1,False,False


In [11]:
# Coverage summary for non-numeric words, split by embedding availability:
#   word / word_pct   -> number and share of distinct words
#   count / count_pct -> number and share of total word occurrences
(
    stats_df.loc[~stats_df['is_digit']]
    .groupby('in_w2v')
    .agg({'word': 'count', 'count': 'sum'})
    .assign(
        word_pct=lambda g: g['word'] / g['word'].sum(),
        count_pct=lambda g: g['count'] / g['count'].sum(),
    )
)

Unnamed: 0_level_0,word,count,word_pct,count_pct
in_w2v,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,62436,482278,0.845032,0.282547
True,11450,1224619,0.154968,0.717453


In [12]:
# Sample of non-numeric words that have no embedding in the word2vec model.
# A fixed seed makes the sample reproducible across runs.
stats_df[~stats_df['is_digit'] & ~stats_df['in_w2v']].sample(n=10, random_state=42)

Unnamed: 0,word,count,is_digit,in_w2v
50539,muhibbin,3,False,False
53962,oeray,8,False,False
48246,mdrt,2,False,False
72194,syadeli,1,False,False
73653,tedy,2,False,False
37022,jowo,33,False,False
66063,sb59,1,False,False
52842,niode,2,False,False
63372,roastery,3,False,False
19348,bumijawa,16,False,False


## Test Data

In [13]:
# Load the test set, indexed by the `id` column.
test_df = pd.read_csv('../data/test.csv').set_index('id')
# Preview a few rows; the fixed seed keeps the preview identical across runs.
test_df.sample(n=5, random_state=42)

Unnamed: 0_level_0,raw_address
id,Unnamed: 1_level_1
5872,"puc gad raya, no 54 mranggen"
28593,gg. kuti 2 2a way kandis tanjung senang
37667,bahagia flamb ii no 118 17610 babelan
22055,swa barat xxiii no 37 kebon bawang 12 tanjung ...
30918,kenc wungu ten lll 9 5 karangayu


In [14]:
# Per-word statistics for the test addresses.
# NOTE: this overwrites the `stats_df` computed earlier for the train data;
# the cells below now describe the test set.
stats_df = get_overlap(test_df['raw_address'])
stats_df

Unnamed: 0,word,count,is_digit,in_w2v
0,00,6,True,False
1,000,3,True,False
2,001,71,True,False
3,0012,1,True,False
4,002,96,True,False
...,...,...,...,...
32472,zuri,1,False,False
32473,zuriati,1,False,False
32474,zurna,1,False,False
32475,zyy,1,False,False


In [15]:
# Coverage summary for non-numeric words, split by embedding availability:
#   word / word_pct   -> number and share of distinct words
#   count / count_pct -> number and share of total word occurrences
(
    stats_df.loc[~stats_df['is_digit']]
    .groupby('in_w2v')
    .agg({'word': 'count', 'count': 'sum'})
    .assign(
        word_pct=lambda g: g['word'] / g['word'].sum(),
        count_pct=lambda g: g['count'] / g['count'].sum(),
    )
)

Unnamed: 0_level_0,word,count,word_pct,count_pct
in_w2v,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,21654,80204,0.741779,0.282545
True,7538,203659,0.258221,0.717455


In [16]:
# Sample of non-numeric words that have no embedding in the word2vec model.
# A fixed seed makes the sample reproducible across runs.
stats_df[~stats_df['is_digit'] & ~stats_df['in_w2v']].sample(n=10, random_state=42)

Unnamed: 0,word,count,is_digit,in_w2v
8412,cendol,3,False,False
19459,mebeul,4,False,False
31977,wonokerto,6,False,False
12487,graj,2,False,False
10615,duriangkang,1,False,False
21634,ols,4,False,False
7612,bpsdm,5,False,False
11068,ernawanto,1,False,False
19878,milagros,1,False,False
7795,budoyo,1,False,False
