In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

from difflib import SequenceMatcher
from tqdm.notebook import tqdm

# Display settings: show up to 80 characters per cell and up to 200 rows.
# Fully qualified option names are used because abbreviated names are resolved
# by pattern matching inside pandas and stop working once they become ambiguous.
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_rows', 200)

In [7]:
# Load the training and test sets
tr = pd.read_csv('../train.csv')
te = pd.read_csv('../test.csv')

# The label column holds "<POI>/<street>"; keep the part before the first
# slash as the POI, together with its whitespace-separated tokens
tr['poi'] = tr['POI/street'].str.split('/', expand=True)[0]
tr['poi_list'] = tr.poi.apply(str.split)

# Flag rows whose labelled POI is not a verbatim substring of the raw address.
# Zipping the two columns yields the same booleans as a row-wise
# DataFrame.apply(axis=1) without building a Series object per row.
tr['poi_ext'] = [poi not in raw for poi, raw in zip(tr['poi'], tr['raw_address'])]
tr['raw_list'] = tr.raw_address.str.split()

# Save in separate dataframes
# df_poi = tr[tr.poi_ext].copy().drop(['poi_ext', 'POI/street'], axis=1)

In [8]:
# Function to remove punctuation
# Translation table that deletes every character in string.punctuation; built
# once here instead of on every call, since remove_punc runs once per token.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(s):
    """Return ``s`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` without any character listed in ``string.punctuation``; all
        other characters (letters, digits, whitespace) are kept in order.
    """
    return s.translate(_PUNCT_TABLE)

# Function to check keywords
def check_keyword(l, kw, top):
    """Tell whether ``kw`` occurs in ``l`` while no other entry of ``top`` does.

    Parameters
    ----------
    l : container
        Object tested with the ``in`` operator (e.g. a token list or a string).
    kw : str
        Keyword that must be present.
    top : iterable
        Candidate keywords; every entry different from ``kw`` must be absent
        from ``l``.

    Returns
    -------
    bool
        ``True`` only when ``kw`` is in ``l`` and none of the other ``top``
        entries are.
    """
    competitors = [candidate for candidate in top if candidate != kw]
    return kw in l and not any(other in l for other in competitors)

## Points of Interest

In [9]:
# Extract tokens
# df_poi['raw_list'] = df_poi.raw_address.apply(lambda x: [remove_punc(str(i)) for i in x.split(' ')])
# df_poi['poi_list'] = df_poi.poi.apply(lambda x: [remove_punc(str(i)) for i in x.split(' ')])
# df_poi['super_raw'] = df_poi.raw_address.apply(str.split)

def _split_and_strip(text):
    """Split ``text`` on single spaces and remove punctuation from each piece.

    Splitting on ' ' (rather than on arbitrary whitespace) keeps an empty
    string for every run of consecutive spaces.
    """
    return [remove_punc(str(piece)) for piece in text.split(' ')]


# Working copy of the training frame without the label and flag columns
df_poi = tr.copy().drop(['poi_ext', 'POI/street'], axis=1)

# Punctuation-free tokens next to the plain whitespace-split originals, for
# both the raw address and the labelled POI
df_poi['raw_list'] = df_poi.raw_address.apply(_split_and_strip)
df_poi['raw_list_orig'] = df_poi.raw_address.apply(str.split)
df_poi['poi_list'] = df_poi.poi.apply(_split_and_strip)
df_poi['poi_list_orig'] = df_poi.poi.apply(str.split)
df_poi['super_raw'] = df_poi.raw_address.apply(str.split)

In [10]:
def _best_window(raw_tokens, poi_tokens):
    """Find the run of ``raw_tokens`` that best matches ``poi_tokens``.

    Every contiguous window of ``len(poi_tokens)`` address tokens is scored
    by the mean ``SequenceMatcher`` ratio between its tokens and the POI
    tokens at the same position; the first window with the highest score wins.

    Returns the winning window as a list, or ``None`` when no window exists
    (the POI is empty or has more tokens than the address).
    """
    width = len(poi_tokens)
    n_windows = len(raw_tokens) - width + 1
    if width == 0 or n_windows <= 0:
        return None

    window_scores = [
        np.mean([
            SequenceMatcher(None, raw_tok, poi_tok).ratio()
            for raw_tok, poi_tok in zip(raw_tokens[start:start + width], poi_tokens)
        ])
        for start in range(n_windows)
    ]
    best_start = int(np.argmax(window_scores))
    return raw_tokens[best_start:best_start + width]


all_words = []
all_phrases = []

# Walk the needed columns in lockstep instead of indexing with .iloc per row
rows = zip(df_poi.id, df_poi.super_raw, df_poi.raw_list, df_poi.poi_list)
for row_id, raw_split, raw_tokens, poi_tokens in tqdm(rows, total=len(df_poi)):
    matched_seq = _best_window(raw_tokens, poi_tokens)
    if matched_seq is None:
        # No window of the POI's length fits in this address; skipping the row
        # avoids np.argmax failing on an empty score list.
        continue

    # Phrase-level record: the matched address tokens and the POI tokens
    all_phrases.append({
        'idx': row_id,
        'raw': raw_split,
        'orig': matched_seq,
        'repl': poi_tokens
    })

    # Word-level records: position-wise (address token, POI token) pairs
    all_words.extend(
        {'orig': raw_tok, 'repl': poi_tok}
        for raw_tok, poi_tok in zip(matched_seq, poi_tokens)
    )

  0%|          | 0/300000 [00:00<?, ?it/s]

In [11]:
# Phrase-level and word-level match tables, both ordered by the matched
# (original) side and re-indexed from zero
poip = (
    pd.DataFrame(all_phrases)
    .sort_values('orig')
    .reset_index(drop=True)
)
poiw = (
    pd.DataFrame(all_words)
    .sort_values('orig')
    .reset_index(drop=True)
)

# Space-joined string versions of the token lists
poip['orig_full'] = poip.orig.map(' '.join)
poip['repl_full'] = poip.repl.map(' '.join)

## Programmatic Check

In [12]:
# Discard pairs whose replacement token is empty
poiw = poiw[poiw.repl != '']

# Pairs where the address token differs from the POI token
was_changed = poiw.orig != poiw.repl
poiw_ext = poiw[was_changed]

# Per address token: number of occurrences and number of distinct replacements;
# keep the tokens that map to more than one distinct replacement.
# NOTE(review): a token that is always replaced by one single word has
# unique_ext == 1 and is dropped here - confirm this is intended.
per_word = poiw.groupby('orig').count()
per_word['unique_ext'] = poiw.groupby('orig').repl.nunique()
df_ext = per_word[per_word.unique_ext > 1].reset_index()

# Occurrence counts split into changed / unchanged pairs
ext = poiw_ext.groupby('orig').count()
no_ext = poiw[~was_changed].groupby('orig').count()

# Attach both counts; the first merge collides on 'repl', which yields the
# 'repl_x' (total) and 'repl_y' (changed) columns renamed below
df_ext = df_ext.merge(
    ext, how='left', left_on='orig', right_index=True
).rename(columns={'orig': 'word', 'repl_x': 'count', 'repl_y': 'ext'})
df_ext = df_ext.merge(
    no_ext, how='left', left_on='word', right_index=True
).rename(columns={'repl': 'no_ext'})

# Tokens that never appear unchanged have no match in no_ext: count them as 0
df_ext['no_ext'] = df_ext['no_ext'].fillna(0)

# Share of occurrences in which the token was changed
df_ext['ext_rate'] = df_ext['ext'] / df_ext['count']

# Drop the empty token
df_ext = df_ext[df_ext.word != '']
# df_ext = df_ext[(df_ext.ext_rate > 0.6) & (df_ext['count'] >= 5)].sort_values('count', ascending=False).reset_index(drop=True)

In [13]:
# Group the changed pairs once so each word's replacements can be fetched
# directly instead of rescanning the whole frame with a boolean mask per word.
# get_group raises KeyError for a word that has no changed pair at all.
repl_by_word = poiw_ext.groupby('orig').repl

# For every candidate word keep its most frequent replacement, together with
# that replacement's share ('prob') and absolute count ('freq')
replacements = []
for word in tqdm(df_ext.word):
    freqs = repl_by_word.get_group(word).value_counts()
    probs = freqs / freqs.sum()
    replacements.append({
        'word': word,
        'ext': freqs.index[0],
        'prob': probs.values[0],
        'freq': freqs.values[0],
    })

  0%|          | 0/4700 [00:00<?, ?it/s]

In [14]:
# Add extension, probability, and frequency
# Attach the most frequent replacement with its probability and frequency.
# Both frames carry an 'ext' column: the count becomes 'n_ext', the
# replacement word keeps the name 'ext'.
df_ext_probs = (
    df_ext
    .merge(pd.DataFrame(replacements), how='left', on='word')
    .rename(columns={'ext_x': 'n_ext', 'ext_y': 'ext'})
)

# Net gain: changed occurrences weighted by the top replacement's share,
# minus the occurrences where the word was left unchanged
weighted_hits = df_ext_probs.n_ext * df_ext_probs.prob
df_ext_probs['net'] = weighted_hits - df_ext_probs.no_ext

# Keep words with a positive net gain whose top replacement occurs > 2 times
is_useful = (df_ext_probs.net > 0) & (df_ext_probs.freq > 2)
df_ext_filtered = df_ext_probs[is_useful].reset_index(drop=True)
df_ext_filtered

Unnamed: 0,word,count,unique_ext,n_ext,no_ext,ext_rate,ext,prob,freq,net
0,acad,5,2,5,0.0,1.000000,academy,0.800000,4,4.0
1,acces,4,2,3,1.0,0.750000,accesoris,1.000000,3,2.0
2,access,30,4,27,3.0,0.900000,accessories,0.888889,24,21.0
3,ach,17,3,14,3.0,0.823529,achmad,0.857143,12,9.0
4,adip,6,3,6,0.0,1.000000,adipura,0.500000,3,3.0
...,...,...,...,...,...,...,...,...,...,...
1160,yaya,160,2,158,2.0,0.987500,yayasan,1.000000,158,156.0
1161,yoha,4,2,3,1.0,0.750000,yohanes,1.000000,3,2.0
1162,zaen,6,3,5,1.0,0.833333,zaenuri,0.600000,3,2.0
1163,zai,17,7,16,1.0,0.941176,zaitun,0.312500,5,4.0


### Final Checks

In [125]:
replace_checks = []

# Report every replacement word that never occurs in any labelled POI.
# Iterating with .items() yields the real index labels, so the printed number
# can be used directly with df_ext_filtered.loc[...]. regex=False makes the
# lookup a literal substring test; na=False treats missing POIs as no match.
candidates = df_ext_filtered.loc[df_ext_filtered.net > 0, 'ext']
for i, word in tqdm(candidates.items(), total=len(candidates)):
    if not tr.poi.str.contains(word, regex=False, na=False).any():
        print(i, word)
        replace_checks.append(word)

  0%|          | 0/1165 [00:00<?, ?it/s]

23 alhasanah
25 alikhlas
26 alikhlash
30 almubarokah


In [119]:
# Show the training row behind the first phrase whose replacement tokens
# include 'alhasanah'. The stored id is used as an index label of tr.
has_token = poip.repl.map(lambda tokens: 'alhasanah' in tokens)
first_id = poip[has_token].idx.iloc[0]
tr.loc[first_id]

id                                      293733
raw_address           mas al-has teh 10, no 46
POI/street            masjid al-hasanah/teh 10
poi                          masjid al-hasanah
poi_list                  [masjid, al-hasanah]
poi_ext                                   True
raw_list       [mas, al-has, teh, 10,, no, 46]
Name: 293733, dtype: object

In [128]:
# Manual corrections for the replacements reported by the check above: their
# hyphen was lost when punctuation was stripped, so both the abbreviation and
# its expansion are restored with the hyphen in place.
# row label -> (extension reported by the check, corrected word, corrected ext)
manual_fixes = {
    23: ('alhasanah', 'al-has', 'al-hasanah'),
    25: ('alikhlas', 'al-ik', 'al-ikhlas'),
    26: ('alikhlash', 'al-ikh', 'al-ikhlas'),
    30: ('almubarokah', 'al-muba', 'al-mubarokah'),
}

for row_label, (reported_ext, fixed_word, fixed_ext) in manual_fixes.items():
    current_ext = df_ext_filtered.loc[row_label, 'ext']
    if current_ext == fixed_ext:
        # Already patched (the cell was re-run): nothing to do
        continue
    if current_ext != reported_ext:
        # The table changed since the labels were read off the check output;
        # refuse to overwrite an unrelated row.
        raise ValueError(
            f"row {row_label}: expected ext {reported_ext!r}, found {current_ext!r}"
        )
    df_ext_filtered.loc[row_label, 'word'] = fixed_word
    df_ext_filtered.loc[row_label, 'ext'] = fixed_ext

In [None]:
# Write the filtered (and manually corrected) table to disk without the index.
# The frame is named df_ext_filtered; df_ext_filter is never defined.
df_ext_filtered.to_csv('poi_et-3.csv', index=False)