In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

from difflib import SequenceMatcher
from tqdm.notebook import tqdm

# Settings
pd.set_option('max_colwidth', 80)
pd.options.display.max_rows = 200

In [6]:
# Load data
tr = pd.read_csv('../train.csv')
te = pd.read_csv('../test.csv')

# Get POI and street
tr['poi'] = tr['POI/street'].str.split('/', expand=True)[0]
tr['street'] = tr['POI/street'].str.split('/', expand=True)[1]

# Get rows with extended words
tr['str_ext'] = tr.apply(lambda row: not row['street'] in row['raw_address'], axis=1)
tr['raw_list'] = tr.raw_address.str.split()
tr['str_list'] = tr.street.str.split()

# Save in separate dataframes
# df_poi = tr[tr.poi_ext].copy().drop(['poi_ext', 'str_ext', 'POI/street'], axis=1)
# df_str = tr[tr.str_ext].copy().drop(['poi_ext', 'str_ext', 'POI/street'], axis=1)

In [7]:
# Function to remove punctuation
def remove_punc(s):
    exclude = set(string.punctuation)
    table = str.maketrans('', '', string.punctuation)
    return s.translate(table)

# Function to check keywords
def check_keyword(l, kw, top):
    exclude = [w for w in top if w != kw]
    
    if kw in l:
        for exc in exclude:
            if exc in l:
                return False
        return True
    return False

## Points of Interest

In [8]:
# Extract tokens
# df_str['raw_list'] = df_str.raw_address.apply(lambda x: [remove_punc(str(i)) for i in x.split(' ')])
# df_str['str_list'] = df_str.street.apply(lambda x: [remove_punc(str(i)) for i in x.split(' ')])
# df_str['super_raw'] = df_str.raw_address.apply(str.split)

df_str = tr.copy().drop(['str_ext', 'POI/street'], axis=1)
df_str['raw_list'] = df_str.raw_address.apply(lambda x: [remove_punc(str(i)) for i in x.split(' ')])
df_str['str_list'] = df_str.street.apply(lambda x: [remove_punc(str(i)) for i in x.split(' ')])
df_str['super_raw'] = df_str.raw_address.apply(str.split)

In [9]:
all_words = []
all_phrases = []

for i in tqdm(range(df_str.shape[0])):
    x1 = df_str.raw_list.iloc[i]
    x2 = df_str.str_list.iloc[i]
    match_scores = []
    ids = []
    for j in range(0, len(x1)-len(x2)+1):
        start = j
        end = j+len(x2)
        ids.append((start, end))
        
        scores = []
        for s1, s2 in zip(x1[start:end], x2):
            scores.append(SequenceMatcher(None, s1, s2).ratio())
        match_scores.append(np.mean(scores))

    opt = np.argmax(match_scores)

    matched_seq = x1[ids[opt][0]:ids[opt][1]]
    
    all_phrases.append({
        'idx': df_str.id.iloc[i],
        'raw': df_str.super_raw.iloc[i],
        'orig': matched_seq,
        'repl': x2
    })
    
    for k, m in enumerate(matched_seq):
#         if m != x2[k]:
        all_words.append({
            'orig': m,
            'repl': x2[k]
        })

  0%|          | 0/300000 [00:00<?, ?it/s]

In [10]:
strp = pd.DataFrame(all_phrases).sort_values('orig').reset_index(drop=True)
strw = pd.DataFrame(all_words).sort_values('orig').reset_index(drop=True)

strp['orig_full'] = strp.orig.apply(lambda x: ' '.join(x))
strp['repl_full'] = strp.repl.apply(lambda x: ' '.join(x))

# strp['keyword'] = strp.apply(lambda row: [k for k, e in zip(row.orig, row.repl) if k != e], axis=1)
# strp['ext'] = strp.apply(lambda row: [e for k, e in zip(row.orig, row.repl) if k != e], axis=1)
# strp[strp.orig.apply(lambda x: 'a' in x)]
# strp = strp[strp.keyword.apply(len) != 0]

## Programmatic Check

In [11]:
strw = strw[strw.repl != '']

# Get dictionary
strw_ext = strw[strw.orig != strw.repl]

# Filter only words that were extended
df_ext = strw.groupby('orig').count()
df_ext['unique_ext'] = strw.groupby('orig').repl.nunique()
df_ext = df_ext[df_ext.unique_ext>1].reset_index()

# Count extensions and non-extensions
ext = strw[strw.orig != strw.repl].groupby('orig').count()
no_ext = strw[strw.orig == strw.repl].groupby('orig').count()

# Merge extensions and non-extensions
df_ext = df_ext.merge(ext, how='left', left_on='orig', right_index=True).rename(columns={'orig': 'word', 'repl_x': 'count', 'repl_y': 'ext'})
df_ext = df_ext.merge(no_ext, how='left', left_on='word', right_index=True).rename(columns={'repl': 'no_ext'})

# Fill 0 for non-extensions
df_ext['no_ext'] = df_ext.no_ext.fillna(0)

# Compute extension rate
df_ext['ext_rate'] = df_ext.ext / df_ext['count']

# Filter and sort
df_ext = df_ext[df_ext.word != '']

In [12]:
replacements = []
for word in tqdm(df_ext.word):
    freqs = strw_ext[strw_ext.orig == word].repl.value_counts()
    probs = freqs / freqs.sum()
    replacements.append({'word': word, 'ext': freqs.index[0], 'prob': probs.values[0], 'freq': freqs.values[0]})

  0%|          | 0/3207 [00:00<?, ?it/s]

In [15]:
# Add extension, probability, and frequency
df_ext_probs = df_ext.merge(pd.DataFrame(replacements), how='left', on='word').rename(columns={'ext_x': 'n_ext', 'ext_y': 'ext'})

# Calculate net gain
df_ext_probs['net'] = df_ext_probs.n_ext * df_ext_probs.prob - df_ext_probs.no_ext

# Inspect
df_ext_filtered = df_ext_probs[(df_ext_probs.net > 0) & (df_ext_probs.freq > 1)]
df_ext_filtered

Unnamed: 0,word,count,unique_ext,n_ext,no_ext,ext_rate,ext,prob,freq,net
21,abulya,3,2,2,1.0,0.666667,abulyatama,1.0,2,1.0
168,ath,3,2,2,1.0,0.666667,athena,1.0,2,1.0
217,bangba,3,2,2,1.0,0.666667,bangbarung,1.0,2,1.0
235,banyum,6,2,4,2.0,0.666667,banyumulek,1.0,4,2.0
264,bawaka,3,2,2,1.0,0.666667,bawakareng,1.0,2,1.0
335,blamba,5,2,3,2.0,0.6,blambangan,1.0,3,1.0
356,bones,4,2,3,1.0,0.75,bonesinjai,1.0,3,2.0
489,cibole,3,2,2,1.0,0.666667,cibolerang,1.0,2,1.0
491,cibungb,8,2,5,3.0,0.625,cibungbulang,1.0,5,2.0
527,cikaja,8,2,5,3.0,0.625,cikajangc,1.0,5,2.0


### Final Checks

In [19]:
replace_checks = []

# Check extensions
for i, word in enumerate(tqdm(df_ext_filtered.ext)):
    if tr.street.str.contains(word).sum() == 0:
        print(i, word)
        replace_checks.append(word)

  0%|          | 0/73 [00:00<?, ?it/s]

6 bonesinjai
9 cikajangc
11 darehju
13 godongpurwodadi
15 gotriw
19 jatinomb
20 jatinomboyolali
21 juwanar
22 kalijatip
24 kamparban
26 karangpandann
28 kedirike
30 kerungkerung
31 koposoreang
34 magelangyo
36 mamujupalu
37 mancakanyer
41 patigabus
42 penebelt
49 prabumulihbelimbing
60 sentolom
61 singarajag
62 sokorengel
64 sumaterak
68 tubangresik
70 watesngoro
72 wonosobopurworejo


In [33]:
for i, word in enumerate(replace_checks[18:]):
    temp_list = tr.loc[strp[strp.repl.apply(lambda x: word in x)].idx.iloc[0]].loc[['raw_address', 'street']].tolist()
    print(f"[{word}]:   {temp_list[0]}    |    {temp_list[1]}")

[penebelt]:   mie nyon penebel juru peneb    |    juru penebel-t
[prabumulihbelimbing]:   anug ganti oli, prabumulih-b, prabumulih barat    |    prabumulih-belimbing
[sentolom]:   sento, nasgor ibu nicen kalibawang,    |    sentolo-m
[singarajag]:   war makan bu mol, raya singar, sumberkima    |    raya singaraja-g
[sokorengel]:   toko ban suko ban2 raya soko-r, soko    |    raya soko-rengel
[sumaterak]:   koko cen lin sumate, 61b    |    lintas sumatera-k
[tubangresik]:   raya tuban-g, no 4    |    raya tuban-gresik
[watesngoro]:   bengkel mobil, raya wates-n ngoro    |    raya wates-ngoro-
[wonosobopurworejo]:   kan lurah may, raya wonosobo-p kepil    |    raya wonosobo-purworejo


In [34]:
df_ext_filtered.loc[6, 'word'] = 'bone-s'
df_ext_filtered.loc[9, 'word'] = 'cikaja'
df_ext_filtered.loc[11, 'word'] = 'dare'
df_ext_filtered.loc[13, 'word'] = 'god'
df_ext_filtered.loc[15, 'word'] = 'gotr'
df_ext_filtered.loc[19, 'word'] = 'jatin'
df_ext_filtered.loc[20, 'word'] = 'jatinom-b'
df_ext_filtered.loc[21, 'word'] = 'juwa'
df_ext_filtered.loc[22, 'word'] = 'kalija'
df_ext_filtered.loc[24, 'word'] = 'kampar'
df_ext_filtered.loc[26, 'word'] = 'karangpa'
df_ext_filtered.loc[28, 'word'] = 'kedir'
df_ext_filtered.loc[30, 'word'] = 'kerung-'
df_ext_filtered.loc[31, 'word'] = 'kopo-so'
df_ext_filtered.loc[34, 'word'] = 'magela'
df_ext_filtered.loc[36, 'word'] = 'mamuju'
df_ext_filtered.loc[37, 'word'] = 'mancak-'
df_ext_filtered.loc[41, 'word'] = 'pati-g'
df_ext_filtered.loc[42, 'word'] = 'peneb'
df_ext_filtered.loc[49, 'word'] = 'prabumulih-b'
df_ext_filtered.loc[60, 'word'] = 'sento'
df_ext_filtered.loc[61, 'word'] = 'singar'
df_ext_filtered.loc[62, 'word'] = 'soko-r'
df_ext_filtered.loc[64, 'word'] = 'sumate'
df_ext_filtered.loc[68, 'word'] = 'tuban-g'
df_ext_filtered.loc[70, 'word'] = 'wates-n'
df_ext_filtered.loc[72, 'word'] = 'wonosobo-p'

df_ext_filtered.loc[6, 'ext'] = 'bone-sinjai'
df_ext_filtered.loc[9, 'ext'] = 'cikajang-c'
df_ext_filtered.loc[11, 'ext'] = 'dareh-ju'
df_ext_filtered.loc[13, 'ext'] = 'godong-purwodadi'
df_ext_filtered.loc[15, 'ext'] = 'gotri-w'
df_ext_filtered.loc[19, 'ext'] = 'jatinom-b'
df_ext_filtered.loc[20, 'ext'] = 'jatinom-boyolali'
df_ext_filtered.loc[21, 'ext'] = 'juwana-r'
df_ext_filtered.loc[22, 'ext'] = 'kalijati-p'
df_ext_filtered.loc[24, 'ext'] = 'kampar-ban'
df_ext_filtered.loc[26, 'ext'] = 'karangpandan-n'
df_ext_filtered.loc[28, 'ext'] = 'kediri-ke'
df_ext_filtered.loc[30, 'ext'] = 'kerung-kerung'
df_ext_filtered.loc[31, 'ext'] = 'kopo-soreang'
df_ext_filtered.loc[34, 'ext'] = 'magelang-yo'
df_ext_filtered.loc[36, 'ext'] = 'mamuju-palu'
df_ext_filtered.loc[37, 'ext'] = 'mancak-anyer'
df_ext_filtered.loc[41, 'ext'] = 'pati-gabus'
df_ext_filtered.loc[42, 'ext'] = 'penebel-t'
df_ext_filtered.loc[49, 'ext'] = 'prabumulih-belimbing'
df_ext_filtered.loc[60, 'ext'] = 'sentolo-m'
df_ext_filtered.loc[61, 'ext'] = 'singaraja-g'
df_ext_filtered.loc[62, 'ext'] = 'soko-rengel'
df_ext_filtered.loc[64, 'ext'] = 'sumatera-k'
df_ext_filtered.loc[68, 'ext'] = 'tuban-gresik'
df_ext_filtered.loc[70, 'ext'] = 'wates-ngoro-'
df_ext_filtered.loc[72, 'ext'] = 'wonosobo-purworejo'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
df_ext_filtered.to_csv('str_et-1.csv', index=False)