# ABOUT:
- in the training some answer spans "man kerto v" have a different answer label "man kertoarjo v"
- we want to find these mappings that are one to one so that we can confidently replace them in our final predictions

## read_csv

In [3]:
import pandas as pd
df = pd.read_csv(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\datasets\train.csv\train.csv")

In [4]:
# split 'POI/street' column into 2 individual columns
df[['POI', 'street']] = df['POI/street'].str.split('/', expand=True)
df = df.drop(["POI/street"],axis=1)

In [5]:
# convert POI and street to one columns
# remove all rows without answers
df = pd.concat([df.loc[df.street!='',["raw_address",'street']].rename(columns = {"street":"answer"}),df.loc[df.POI!='',["raw_address",'POI']].rename(columns = {"POI":"answer"})],axis=0)
df.head()

Unnamed: 0,raw_address,answer
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika
2,setu siung 119 rt 5 1 13880 cipayung,siung
4,jl. orde baru,jl. orde baru
5,"raya samb gede, 299 toko bb kids",raya samb gede
6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",kem mel raya


## Find one to one POI/street mappings
- given "man kerto v"
    - it can have many solutions including itself
        1. 'man kertoarjo v'
        2. 'manyar kerto v'
        3. "man kerto v"
- we want to find abbreviated POI/street that map to exactly one correct POI/street 
    e.g "ahmad dah iv" is mapped to only "ahmad dahlan iv" in the training set



In [27]:
import numpy as np
def process(address,answer):
#     if answer in address:                                    
#         return np.nan
    
    address_words = address.replace(",","").split()
    answer_words = answer.split()
    
    for i,word in enumerate(address_words):
        if address_words[i] not in answer_words[0]:
            continue
        break
    
    return " ".join(address_words[i:i+len(answer_words)])

In [28]:
df['answer_in_address'] = df.apply(lambda row: process(row['raw_address'],row['answer']),axis=1)

In [29]:
df

Unnamed: 0,raw_address,answer,answer_in_address
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika,jl kapuk timur delta sili iii lippo cika
2,setu siung 119 rt 5 1 13880 cipayung,siung,siung
4,jl. orde baru,jl. orde baru,jl. orde baru
5,"raya samb gede, 299 toko bb kids",raya samb gede,raya samb gede
6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",kem mel raya,kem mel raya
...,...,...,...
299989,"lapangan bakti, singki', rantepao, kab torut, ...",lapangan bakti,lapangan bakti
299990,"ver, durian bajenis",veruna,ver
299992,kios asmi panaikang,kios asmi,kios asmi
299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri,taman asri


In [30]:
from collections import defaultdict
mappings = defaultdict(set)
def send_to_mappings(row):
    mappings[row.answer_in_address].add(row.answer)
df[~df.answer_in_address.isna()].apply(lambda x: send_to_mappings(x),axis=1)

0         None
2         None
4         None
5         None
6         None
          ... 
299989    None
299990    None
299992    None
299998    None
299999    None
Length: 351348, dtype: object

- (above) many words like "jati" can be mapped to multiple other answers including itself
    - 'jati',
    - 'jatianom',
    - 'jatipuro',
    - 'jatirogo',
    - 'jatiroto',
    - 'jatisam'
- we ignore those

In [35]:
one_to_one_mappings = dict()
for k,s in mappings.items():
    v = next(iter(s))
    if len(s)==1 and k!=v:
        one_to_one_mappings[k] = v

In [36]:
(one_to_one_mappings)

{'tam asri gg. 7': 'tam asri, gg. 7',
 'pluit pluit karang jel': 'pluit karang jel 13',
 'ahmad dah iv': 'ahmad dahlan iv',
 'cipinang besar selatan': 'cipinang jaya 1a',
 'gesya gesya': 'gesya resid',
 'sungai beringin': 'sungai beri',
 'mamp 2nd': 'mamp prap',
 'pulori': 'puloriburit',
 'kar utara': 'karang utara',
 'gun anyar lor gg. i': 'gun anyar lor, gg. i',
 'r. wol mongin': 'r. wol monginsidi',
 'raya banjarn - pur': 'raya banjarnegara - purwok',
 'colu ten ii': 'columbus ten ii',
 'bule timur kel gading': 'bulevar timur kel gading',
 'al maqbul (jalan': 'jalan raya kuwolu',
 'kel dua raya gg. mel': 'kel dua raya, gg. mel',
 'jl. mh. thamrin.': 'jl. mh. thamrin',
 'jatij 2 gg. al chas iv': 'jatij 2, gg. al chas iv',
 'prabumu': 'prabumulih-b',
 'pad ii gg. 5': 'pad ii, gg. 5',
 'jl. gay timur mgp': 'jl. gayung timur mgp',
 'p. suryaa': 'p. suryaatmaja',
 'pad ii gg. 1': 'pad ii, gg. 1',
 'pala 2': 'palasari 2',
 'taman crys 1': 'taman crystal 1',
 'jl pluit karang ayu baratpenj

In [38]:
wrong_mappings = dict()

In [40]:
def clean_mappings(mappings):
    wrong_mappings = dict()
    correct_mappings = dict()
    
    for abbr,correct in mappings.items():
        abbr_words = abbr.split()
        correct_words = correct.split()
        red_flag = False
        
        for i in range(len(abbr_words)):
            if abbr_words[i] not in correct_words[i] or len(abbr_words)!=len(correct_words):
                red_flag = True
                break
        if red_flag:
            wrong_mappings[abbr] = correct
        else:
            correct_mappings[abbr] = correct
    return correct_mappings,wrong_mappings

In [41]:
correct_mappings,wrong_mappings = clean_mappings(one_to_one_mappings)

## correct_mappings
- these mappings are one to one 
- if these abbreviations are seen in our predictions we can map them to correct answer

In [42]:
correct_mappings

{'tam asri gg. 7': 'tam asri, gg. 7',
 'ahmad dah iv': 'ahmad dahlan iv',
 'pulori': 'puloriburit',
 'kar utara': 'karang utara',
 'gun anyar lor gg. i': 'gun anyar lor, gg. i',
 'r. wol mongin': 'r. wol monginsidi',
 'raya banjarn - pur': 'raya banjarnegara - purwok',
 'colu ten ii': 'columbus ten ii',
 'bule timur kel gading': 'bulevar timur kel gading',
 'kel dua raya gg. mel': 'kel dua raya, gg. mel',
 'jatij 2 gg. al chas iv': 'jatij 2, gg. al chas iv',
 'prabumu': 'prabumulih-b',
 'pad ii gg. 5': 'pad ii, gg. 5',
 'jl. gay timur mgp': 'jl. gayung timur mgp',
 'p. suryaa': 'p. suryaatmaja',
 'pad ii gg. 1': 'pad ii, gg. 1',
 'pala 2': 'palasari 2',
 'taman crys 1': 'taman crystal 1',
 'kra kecil': 'kramat kecil',
 'muti lest 24': 'muti lestari 24',
 'gran baru vi gg. vi': 'gran baru vi, gg. vi',
 'tukad band xii': 'tukad bandung xii',
 'mage & kutu tegal': 'magelang & kutu tegal',
 'perum sapphire resid': 'perumahan sapphire resid',
 'pad ii gang 17': 'padema ii gang 17',
 'gud pe

In [43]:
len(correct_mappings)

47022

In [44]:
import pickle
with open(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\datasets\Abbr_to_Norm_mappings.pickle","wb") as f:
    pickle.dump(correct_mappings,f)

- below are wrong mappings which we wont use

In [45]:
(wrong_mappings)

{'pluit pluit karang jel': 'pluit karang jel 13',
 'cipinang besar selatan': 'cipinang jaya 1a',
 'gesya gesya': 'gesya resid',
 'sungai beringin': 'sungai beri',
 'mamp 2nd': 'mamp prap',
 'al maqbul (jalan': 'jalan raya kuwolu',
 'jl. mh. thamrin.': 'jl. mh. thamrin',
 'jl pluit karang ayu baratpenjaringanjkt': 'jl pluit karang ayu barat',
 'indra indra jaya gang': 'indra jaya gang ten',
 'ploso kel. ploso': 'ploso timur iv',
 'sira siraman': 'siraman pulu',
 'pulo padang pulo pad': 'pulo pad - mar',
 'l no 17': 'jl. syarifuddin yoes',
 'kalbar': 'jln.ngura',
 'punia)': 'jl. sriwijaya',
 'jalan sabang merahkelurahan': 'jalan sabang merah',
 'jl. boulevard raya gading serpongtangerang': 'jl. boulevard raya gading serpong',
 'dukuh menanggal dukuh': 'dukuh menan ix',
 'pakal pakal sumb baru gg.': 'pakal sumb baru, gg. 1',
 'pluit village': 'pluit indah',
 'ar rafi': 'wari jaya',
 'mas karebet': 'mas raya',
 'm a': 'mang i',
 'a kasu': 'kasu raya',
 'raja raja': 'raja sela',
 'argo kenc