# ABOUT:
- convert to required format and submit

## load predictions
- these answer spans were generated from two NLP tasks
    1. text classification
        - we classify whether a POI and a street exist in the address
            1. if they don't, we can just return an empty string ""
            2. if they exist, the QA model will find the span
    2. Question answering task 
        - we asked the QA model these questions
            1. what is the POI? 
            2. what is the street name?

In [1]:
import pandas as pd
# Read the QA predictions indexed by `id`, drop the first remaining column
# (presumably a leftover saved index -- confirm against the CSV), and replace
# missing POI/street spans with empty strings.
df = (
    pd.read_csv(
        r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\predictions\QA predictions 1.csv",
        index_col="id",
    )
    .iloc[:, 1:]
    .assign(
        POI_span=lambda frame: frame["POI_span"].fillna(""),
        street_span=lambda frame: frame["street_span"].fillna(""),
    )
)
df

Unnamed: 0_level_0,raw_address,POI_span,street_span
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,s. par 53 sidanegara 4 cilacap tengah,,s. par
1,"angg per, baloi indah kel. lubuk baja",,angg per
2,"asma laun, mand imog,",asma laun,mand imog
3,"ud agung rej, raya nga sri wedari karanganyar",ud agung rej,raya nga sri
4,"cut mutia, 35 baiturrahman",,cut mutia
...,...,...,...
49995,toko mbak farid semboro semboro,toko mbak farid,
49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari
49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio
49998,graha indah pamulang jl. mujair raya bambu apu...,graha indah,jl. mujair raya


## load mappings
- these mappings are one to one
- they were created from training set
- if they are seen again in the test set, we replace the abbreviated answer with the correct answer

In [2]:
import pickle
# Load the abbreviation -> full-form dictionary from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
mappings_path = r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\datasets\Abbr_to_Norm_mappings.pickle"
with open(mappings_path, mode="rb") as pickle_file:
    mappings = pickle.load(pickle_file)

- mappings = {'kah nasu': 'kaharu nasu',
 'ahmad dah iv': 'ahmad dahlan iv',
 'taman mer': 'taman meruya',
 'pahl': 'pahlawan',
 'yos suda': 'yos sudarso'....}

In [3]:
def adjust_span(span, lookup=None):
    """Return the mapped replacement for ``span`` if one exists, else ``span``.

    Parameters
    ----------
    span : str
        Answer span to look up. An empty string is always returned unchanged,
        even if the mapping happens to contain an entry for it.
    lookup : dict, optional
        Mapping from a span to its replacement. When omitted, the
        notebook-level ``mappings`` dictionary is used, which is what the
        function did before this parameter existed.

    Returns
    -------
    The value stored under ``span`` in the mapping, or ``span`` itself when
    it is empty or has no entry.
    """
    if lookup is None:
        # Resolved at call time so the function picks up whatever `mappings`
        # is currently loaded in the notebook.
        lookup = mappings
    if span == "":
        return span
    return lookup.get(span, span)

## map to new answer spans
- using the mappings, we replace the abbreviated answer with the correct answer

In [4]:
# Run every predicted span through adjust_span and store the results in new
# columns next to the untouched originals.
df["adjusted_POI_span"] = df["POI_span"].apply(adjust_span)
df["adjusted_street_span"] = df["street_span"].apply(adjust_span)

In [5]:
# Show only the rows whose POI span differs from its adjusted version.
df.loc[df["POI_span"].ne(df["adjusted_POI_span"])]

Unnamed: 0_level_0,raw_address,POI_span,street_span,adjusted_POI_span,adjusted_street_span
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,"ud agung rej, raya nga sri wedari karanganyar",ud agung rej,raya nga sri,ud agung rejeki,raya nga sri
36,"m. t. hary, no 11 bank neg indonesia kali rejo...",bank neg indonesia,m. t. hary,bank negara indonesia,m. t. hary
145,"agen payt,",agen payt,,agen paytren,
168,"bank rak indonesia, raya pengas, pengasinan sa...",bank rak indonesia,raya pengas,bank rakyat indonesia,raya pengas
215,"toko gemi, banyuputih",toko gemi,,toko gemilang,
...,...,...,...,...,...
49454,"toko kelon, raya mauk, kosambi",toko kelon,raya mauk,toko kelontong,raya mauk
49569,"bakso pan, r. a. kart, tunggalpager",bakso pan,r. a. kart,bakso pandji,r. a. kart
49581,"hayam wuruk c 99 hara jaya toko, rw 5 maphar t...",hara jaya toko,hayam wuruk,harapan jaya toko,hayam wuruk
49878,"war cinta, giri,",war cinta,giri,warnet cinta,giri


In [6]:
# Show only the rows whose street span differs from its adjusted version.
df.loc[df["street_span"].ne(df["adjusted_street_span"])]

Unnamed: 0_level_0,raw_address,POI_span,street_span,adjusted_POI_span,adjusted_street_span
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"angg per, baloi indah kel. lubuk baja",,angg per,,anggrek per
163,"jela sela 15,",,jela sela 15,,jelambar sela 15
448,raya boj ren rawa burung kosambi,,raya boj ren,,raya boj renged
532,jatibening baru jati ben ii rt 2 3 pondok gede,,jati ben ii,,jati bening ii
647,gun singga 17855 cikarang selatan,,gun singga,,gun singgalang
...,...,...,...,...,...
49545,"rm galuh kencana, raya leuwina, tapos",rm galuh kencana,raya leuwina,rm galuh kencana,raya leuwinanggung
49583,padema ii a pademangan timur pademangan,,padema ii a,,pademangan ii a
49671,"smp harad perum bukit gad cis,",smp harad,perum bukit gad cis,smp harad,perumahan bukit gad cis
49759,pradahkali kendal kel. darmo per sela xii no 3...,,darmo per sela xii,,darmo permai sela xii


- (above) very few rows (about 1000) had their answer spans changed

## Generate submissions
- we submit two copies
    1. not adjusted
    2. adjusted using mappings

In [7]:
# Join the adjusted POI and street spans with "/" and keep them as a
# single-column frame named "POI/street", indexed like `df`.
adjusted_labels = df["adjusted_POI_span"] + "/" + df["adjusted_street_span"]
adjusted_df = adjusted_labels.to_frame(name="POI/street")
adjusted_df

Unnamed: 0_level_0,POI/street
id,Unnamed: 1_level_1
0,/s. par
1,/anggrek per
2,asma laun/mand imog
3,ud agung rejeki/raya nga sri
4,/cut mutia
...,...
49995,toko mbak farid/
49996,vie - tk. ridho kids/vete 3 cari
49997,mart dan roti bakar malabar/nasio
49998,graha indah/jl. mujair raya


In [8]:
# Join the raw (unadjusted) POI and street spans with "/" and keep them as a
# single-column frame named "POI/street", indexed like `df`.
raw_labels = df["POI_span"] + "/" + df["street_span"]
not_adjusted_df = raw_labels.to_frame(name="POI/street")
not_adjusted_df

Unnamed: 0_level_0,POI/street
id,Unnamed: 1_level_1
0,/s. par
1,/angg per
2,asma laun/mand imog
3,ud agung rej/raya nga sri
4,/cut mutia
...,...
49995,toko mbak farid/
49996,vie - tk. ridho kids/vete 3 cari
49997,mart dan roti bakar malabar/nasio
49998,graha indah/jl. mujair raya


In [9]:
# Write both submission variants to disk, adjusted first, then not adjusted.
submission_targets = (
    (r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\predictions\adjusted_pred.csv", adjusted_df),
    (r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\predictions\not_adjusted_pred.csv", not_adjusted_df),
)
for csv_path, submission in submission_targets:
    submission.to_csv(csv_path)