# Process Data

In [1]:
# Import required modules
import numpy as np
import pandas as pd
import pickle
import re
import warnings

from tqdm.notebook import tqdm

# Settings
# Display options: cap rendered column text at 80 characters and show up to 200 rows
pd.set_option('max_colwidth', 80)
pd.options.display.max_rows = 200
# NOTE(review): this silences every warning for the whole session, including
# library deprecation/future warnings raised by the cells below
warnings.filterwarnings('ignore')

## Load Data

In [2]:
# Read the training set
df = pd.read_csv('train.csv')

# The 'POI/street' column is split on '/' once; piece 0 becomes `poi`
# and piece 1 becomes `street`
label_parts = df['POI/street'].str.split('/', expand=True)
df['poi'] = label_parts[0]
df['street'] = label_parts[1]

# Whitespace-tokenised copy of the raw address
df['raw_list'] = df['raw_address'].str.split()

## Replace Extensions

### Straightforward Replacements

In [3]:
# Expand known truncations in the raw address text.
# Each pair is (literal text to find, literal replacement); the trailing
# space or comma is part of the text being matched. Pairs are applied in order.
for truncated, expanded in [
    ('bank neg ', 'bank negara '),  # extend "bank neg" to "bank negara"
    ('indon ', 'indonesia '),       # extend "indon" to "indonesia"
    ('indon,', 'indonesia,'),
]:
    # Replacing over the whole column leaves non-matching rows untouched, so a
    # separate str.contains() mask is unnecessary (and a boolean mask would
    # fail on missing values). regex=False pins literal matching because the
    # default of `regex` is not the same across pandas versions.
    df['raw_address'] = df['raw_address'].str.replace(truncated, expanded, regex=False)

# Update raw list so it reflects the expanded addresses
df['raw_list'] = df['raw_address'].str.split()

In [4]:
# Load extensions; the `word` and `ext` columns are read below
poi_ext = pd.read_csv('analysis/poi_et-3.csv')

# Replace word for word: every whole-word occurrence of `word` in the raw
# address is replaced by `ext`, one table row at a time (order matters because
# each pass sees the output of the previous one).
for word, ext in tqdm(zip(poi_ext['word'], poi_ext['ext']), total=poi_ext.shape[0]):
    # The pattern relies on regex syntax (\b word boundaries and a group), so
    # regex=True must be explicit: with a literal default the pattern text
    # would never be found and no replacement would happen.
    # NOTE(review): `word` is interpolated unescaped, so regex metacharacters
    # in the CSV are interpreted as pattern syntax -- confirm that is intended.
    df['raw_address'] = df['raw_address'].str.replace(
        r'\b(' + word + r')\b', ext, regex=True
    )

  0%|          | 0/1165 [00:00<?, ?it/s]

## Prepare Training Data

In [5]:
# Locate each row's POI label inside its raw address.
# Result: one entry per row of `df`, either an `re.Match` (whose start()/end()
# are character offsets into `raw_address`) or None when the label is absent.
poi_matches = []

for poi, address in tqdm(zip(df['poi'], df['raw_address']), total=df.shape[0]):
    # The label is plain text, not a pattern, so it is escaped and searched for
    # literally. The address itself is searched unmodified: escaping the
    # haystack would insert backslashes and shift the reported offsets away
    # from the positions in the real `raw_address` string.
    # An empty label still matches at offset 0 with start == end.
    poi_matches.append(re.search(re.escape(poi), address))

  0%|          | 0/300000 [00:00<?, ?it/s]

In [6]:
# Attach the per-row match results to the frame
df['poi_matches'] = poi_matches

# Boolean mask: True where a match object is present
has_match = df['poi_matches'].notnull()

# Rows with a match (copied so new columns can be added safely)
df_main = df[has_match].copy()

# Rows without a match
df_ext = df[~has_match]

# Start and end character offsets of each match; -1 stands in for a missing match
df_main['poi_start'] = [-1 if m is None else m.start() for m in df_main['poi_matches']]
df_main['poi_end'] = [-1 if m is None else m.end() for m in df_main['poi_matches']]

# The match objects themselves are not needed past this point
df_main = df_main.drop(columns=['poi_matches'])

In [7]:
# Build one (text, {'entities': [...]}) tuple per row of df_main
addresses = df_main.raw_address.values
span_starts = df_main.poi_start.values
span_ends = df_main.poi_end.values

data_train = []
for text, span_start, span_end in tqdm(
    zip(addresses, span_starts, span_ends), total=df_main.shape[0]
):
    # A span is recorded only when it is non-empty (start strictly before end);
    # otherwise the row is kept with an empty entity list
    spans = [(span_start, span_end, 'POI')] if span_start < span_end else []
    data_train.append((text, {'entities': spans}))
print(len(data_train))

  0%|          | 0/270770 [00:00<?, ?it/s]

270770


In [8]:
# Serialise the list of training tuples to disk
with open('training_data/poi-5.pkl', 'wb') as pkl_out:
    pickle.dump(data_train, pkl_out)

## Prepare Test Data

In [9]:
# Load the test set; the `id` and `raw_address` columns are the ones used later
te = pd.read_csv('test.csv')

In [10]:
# Expand known truncations in the test addresses, using the same literal
# (find, replacement) pairs and order as for the training data.
for truncated, expanded in [
    ('bank neg ', 'bank negara '),  # extend "bank neg" to "bank negara"
    ('indon ', 'indonesia '),       # extend "indon" to "indonesia"
    ('indon,', 'indonesia,'),
]:
    # Replacing over the whole column leaves non-matching rows untouched, so a
    # separate str.contains() mask is unnecessary (and a boolean mask would
    # fail on missing values). regex=False pins literal matching because the
    # default of `regex` is not the same across pandas versions.
    te['raw_address'] = te['raw_address'].str.replace(truncated, expanded, regex=False)

# Update raw list
# te['raw_list'] = te.raw_address.str.split()

In [11]:
# Load extensions
# The reload below is disabled, so this cell uses whatever `poi_ext` table is
# already in the notebook namespace.
# poi_ext = pd.read_csv('analysis/poi_ext.csv')

# Replace word for word: every whole-word occurrence of `word` in the raw
# address is replaced by `ext`, one table row at a time.
for word, ext in tqdm(zip(poi_ext['word'], poi_ext['ext']), total=poi_ext.shape[0]):
    # The pattern relies on regex syntax (\b word boundaries and a group), so
    # regex=True must be explicit: with a literal default the pattern text
    # would never be found and no replacement would happen.
    # NOTE(review): `word` is interpolated unescaped, so regex metacharacters
    # in the CSV are interpreted as pattern syntax -- confirm that is intended.
    te['raw_address'] = te['raw_address'].str.replace(
        r'\b(' + word + r')\b', ext, regex=True
    )

  0%|          | 0/1165 [00:00<?, ?it/s]

In [12]:
# Keep only the identifier and the processed address, in that order
te = te.loc[:, ['id', 'raw_address']]

In [13]:
# Write the processed test frame to CSV; index=False omits the row index column
te.to_csv('test_data/test_poi-et3.csv', index=False)