# Process Street Data
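Build `STREET` entity-span training data from `train.csv` and write a word-expanded copy of `test.csv`. (The earlier description here, "Remove postal codes: 3 digits or more", does not correspond to any step in this notebook.)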

In [1]:
# Import required modules
import numpy as np
import pandas as pd
import pickle
import re
import warnings

from tqdm.notebook import tqdm

# Settings
# Widen the pandas display: column width limit of 80, up to 200 rows shown.
pd.set_option('max_colwidth', 80)
pd.options.display.max_rows = 200
# Silence every warning for the rest of the session. Note that this also
# hides any pandas deprecation or chained-assignment warnings that the
# cells below might raise.
warnings.filterwarnings('ignore')

## Load Data

In [2]:
# Load data
df = pd.read_csv('train.csv')

# Separate POI and street: split the combined 'POI/street' label once and
# reuse both parts (column 0 -> poi, column 1 -> street).
poi_street = df['POI/street'].str.split('/', expand=True)
df['poi'] = poi_street[0]
df['street'] = poi_street[1]

# Flag rows whose street label is NOT a literal substring of the raw address
# (the label was "extended" relative to the address text).
df['str_ext'] = pd.Series(
    [
        street not in address
        for street, address in zip(df['street'], df['raw_address'])
    ],
    index=df.index,
    dtype=bool,
)

# Extract non-extended examples only. Take an explicit copy so that the
# columns added to df_main later are written to an independent frame rather
# than to a filtered slice of df.
df_main = df[~df['str_ext']].copy()

## Replace Extensions

In [3]:
# Load extensions: the `word` column is the text to find and the `ext`
# column is the text that replaces it.
str_ext = pd.read_csv('analysis/str_et-1.csv')

# NOTE(review): df_main was derived from df in the previous cell, so the
# replacements below modify df only and do not reach df_main -- confirm that
# this is intended.

# Replace word for word. `word` goes into the pattern unescaped, so it is
# interpreted as a regular expression -- presumably the CSV holds plain
# words; verify if it can contain regex metacharacters.
for word, ext in tqdm(
    zip(str_ext['word'], str_ext['ext']),
    total=str_ext.shape[0],
):
    # regex=True must be explicit: pandas >= 2.0 defaults to literal
    # matching, which would look for the text "\b(word)\b" and silently
    # replace nothing.
    df['raw_address'] = df['raw_address'].str.replace(
        r'\b(' + word + r')\b', ext, regex=True
    )

  0%|          | 0/99 [00:00<?, ?it/s]

## Prepare Training Data

In [4]:
# For every training row, collect the regex match objects locating the street
# label inside the raw address. Rows with an empty street get an empty list.
str_matches = []

for street, address in tqdm(
    zip(df_main['street'], df_main['raw_address']),
    total=df_main.shape[0],
):
    if street == '':
        str_matches.append([])
        continue

    # Always escape the street so it is matched literally. This removes the
    # need for a catch-all `except` around streets that are not valid regular
    # expressions (e.g. unbalanced parentheses).
    pattern = r'\b(' + re.escape(str(street)) + r')\b'

    # Search the address exactly as stored: the match offsets are later used
    # as character spans into raw_address, so the haystack must not be
    # escaped (escaping inserts backslashes and shifts every offset).
    str_matches.append(list(re.finditer(pattern, address)))

  0%|          | 0/282613 [00:00<?, ?it/s]

In [5]:
# Store the per-row match lists alongside the data, plus how many matches
# each row produced.
df_main['str_matches'] = str_matches
df_main['str_lens'] = df_main['str_matches'].map(len)

In [6]:
# Build the full dataset: one (raw_address, {'entities': [...]}) tuple per
# row, where each entity is a (start, end, 'STREET') character span.
data_train = []

for address, matches in tqdm(
    zip(df_main['raw_address'], df_main['str_matches']),
    total=df_main.shape[0],
):
    # Keep only non-empty spans; rows without matches yield an empty list.
    entities = [
        (found.start(), found.end(), 'STREET')
        for found in matches
        if found.start() < found.end()
    ]
    data_train.append((address, {'entities': entities}))

  0%|          | 0/282613 [00:00<?, ?it/s]

In [7]:
# Serialize the training examples to disk.
with open('training_data/street-et1.pkl', 'wb') as out_file:
    pickle.dump(data_train, out_file)

## Prepare Test Data

In [8]:
# Load the test set; the cells below use its 'id' and 'raw_address' columns.
te = pd.read_csv('test.csv')

In [9]:
# Reuse the street extension table (str_ext) loaded earlier in the notebook;
# nothing is loaded in this cell.

# Replace word for word. `word` goes into the pattern unescaped, so it is
# interpreted as a regular expression -- presumably the CSV holds plain
# words; verify if it can contain regex metacharacters.
for word, ext in tqdm(
    zip(str_ext['word'], str_ext['ext']),
    total=str_ext.shape[0],
):
    # regex=True must be explicit: pandas >= 2.0 defaults to literal
    # matching, which would look for the text "\b(word)\b" and silently
    # replace nothing.
    te['raw_address'] = te['raw_address'].str.replace(
        r'\b(' + word + r')\b', ext, regex=True
    )

  0%|          | 0/99 [00:00<?, ?it/s]

In [10]:
# Keep only the identifier and the processed address text.
te = te.loc[:, ['id', 'raw_address']]

In [11]:
# Save the processed test set; index=False leaves the DataFrame index out of
# the file.
te.to_csv('test_data/test_str-et1.csv', index=False)