This Python notebook trains a Logistic Regression model to predict whether a given Overture point should or should not be snapped to its nearest building.

In [None]:
!pip install scikit-learn --quiet
!pip install pandas --quiet
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sklearn
import math

In [None]:
# Load the merged dataset from the parquet file next to the notebook.
merged = pd.read_parquet('merged_data.parquet')

In [None]:
# Remove the columns that are not used as model inputs.
merged = merged.drop(
    columns=[
        'name',
        'original_coord',
        'snapped_coord',
        'osm_coord',
        'old_dist',
        'new_dist',
    ]
)

In [None]:
# One column per road type; each holds how many roads of that type would be
# crossed if the point was snapped. All counts start at zero.
road_columns = [
    'bridleway', 'cycleway', 'footway', 'living_street', 'motorway', 'path',
    'pedestrian', 'primary', 'residential', 'secondary', 'service', 'steps',
    'tertiary', 'track', 'trunk', 'unclassified', 'unknown',
]
for road_type in road_columns:
    merged[road_type] = 0

In [None]:
def convert_list_to_columns(row):
    """Write per-tag occurrence counts from ``row['crossed']`` into the row.

    The values in ``row['crossed']`` are counted, and for every distinct tag
    that is also a label of ``row`` the entry under that label is set to the
    tag's count. Tags with no matching label are ignored.

    Args:
        row: A DataFrame row (``pd.Series``) with a ``'crossed'`` entry.

    Returns:
        The same row, with the matching entries overwritten by their counts.
    """
    tally = pd.Series(row['crossed']).value_counts()
    for road_type, occurrences in tally.items():
        if road_type not in row.index:
            # No entry of this name in the row: nothing to fill in.
            continue
        row[road_type] = occurrences
    return row

# Fill the road-type count columns row by row.
merged = merged.apply(convert_list_to_columns, axis='columns')

In [None]:
# Key categories that tend not to have buildings associated with them.
# Each one becomes its own column, initialised to zero.
key_categories = [
    'farm',
    'lake',
    'park',
    'field',
    'campground',
    'river',
]
for category_name in key_categories:
    merged[category_name] = 0

In [None]:
def category_substring(row, categories=None):
    """Flag which key categories appear in this row's category value.

    For every entry of ``categories`` that is contained in ``row['category']``
    (Python ``in``: a substring test when the category is a string), the row
    entry of the same name is incremented by one.

    Args:
        row: A DataFrame row (``pd.Series``) holding a ``'category'`` entry and
            one counter entry per key category.
        categories: Names to look for. Defaults to the notebook-level
            ``key_categories`` list, so ``DataFrame.apply`` can keep calling
            this function with the row alone.

    Returns:
        The same row, with the counters of matching categories incremented.
    """
    if categories is None:
        categories = key_categories

    category = row['category']
    # A missing category (None, or NaN which arrives as a float) cannot be
    # searched with `in`; leave the counters untouched instead of raising.
    if category is None or isinstance(category, float):
        return row

    for name in categories:
        if name in category:
            row[name] += 1

    return row

# Fill the key-category columns row by row.
merged = merged.apply(category_substring, axis='columns')

In [None]:
# The 'y' values: 1 where improved_by_snap is truthy, 0 otherwise.
merged['label'] = merged['improved_by_snap'].apply(
    lambda improved: int(bool(improved))
)

In [None]:
# These columns have been turned into the feature and label columns above.
merged = merged.drop(columns=['category', 'improved_by_snap', 'crossed'])

In [None]:
# Target vector: the label column.
y = merged['label'].values
# Feature matrix: every remaining column except the label and the id.
X = merged.drop(columns=['label', 'id']).values

In [None]:
# Keep the ids aside so the model predictions can later be merged back with
# the original data table (joined on id).
id_col = merged.loc[:, 'id']

In [None]:
# Hold out 25% of the rows for testing. The ids are split alongside X and y so
# each test prediction can be traced back to its row. The seed is fixed so the
# partition, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, id_col, test_size=0.25, random_state=42
)

In [None]:
# Logistic-regression classifier with the iteration limit set to 1000.
model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

# Predict on the held-out rows with the model fitted above. The previous code
# called `logreg.predict`, a name that is never defined in this notebook.
y_pred = model.predict(X_test)

In [None]:
# Accuracy of the predictions on the held-out test rows (shown as cell output).
accuracy_score(y_test, y_pred)

In [None]:
# One row per test example: its id, the true label and the predicted label.
results = pd.DataFrame(
    {'id': id_test, 'actual': y_test, 'prediction': y_pred}
)
# Replace the index inherited from id_test with a fresh 0..n-1 index.
results = results.reset_index(drop=True)

In [None]:
# Create a new table for the test set: reload the original data and attach its
# columns to each test-set result, joining on id and keeping every result row.
original_data = pd.read_parquet('merged_data.parquet')
merged_results = pd.merge(results, original_data, how='left', on='id')

In [None]:
# Distance obtained when following the model: new_dist where the prediction
# is 1, old_dist otherwise. Computed column-wise instead of with a per-row
# Python lambda.
merged_results['model_dist'] = merged_results['new_dist'].where(
    merged_results['prediction'] == 1,
    merged_results['old_dist'],
)

In [None]:
# Print the mean of each distance column, one value per line.
for dist_column in ('old_dist', 'new_dist', 'model_dist'):
    print(merged_results[dist_column].mean())

In [None]:
def _log_ratio_model(row):
    """Return log((old_dist + 0.001) / (model_dist + 0.001)) for one row."""
    numerator = row['old_dist'] + 0.001
    denominator = row['model_dist'] + 0.001
    return math.log(numerator / denominator)

merged_results['log_ratio'] = merged_results.apply(_log_ratio_model, axis=1)

In [None]:
def _log_ratio_all(row):
    """Return log((old_dist + 0.001) / (new_dist + 0.001)) for one row."""
    numerator = row['old_dist'] + 0.001
    denominator = row['new_dist'] + 0.001
    return math.log(numerator / denominator)

merged_results['log_ratio_all'] = merged_results.apply(_log_ratio_all, axis=1)

In [None]:
# Print the mean of each log-ratio column, one value per line.
for ratio_column in ('log_ratio', 'log_ratio_all'):
    print(merged_results[ratio_column].mean())