In [1]:
%load_ext autoreload
%autoreload 2

%load_ext dotenv
%dotenv

In [2]:
import re

import numpy as np
import pandas as pd
import os
import featuretools as ft
from shl.prepare import normalize_epoch_time, normalize_lat_long, calculate_window, calculate_shift
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import DistanceMetric
from shapely.geometry import Polygon, LineString, Point
from tqdm import tqdm
import geopandas as gpd

from shl.features import WifiFeature
import matplotlib
import folium
import branca.colormap as cm

In [3]:
train_location = normalize_epoch_time(pd.read_parquet('../data/train/Location.parquet'), 'epoch_time')
test_location = normalize_epoch_time(pd.read_parquet('../data/test/Location.parquet'), 'epoch_time')
validate_location = normalize_epoch_time(pd.read_parquet('../data/validate/Location.parquet'), 'epoch_time')
train_label = normalize_epoch_time(pd.read_parquet('../data/train/Label.parquet'), 'epoch_time')
test_label = normalize_epoch_time(pd.read_parquet('../data/test/Label_idx.parquet'), 'epoch_time')
validate_label = normalize_epoch_time(pd.read_parquet('../data/validate/Label.parquet'), 'epoch_time')

In [4]:
feature_points = {
    'London': [
        [51.509865, -0.118092]
    ],
    'Brighton': [
        [50.827778, -0.152778]
    ],
    'parks': [
        # Hollingbury Golf Course
        [50.85423803467499, -0.12791258170001926],
        # St Ann’s Well Gardens
        [50.829876823789675, -0.15525600010959892],
        # Preston Park
        [50.839694335541274, -0.1462660790420134],
        # Waterwall conservation area
        [50.8659, -0.1750],
        # Withdean park
        [50.8546, -0.1497],
        # Stanmer park
        [50.8678, -0.0968],
        # Upper Lodge Wood
        [50.8751, -0.1177],
        # Pudding bag
        [50.8710, -0.1161],
        # Great Wood
        [50.8653, -0.1036],
        # Grubbings
        [50.8729, -0.0971],
        # Milbark wood
        [50.8783, -0.0982],
        # High park wood
        [50.8849, -0.1078],
        # Green broom
        [50.8833, -0.1107],
        # Moon's plantations
        [50.8774, -0.0840],
        # Friston forest
        [50.7783, 0.1894],
        # Malthouse wood
        [51.0042, -0.2044],
        # Bedgebury forest
        [51.0694, 0.4649],
        # Swinley forest
        [51.3726, -0.7292],
        # Crowthore wood
        [51.3808, -0.7598],
        # Queen Elizabeth Country Parh
        [50.9651, -0.9695],
        # Hurth wood
        [51.1854, -0.4278],
        # Winterfold wood
        [51.1764, -0.4564],
        # Queen's park
        [50.8249, -0.1248],
    ],
    'shopping malls': [

    ],
    'stations': [

    ]
}

def calculate_minimal_distance(data: pd.DataFrame, points):
    dist = DistanceMetric.get_metric('haversine')
    dist_matrix = dist.pairwise(data[['Latitude','Longitude']], points)
    return dist_matrix.min(axis=1)

def create_point_distance_features(data: pd.DataFrame):
    features = data[['epoch_time_id']]
    for name, points in feature_points.items():
        if len(points) > 0:
            if type(points[0]) == list: 
                features[f'distance_{name}'] = calculate_minimal_distance(data, points)
            elif type(points[0]) == dict:
                for point in points:
                    point_name, point_loc = list(point.keys())[0], list(point.values())[0]
                    features[f'distance_{name}_{point_name}'] = calculate_minimal_distance(data, [point_loc])

    return features

settings = {
    'fill_limit': 30,
    'window_sizes': [60, 300, 600],
    'window_center': True,
    # 'window_functions': ['mean', 'std'],
    'columns': ['distance_London', 'distance_Brighton', 'distance_parks'],
    'functions': ['mean', 'std', 'median'],
}
shift_settings = {
    'periods': [60, 300, 600],
    'columns_patterns': ['window_'],
    'fill_limit': 30,
}
train_distances_to_points_with_windows = calculate_shift(calculate_window(create_point_distance_features(train_location), **settings), **shift_settings)
display(train_distances_to_points_with_windows)

train_distances_to_points_with_windows.to_parquet('../data/train/features_distances.parquet')
calculate_shift(calculate_window(create_point_distance_features(test_location), **settings), **shift_settings).to_parquet('../data/test/features_distances.parquet')
calculate_shift(calculate_window(create_point_distance_features(validate_location), **settings), **shift_settings).to_parquet('../data/validate/features_distances.parquet')

Unnamed: 0,epoch_time_id,distance_London,distance_Brighton,distance_parks,distance_London_window_60_mean,distance_London_window_60_std,distance_London_window_60_median,distance_Brighton_window_60_mean,distance_Brighton_window_60_std,distance_Brighton_window_60_median,...,distance_parks_window_300_median_shift_60_future,distance_London_window_600_mean_shift_60_future,distance_London_window_600_std_shift_60_future,distance_London_window_600_median_shift_60_future,distance_Brighton_window_600_mean_shift_60_future,distance_Brighton_window_600_std_shift_60_future,distance_Brighton_window_600_median_shift_60_future,distance_parks_window_600_mean_shift_60_future,distance_parks_window_600_std_shift_60_future,distance_parks_window_600_median_shift_60_future
0,1490431658000,0.665419,0.023635,0.010603,0.666419,0.000388,0.666553,0.022820,0.000467,0.022708,...,,,,,,,,,,
1,1490431660000,0.665422,0.023632,0.010606,0.666419,0.000388,0.666553,0.022820,0.000467,0.022708,...,,,,,,,,,,
2,1490431661000,0.665432,0.023621,0.010617,0.666419,0.000388,0.666553,0.022820,0.000467,0.022708,...,,,,,,,,,,
3,1490431662000,0.665444,0.023613,0.010628,0.666419,0.000388,0.666553,0.022820,0.000467,0.022708,...,,,,,,,,,,
4,1490431663000,0.665645,0.023557,0.010769,0.666419,0.000388,0.666553,0.022820,0.000467,0.022708,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911104,1499268278000,0.663119,0.025111,0.008683,0.663099,0.000012,0.663093,0.025117,0.000055,0.025126,...,,,,,,,,,,
911105,1499268279000,0.663119,0.025111,0.008683,0.663099,0.000012,0.663093,0.025117,0.000055,0.025126,...,,,,,,,,,,
911106,1499268280000,0.663119,0.025111,0.008683,0.663099,0.000012,0.663093,0.025117,0.000055,0.025126,...,,,,,,,,,,
911107,1499268281000,0.663119,0.025111,0.008683,0.663099,0.000012,0.663093,0.025117,0.000055,0.025126,...,,,,,,,,,,


In [5]:
railways = gpd.read_file('../additional_data/railways.json')
display(railways)

Unnamed: 0,geometry
0,"LINESTRING (-0.14132 50.82902, -0.14151 50.830..."
1,"LINESTRING (-0.93849 50.85162, -0.93633 50.851..."
2,"LINESTRING (-0.14087 50.82901, -0.14089 50.829..."
3,"LINESTRING (0.27687 50.80288, 0.27665 50.80432..."
4,"LINESTRING (-0.14108 50.82902, -0.14104 50.829..."
5,"LINESTRING (-0.16576 51.23988, -0.16623 51.239..."
6,"LINESTRING (-0.17003 51.23156, -0.16928 51.233..."
7,"LINESTRING (-0.16563 51.23996, -0.15608 51.255..."
8,"LINESTRING (-0.08851 51.38370, -0.08783 51.384..."
9,"LINESTRING (-0.07169 51.40712, -0.07087 51.406..."


In [6]:
subway = gpd.read_file('../additional_data/subway.json')
display(subway)

Unnamed: 0,geometry
0,"LINESTRING (0.07963 51.54034, 0.07729 51.54146..."
1,"LINESTRING (0.01027 51.51106, 0.00955 51.51240..."
2,"LINESTRING (-0.18799 51.57114, -0.19263 51.572..."
3,"LINESTRING (-0.23859 51.58982, -0.24056 51.590..."
4,"LINESTRING (-0.30260 51.61922, -0.30124 51.617..."
5,"LINESTRING (-0.37130 51.57519, -0.36957 51.574..."
6,"LINESTRING (-0.37130 51.57515, -0.36744 51.574..."
7,"LINESTRING (-0.22702 51.49365, -0.22505 51.493..."
8,"LINESTRING (-0.19207 51.48249, -0.19288 51.481..."
9,"LINESTRING (0.09334 51.60333, 0.09079 51.59488..."


In [7]:
feature_lines = {
    'subway': gpd.read_file('../additional_data/subway.json'),
    'railways': gpd.read_file('../additional_data/railways.json'),
}

def calculate_min_distance_to_lines(point: Point, lines: gpd.GeoDataFrame):
    return min(map(point.distance, lines.loc[:, 'geometry']))

def create_distance_to_lines_features(data: pd.DataFrame):
    features = {
        'epoch_time_id': data['epoch_time_id'],
    }
    for name, lines in feature_lines.items():
        distances = []
        for _, row in tqdm(data.iterrows(), total=data.shape[0]):
            distances.append(calculate_min_distance_to_lines(Point(row['Longitude'], row['Latitude']), lines))
        features[f'distance_{name}'] = distances

    return pd.DataFrame(features)

In [8]:
settings = {
    'fill_limit': 30,
    'window_sizes': [60, 300, 600],
    'window_center': True,
    # 'window_functions': ['mean', 'std'],
    'columns': ['distance_subway', 'distance_railways'],
    'functions': ['mean', 'std', 'median'],
}
shift_settings = {
    'periods': [60, 300, 600],
    'columns_patterns': ['window_'],
    'fill_limit': 30,
}
train_distances_to_lines_with_windows = calculate_shift(calculate_window(create_distance_to_lines_features(train_location), **settings), **shift_settings)
display(train_distances_to_lines_with_windows)

train_distances_to_lines_with_windows.to_parquet('../data/train/train_distances_to_lines_with_windows.parquet')
calculate_shift(calculate_window(create_distance_to_lines_features(test_location), **settings), **shift_settings).to_parquet('../data/test/test_distances_to_lines_with_windows.parquet')
calculate_shift(calculate_window(create_distance_to_lines_features(validate_location), **settings), **shift_settings).to_parquet('../data/validate/validate_distances_to_lines_with_windows.parquet')


100%|██████████| 911109/911109 [12:12<00:00, 1244.14it/s]
100%|██████████| 911109/911109 [11:33<00:00, 1312.88it/s]


Unnamed: 0,epoch_time_id,distance_subway,distance_railways,distance_subway_window_60_mean,distance_subway_window_60_std,distance_subway_window_60_median,distance_railways_window_60_mean,distance_railways_window_60_std,distance_railways_window_60_median,distance_subway_window_300_mean,...,distance_subway_window_300_median_shift_60_future,distance_railways_window_300_mean_shift_60_future,distance_railways_window_300_std_shift_60_future,distance_railways_window_300_median_shift_60_future,distance_subway_window_600_mean_shift_60_future,distance_subway_window_600_std_shift_60_future,distance_subway_window_600_median_shift_60_future,distance_railways_window_600_mean_shift_60_future,distance_railways_window_600_std_shift_60_future,distance_railways_window_600_median_shift_60_future
0,1490431658000,0.581761,0.006968,0.582727,0.000372,0.582794,0.006055,0.000362,0.006051,,...,,,,,,,,,,
1,1490431660000,0.581764,0.006965,0.582727,0.000372,0.582794,0.006055,0.000362,0.006051,,...,,,,,,,,,,
2,1490431661000,0.581773,0.006957,0.582727,0.000372,0.582794,0.006055,0.000362,0.006051,,...,,,,,,,,,,
3,1490431662000,0.581785,0.006946,0.582727,0.000372,0.582794,0.006055,0.000362,0.006051,,...,,,,,,,,,,
4,1490431663000,0.582004,0.006713,0.582727,0.000372,0.582794,0.006055,0.000362,0.006051,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911104,1499268278000,0.579433,0.009272,0.579411,0.000019,0.579412,0.009296,0.000034,0.009282,,...,,,,,,,,,,
911105,1499268279000,0.579433,0.009272,0.579411,0.000019,0.579412,0.009296,0.000034,0.009282,,...,,,,,,,,,,
911106,1499268280000,0.579433,0.009272,0.579411,0.000019,0.579412,0.009296,0.000034,0.009282,,...,,,,,,,,,,
911107,1499268281000,0.579433,0.009272,0.579411,0.000019,0.579412,0.009296,0.000034,0.009282,,...,,,,,,,,,,


100%|██████████| 562565/562565 [07:12<00:00, 1301.40it/s]
100%|██████████| 562565/562565 [07:09<00:00, 1309.72it/s]
100%|██████████| 101524/101524 [01:27<00:00, 1164.95it/s]
100%|██████████| 101524/101524 [01:01<00:00, 1645.16it/s]


In [9]:
train_label.merge(train_distances_to_lines_with_windows, on="epoch_time_id")

Unnamed: 0,epoch_time,label,epoch_time_id,distance_subway,distance_railways,distance_subway_window_60_mean,distance_subway_window_60_std,distance_subway_window_60_median,distance_railways_window_60_mean,distance_railways_window_60_std,...,distance_subway_window_300_median_shift_60_future,distance_railways_window_300_mean_shift_60_future,distance_railways_window_300_std_shift_60_future,distance_railways_window_300_median_shift_60_future,distance_subway_window_600_mean_shift_60_future,distance_subway_window_600_std_shift_60_future,distance_subway_window_600_median_shift_60_future,distance_railways_window_600_mean_shift_60_future,distance_railways_window_600_std_shift_60_future,distance_railways_window_600_median_shift_60_future
0,1490431658000,4,1490431658000,0.581761,0.006968,0.582727,0.000372,0.582794,0.006055,0.000362,...,,,,,,,,,,
1,1490431660000,4,1490431660000,0.581764,0.006965,0.582727,0.000372,0.582794,0.006055,0.000362,...,,,,,,,,,,
2,1490431661000,4,1490431661000,0.581773,0.006957,0.582727,0.000372,0.582794,0.006055,0.000362,...,,,,,,,,,,
3,1490431662000,4,1490431662000,0.581785,0.006946,0.582727,0.000372,0.582794,0.006055,0.000362,...,,,,,,,,,,
4,1490431663000,4,1490431663000,0.582004,0.006713,0.582727,0.000372,0.582794,0.006055,0.000362,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662501,1499267848000,5,1499267848000,0.581049,0.007721,0.581106,0.001277,0.581081,0.007655,0.001293,...,0.579412,0.008349,0.001454,0.009338,0.581817,0.003460,0.579412,0.007197,0.002964,0.009338
662502,1499267849000,5,1499267849000,0.580982,0.007789,0.581046,0.001270,0.581015,0.007715,0.001286,...,0.579411,0.008360,0.001449,0.009339,0.581796,0.003437,0.579411,0.007209,0.002960,0.009339
662503,1499267850000,5,1499267850000,0.580913,0.007859,0.580986,0.001259,0.580947,0.007776,0.001274,...,0.579411,0.008371,0.001443,0.009339,0.581775,0.003414,0.579411,0.007220,0.002954,0.009339
662504,1499267851000,5,1499267851000,0.580845,0.007928,0.580926,0.001244,0.580879,0.007837,0.001260,...,0.579411,0.008382,0.001438,0.009339,0.581755,0.003392,0.579411,0.007232,0.002949,0.009339
