In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `shl` package) are re-imported before each cell executes
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load the dotenv IPython extension and run its magic.
# NOTE(review): presumably this reads environment variables from a local .env
# file for project code; none of the visible cells read them directly.
%load_ext dotenv
%dotenv

In [2]:
import re

import numpy as np
import pandas as pd
import os
import featuretools as ft
from shl.prepare import normalize_epoch_time, normalize_lat_long, calculate_window, calculate_shift, fillna_agg_by_label
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import DistanceMetric
from sklearn.metrics.pairwise import haversine_distances
from sklearn.metrics import pairwise_distances_argmin_min
from shapely.geometry import Polygon, LineString, Point
from tqdm import tqdm
import geopandas as gpd
import pickle

from shl.features import WifiFeature
import matplotlib
import folium
import branca.colormap as cm

In [3]:
# Read the Location table of every split and pass it through the project's
# `normalize_epoch_time` helper (keyed on the 'epoch_time' column).
train_location, test_location, validate_location = (
    normalize_epoch_time(pd.read_parquet(location_path), 'epoch_time')
    for location_path in (
        '../data/train/Location.parquet',
        '../data/test/Location.parquet',
        '../data/validate/Location.parquet',
    )
)

# Same treatment for the label tables. Note that the test split ships a
# `Label_idx` file rather than `Label`.
train_label, test_label, validate_label = (
    normalize_epoch_time(pd.read_parquet(label_path), 'epoch_time')
    for label_path in (
        '../data/train/Label.parquet',
        '../data/test/Label_idx.parquet',
        '../data/validate/Label.parquet',
    )
)

In [4]:
# Load the pre-built bus stop points that are plugged into `feature_points`
# under the 'bus_stops' key.
# NOTE(review): presumably a list of [latitude, longitude] pairs — confirm
# against the script that produced the pickle.
# SECURITY: pickle.load can execute arbitrary code during unpickling; only
# load this file when it comes from a trusted source.
with open('../additional_data/bus_stops.pickle', 'rb') as f:
    bus_stops = pickle.load(f)

# Reference points used by `create_point_distance_features`.
# Each key names a group of points and yields a `distance_<key>` feature holding
# the distance from every sample to the nearest point of that group.
# Points are written as [latitude, longitude], matching the column order
# `['Latitude', 'Longitude']` used when the nearest-point tree is queried.
# Groups with an empty list are skipped when features are built.
feature_points = {
    # City centre reference point
    'London': [
        [51.509865, -0.118092]
    ],
    # City centre reference point
    'Brighton': [
        [50.827778, -0.152778]
    ],
    # Parks, woods and forests (one representative point per area)
    'parks': [
        # Hollingbury Golf Course
        [50.85423803467499, -0.12791258170001926],
        # St Ann’s Well Gardens
        [50.829876823789675, -0.15525600010959892],
        # Preston Park
        [50.839694335541274, -0.1462660790420134],
        # Waterwall conservation area
        [50.8659, -0.1750],
        # Withdean park
        [50.8546, -0.1497],
        # Stanmer park
        [50.8678, -0.0968],
        # Upper Lodge Wood
        [50.8751, -0.1177],
        # Pudding bag
        [50.8710, -0.1161],
        # Great Wood
        [50.8653, -0.1036],
        # Grubbings
        [50.8729, -0.0971],
        # Milbark wood
        [50.8783, -0.0982],
        # High park wood
        [50.8849, -0.1078],
        # Green broom
        [50.8833, -0.1107],
        # Moon's plantations
        [50.8774, -0.0840],
        # Friston forest
        [50.7783, 0.1894],
        # Malthouse wood
        [51.0042, -0.2044],
        # Bedgebury forest
        [51.0694, 0.4649],
        # Swinley forest
        [51.3726, -0.7292],
        # Crowthorne wood
        [51.3808, -0.7598],
        # Queen Elizabeth Country Park
        [50.9651, -0.9695],
        # Hurth wood
        [51.1854, -0.4278],
        # Winterfold wood
        [51.1764, -0.4564],
        # Queen's park
        [50.8249, -0.1248],
    ],
    # Bus stop points loaded from the pickle file above
    'bus_stops': bus_stops,
    # TODO: no points collected yet — this group currently produces no feature
    'shopping malls': [

    ],
    # TODO: no points collected yet — this group currently produces no feature
    'stations': [

    ]
}

def calculate_minimal_distance(data: pd.DataFrame, points):
    """Distance from every row of ``data`` to its nearest reference point.

    Args:
        data: Frame with ``Latitude`` and ``Longitude`` columns.
        points: Non-empty sequence of ``[latitude, longitude]`` pairs.

    Returns:
        The distance array produced by ``BallTree.query(..., k=1)``, i.e. one
        nearest-neighbour distance per input row (a single column per row).

    NOTE(review): the tree is built with BallTree's default metric on the raw
    coordinate values, so the result appears to be a planar distance in
    coordinate units rather than metres — confirm this is intended.
    """
    from sklearn.neighbors import BallTree

    coordinates = data[['Latitude', 'Longitude']]
    nearest = BallTree(np.array(points), leaf_size=15).query(coordinates, k=1)
    # `query` returns a (distances, indices) pair; only the distances are needed.
    return nearest[0]

def create_point_distance_features(data: pd.DataFrame):
    """Build nearest-point distance features for every group in ``feature_points``.

    Two group layouts are supported, decided by the type of the group's first
    element:

    * a sequence of coordinate pairs -> one column ``distance_<group>`` holding
      the distance to the nearest point of the group;
    * a sequence of single-entry dicts ``{point_name: coordinate_pair}`` -> one
      column ``distance_<group>_<point_name>`` per dict.

    Empty groups, and groups whose first element is of any other type, produce
    no column.

    Args:
        data: Frame with ``epoch_time_id``, ``Latitude`` and ``Longitude`` columns.

    Returns:
        A new DataFrame with ``epoch_time_id`` plus the distance columns.
        ``data`` itself is not modified.
    """
    # Work on an explicit copy: adding columns to a selection of `data`
    # triggers pandas' SettingWithCopyWarning.
    features = data[['epoch_time_id']].copy()

    for name, points in feature_points.items():
        if len(points) == 0:
            continue

        first = points[0]
        if isinstance(first, (list, tuple)):
            features[f'distance_{name}'] = calculate_minimal_distance(data, points)
        elif isinstance(first, dict):
            for point in points:
                # Only the first entry of each dict is used, as before.
                point_name, point_loc = next(iter(point.items()))
                features[f'distance_{name}_{point_name}'] = calculate_minimal_distance(data, [point_loc])

    return features

# Keyword arguments forwarded to the project helper `calculate_window`.
settings = {
    'fill_limit': 30,
    'window_sizes': [60, 300, 600],
    'window_center': True,
    'columns': ['distance_London', 'distance_Brighton', 'distance_parks'],
    'functions': ['mean', 'std', 'median'],
}
# Keyword arguments forwarded to the project helper `calculate_shift`.
shift_settings = {
    'periods': [60, 300, 600],
    'columns_patterns': ['window_'],
    'fill_limit': 30,
}


def _point_distance_pipeline(location, label):
    """Run point distances -> windows -> shifts -> label aggregation for one split."""
    point_features = create_point_distance_features(location)
    windowed = calculate_window(point_features, **settings)
    shifted = calculate_shift(windowed, **shift_settings)
    return fillna_agg_by_label(shifted, label)


train_distances_to_points_with_windows = _point_distance_pipeline(train_location, train_label)
display(train_distances_to_points_with_windows)

# Persist the features of every split next to its source data.
train_distances_to_points_with_windows.to_parquet('../data/train/features_distances.parquet')
_point_distance_pipeline(test_location, test_label).to_parquet('../data/test/features_distances.parquet')
_point_distance_pipeline(validate_location, validate_label).to_parquet('../data/validate/features_distances.parquet')

Unnamed: 0,epoch_time_id,distance_London,distance_Brighton,distance_parks,distance_bus_stops,distance_London_window_60_mean,distance_London_window_60_std,distance_London_window_60_median,distance_Brighton_window_60_mean,distance_Brighton_window_60_std,...,distance_parks_window_300_median_shift_60_future,distance_London_window_600_mean_shift_60_future,distance_London_window_600_std_shift_60_future,distance_London_window_600_median_shift_60_future,distance_Brighton_window_600_mean_shift_60_future,distance_Brighton_window_600_std_shift_60_future,distance_Brighton_window_600_median_shift_60_future,distance_parks_window_600_mean_shift_60_future,distance_parks_window_600_std_shift_60_future,distance_parks_window_600_median_shift_60_future
0,1490431583000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,0.025955,0.000000,...,,,,,,,,,,
1,1490431584000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,0.025955,0.000000,...,,,,,,,,,,
2,1490431585000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,0.025955,0.000000,...,,,,,,,,,,
3,1490431586000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,0.025955,0.000000,...,,,,,,,,,,
4,1490431587000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,0.025955,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987383,1499268278000,0.663243,0.027238,0.009174,0.001605,0.663223,0.000012,0.663219,0.027241,0.000072,...,,,,,,,,,,
987384,1499268279000,0.663243,0.027238,0.009175,0.001605,0.663223,0.000012,0.663219,0.027241,0.000072,...,,,,,,,,,,
987385,1499268280000,0.663243,0.027238,0.009175,0.001605,0.663223,0.000012,0.663219,0.027241,0.000072,...,,,,,,,,,,
987386,1499268281000,0.663243,0.027238,0.009175,0.001605,0.663223,0.000012,0.663219,0.027241,0.000072,...,,,,,,,,,,


In [5]:
# Join the training labels with the point-distance features on the shared
# `epoch_time_id` key (default inner join), then display the result.
train_distances_to_points_with_windows_with_label = pd.merge(
    train_label,
    train_distances_to_points_with_windows,
    on='epoch_time_id',
)
train_distances_to_points_with_windows_with_label

Unnamed: 0,epoch_time,label,epoch_time_id,distance_London,distance_Brighton,distance_parks,distance_bus_stops,distance_London_window_60_mean,distance_London_window_60_std,distance_London_window_60_median,...,distance_parks_window_300_median_shift_60_future,distance_London_window_600_mean_shift_60_future,distance_London_window_600_std_shift_60_future,distance_London_window_600_median_shift_60_future,distance_Brighton_window_600_mean_shift_60_future,distance_Brighton_window_600_std_shift_60_future,distance_Brighton_window_600_median_shift_60_future,distance_parks_window_600_mean_shift_60_future,distance_parks_window_600_std_shift_60_future,distance_parks_window_600_median_shift_60_future
0,1490431583000,4,1490431583000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,...,,,,,,,,,,
1,1490431584000,4,1490431584000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,...,,,,,,,,,,
2,1490431585000,4,1490431585000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,...,,,,,,,,,,
3,1490431586000,4,1490431586000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,...,,,,,,,,,,
4,1490431587000,4,1490431587000,0.665536,0.025955,0.010956,0.001025,0.665536,0.000000,0.665536,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738780,1499267848000,5,1499267848000,0.664882,0.026065,0.010585,0.001078,0.664906,0.001210,0.664913,...,0.009372,0.665902,0.003862,0.663269,0.024118,0.004610,0.02697,0.009280,0.001754,0.009343
738781,1499267849000,5,1499267849000,0.664818,0.026094,0.010540,0.001078,0.664847,0.001201,0.664850,...,0.009372,0.665880,0.003839,0.663269,0.024139,0.004595,0.02697,0.009280,0.001754,0.009343
738782,1499267850000,5,1499267850000,0.664751,0.026121,0.010495,0.001086,0.664788,0.001188,0.664785,...,0.009372,0.665858,0.003817,0.663269,0.024159,0.004580,0.02697,0.009281,0.001754,0.009343
738783,1499267851000,5,1499267851000,0.664687,0.026146,0.010453,0.001099,0.664729,0.001173,0.664719,...,0.009372,0.665836,0.003794,0.663269,0.024180,0.004564,0.02697,0.009282,0.001753,0.009343


In [6]:
# Inspect the railway geometries. The rendered output shows LINESTRING rows
# whose coordinates are ordered (longitude, latitude).
# The same file is read again when `feature_lines` is built further down.
railways = gpd.read_file('../additional_data/railways.json')
display(railways)

Unnamed: 0,geometry
0,"LINESTRING (-0.14132 50.82902, -0.14151 50.830..."
1,"LINESTRING (-0.93849 50.85162, -0.93633 50.851..."
2,"LINESTRING (-0.14087 50.82901, -0.14089 50.829..."
3,"LINESTRING (0.27687 50.80288, 0.27665 50.80432..."
4,"LINESTRING (-0.14108 50.82902, -0.14104 50.829..."
5,"LINESTRING (-0.16576 51.23988, -0.16623 51.239..."
6,"LINESTRING (-0.17003 51.23156, -0.16928 51.233..."
7,"LINESTRING (-0.16563 51.23996, -0.15608 51.255..."
8,"LINESTRING (-0.08851 51.38370, -0.08783 51.384..."
9,"LINESTRING (-0.07169 51.40712, -0.07087 51.406..."


In [7]:
# Inspect the subway geometries. The rendered output shows LINESTRING rows
# whose coordinates are ordered (longitude, latitude).
# The same file is read again when `feature_lines` is built further down.
subway = gpd.read_file('../additional_data/subway.json')
display(subway)

Unnamed: 0,geometry
0,"LINESTRING (0.07963 51.54034, 0.07729 51.54146..."
1,"LINESTRING (0.01027 51.51106, 0.00955 51.51240..."
2,"LINESTRING (-0.18799 51.57114, -0.19263 51.572..."
3,"LINESTRING (-0.23859 51.58982, -0.24056 51.590..."
4,"LINESTRING (-0.30260 51.61922, -0.30124 51.617..."
5,"LINESTRING (-0.37130 51.57519, -0.36957 51.574..."
6,"LINESTRING (-0.37130 51.57515, -0.36744 51.574..."
7,"LINESTRING (-0.22702 51.49365, -0.22505 51.493..."
8,"LINESTRING (-0.19207 51.48249, -0.19288 51.481..."
9,"LINESTRING (0.09334 51.60333, 0.09079 51.59488..."


In [8]:
# Line geometries (one GeoDataFrame per transport network), keyed by the name
# used in the resulting `distance_<name>` feature columns.
feature_lines = {
    network: gpd.read_file(geojson_path)
    for network, geojson_path in (
        ('subway', '../additional_data/subway.json'),
        ('railways', '../additional_data/railways.json'),
        ('bus_routes', '../additional_data/bus_routes.json'),
    )
}

def calculate_min_distance_to_lines(point: Point, lines: gpd.GeoDataFrame):
    """Return the smallest ``point.distance(geometry)`` over the ``geometry`` column.

    Args:
        point: Object exposing ``distance(other)`` (a shapely ``Point`` here).
        lines: Frame whose ``geometry`` column holds the line geometries.

    Raises:
        ValueError: If ``lines`` has no rows (``min`` of an empty sequence).
    """
    geometries = lines.loc[:, 'geometry']
    return min(point.distance(geometry) for geometry in geometries)

def create_distance_to_lines_features(data: pd.DataFrame):
    """Build one ``distance_<name>`` column per entry of ``feature_lines``.

    For every row a ``Point(Longitude, Latitude)`` is created and its minimal
    distance to the geometries of each line set is computed.

    Args:
        data: Frame with ``epoch_time_id``, ``Latitude`` and ``Longitude`` columns.

    Returns:
        DataFrame with ``epoch_time_id`` and one distance column per line set.

    NOTE(review): distances are computed directly on the coordinate values, so
    they appear to be in coordinate units rather than metres — confirm.
    """
    features = {
        'epoch_time_id': data['epoch_time_id'],
    }
    total_rows = data.shape[0]

    for name, lines in feature_lines.items():
        # Iterate over the two coordinate columns directly.
        # DataFrame.iterrows() materialises a full Series for every row, which
        # is needlessly slow on frames with hundreds of thousands of rows.
        coordinates = zip(data['Longitude'], data['Latitude'])
        features[f'distance_{name}'] = [
            calculate_min_distance_to_lines(Point(longitude, latitude), lines)
            for longitude, latitude in tqdm(coordinates, total=total_rows)
        ]

    return pd.DataFrame(features)

In [None]:
# Keyword arguments forwarded to the project helper `calculate_window`.
# NOTE: this rebinds the `settings` / `shift_settings` names used by the
# point-distance cell above, so re-run that cell's definitions before
# re-running its pipeline.
settings = {
    'fill_limit': 30,
    'window_sizes': [60, 300, 600],
    'window_center': True,
    'columns': ['distance_subway', 'distance_railways'],
    'functions': ['mean', 'std', 'median'],
}
# Keyword arguments forwarded to the project helper `calculate_shift`.
shift_settings = {
    'periods': [60, 300, 600],
    'columns_patterns': ['window_'],
    'fill_limit': 30,
}


def _line_distance_pipeline(location, label):
    """Run line distances -> windows -> shifts -> label aggregation for one split."""
    line_features = create_distance_to_lines_features(location)
    windowed = calculate_window(line_features, **settings)
    shifted = calculate_shift(windowed, **shift_settings)
    return fillna_agg_by_label(shifted, label)


train_distances_to_lines_with_windows = _line_distance_pipeline(train_location, train_label)
display(train_distances_to_lines_with_windows)

# Persist the features of every split under the same file name.
# The validate path used to be misspelled ('dfeatures_istances_...'), which
# made it inconsistent with the train and test outputs.
train_distances_to_lines_with_windows.to_parquet('../data/train/features_distances_to_lines_with_windows.parquet')
_line_distance_pipeline(test_location, test_label).to_parquet('../data/test/features_distances_to_lines_with_windows.parquet')
_line_distance_pipeline(validate_location, validate_label).to_parquet('../data/validate/features_distances_to_lines_with_windows.parquet')


 82%|████████▏ | 806320/987388 [13:54<05:04, 593.82it/s] 

In [None]:
# Join the training labels with the line-distance features on the shared
# `epoch_time_id` key (default inner join) and display the result.
pd.merge(
    train_label,
    train_distances_to_lines_with_windows,
    on="epoch_time_id",
)