In [None]:
!pip install folium matplotlib mapclassify

In [None]:
# import numpy as np
# import pandas as pd
# import glob
# import os.path
# import datetime
# import os
# import geopandas as gpd
# from geopy.distance import great_circle
# from datetime import datetime, timedelta
# import shapely as shp

!pip install movingpandas
import movingpandas as mpd

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# The dataset

(_copied_ from [url](https://www.microsoft.com/en-us/download/details.aspx?id=52367))

> This GPS trajectory dataset was collected in (Microsoft Research Asia) Geolife project by 182 users in a period of over three years (from April 2007 to August 2012). A GPS trajectory of this dataset is represented by a sequence of time-stamped points, each of which contains the information of latitude, longitude and altitude. This dataset contains 17,621 trajectories with a total distance of about 1.2 million kilometers and a total duration of 48,000+ hours. These trajectories were recorded by different GPS loggers and GPS-phones, and have a variety of sampling rates. 91 percent of the trajectories are logged in a dense representation, e.g. every 1\~5 seconds or every 5\~10 meters per point. This dataset recoded a broad range of users’ outdoor movements, including not only life routines like go home and go to work but also some entertainments and sports activities, such as shopping, sightseeing, dining, hiking, and cycling.

# Formating `.plt` files into trajectory instances

The dataset can be downloaded from [Microsoft](https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/). Its structure is as follows:

```
- Data Trajectories 1.3/
    - Data/
        - 000/
           - Trajectory/
               - <timestamp1>.plt
               - <timestamp2>.plt
        - 001/
        - ...
        - 180/
        - 181/
    - User Guide-1.3.pdf
```

000, 001, ..., 180, 181 are the users that recorded their trajectories, i.e., 182 users. Many of them did not label the activity, e.g., walking, bus, car, plane, taxi, and thus are later excluded.

Each file inside `Trajectory` is a single user trajectory, containing many timestamped GPS recordings, which will receive its unique identifier later on.

The following script from [heremaps.github.io](https://heremaps.github.io/pptk/tutorials/viewer/geolife.html) aggregates  all the `.plt` data and converts them into a pandas dataframe.

In [152]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None,
                         parse_dates=[[5, 6]], infer_datetime_format=True)

    # for clarity rename columns
    points.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4])

    return points

mode_names = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
mode_ids = {s : i + 1 for i, s in enumerate(mode_names)}

def read_labels(labels_file):
    labels = pd.read_csv(labels_file, skiprows=1, header=None,
                         parse_dates=[[0, 1], [2, 3]],
                         infer_datetime_format=True, delim_whitespace=True)

    # for clarity rename columns
    labels.columns = ['start_time', 'end_time', 'label']

    # replace 'label' column with integer encoding
    labels['label'] = [mode_ids[i] for i in labels['label']]

    return labels

def apply_labels(points, labels):
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1
    no_label = (indices < 0) | (points['time'].values >= labels['end_time'].iloc[indices].values)
    points['label'] = labels['label'].iloc[indices].values
    points['label'][no_label] = 0

def read_user(user_folder):
    labels = None
    user_id = int(os.path.basename(user_folder))

    plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
    dfs = []

    for traj_id, plt_file in enumerate(plt_files):
        df = read_plt(plt_file)
        df['trajectory_id'] = f"{user_id}_{traj_id}"  # unique trajectory ID
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        apply_labels(df, labels)
    else:
        df['label'] = 0

    df['user'] = user_id
    return df


def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df = read_user(os.path.join(folder,sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

# df = read_all_users("Geolife_Trajectories/Data")
# df.to_csv("/content/drive/MyDrive/geolife.csv", index=False)
df = pd.read_csv("/content/drive/MyDrive/geolive.csv")

In [None]:
df = df[df['label'] > 0] # removing trajetories without any label
df = df[df['lat'] <= 90] # cleaning up impossible latitudes
df['time'] = pd.to_datetime(df['time'])

# Splitting trajectories with more than 1 label


In [None]:
df['new_trajectory_id'] = None
df['prev_label'] = None

def split_on_label_change(group):
    group = group.sort_values('time').copy()
    base_id = group['trajectory_id'].iloc[0]
    suffix = 'a'

    current_label = None
    current_id = f"{base_id}_{suffix}"

    new_ids = []
    prev_labels = []

    for i, label in enumerate(group['label']):
        if current_label is None:
            prev_labels.append(None)
        else:
            prev_labels.append(int(current_label))
            if label != current_label:
                suffix = chr(ord(suffix) + 1)
                current_id = f"{base_id}_{suffix}"
        new_ids.append(current_id)
        current_label = label

    group['new_trajectory_id'] = new_ids
    group['prev_label'] = prev_labels
    return group

df = df.groupby('trajectory_id', group_keys=False).apply(split_on_label_change)

  df = df.groupby('trajectory_id', group_keys=False).apply(split_on_label_change)
  df = df.groupby('trajectory_id', group_keys=False).apply(split_on_label_change)


In [None]:
df.to_csv('/content/drive/MyDrive/all-unagg-new-labels.csv', index=False)

# Finding home addresses for users

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/all-geolife-gpd.csv') # contains data only from users with label 0 that also have other labels

In [None]:
df['time'] = pd.to_datetime(df['time'])
df['hour'] = df['time'].dt.hour

night_df = df[(df['hour'] >= 23) | (df['hour'] <= 6)]
night_df['lat_round'] = night_df['lat'].round(2)
night_df['lon_round'] = night_df['lon'].round(2)

location_counts = (
    night_df
    .groupby(['user', 'lat_round', 'lon_round'])
    .size()
    .reset_index(name='count')
)

home_df = (
    location_counts
    .sort_values(['user', 'count'], ascending=[True, False])
    .drop_duplicates(subset='user')
    .rename(columns={'lat_round': 'home_lat', 'lon_round': 'home_lon'})
    .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  night_df['lat_round'] = night_df['lat'].round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  night_df['lon_round'] = night_df['lon'].round(2)


In [None]:
home_df

Unnamed: 0,user,home_lat,home_lon,count
0,0,40.01,116.32,12404
1,1,39.98,116.33,8770
2,2,39.90,116.38,10731
3,3,40.01,116.32,36824
4,4,40.00,116.33,20737
...,...,...,...,...
164,175,39.11,117.17,2
165,176,39.97,116.30,35
166,179,40.01,116.32,4117
167,180,28.96,115.76,204


In [None]:
home_df.to_csv('/content/drive/MyDrive/Colab Notebooks/home-addresses.csv', index=False)
# home_df = pd.read_csv('/content/drive/MyDrive/home-addresses.csv')

In [None]:
merged = df.merge(home_df, how='left')

In [None]:
df_sample = df[df['trajectory_id'].isin(list(df['trajectory_id'].sample(3).values))]
traj_collection = mpd.TrajectoryCollection(df_sample, "trajectory_id", t="time", x="lon", y="lat")
traj_collection.explore(column="trajectory_id", cmap="plasma", tiles="CartoDB positron")

# Agregating trajectories and enriching the dataset

Right now, each df row is a timestamped GPS record, and can be aggregated into a single trajectory by the field `trajectory_id`. By doing this, we can derive features such as `speed`, `duration`, `distance`, etc.

All the following features below are incorporated into the new aggregated dataset:

| Field                | Description                                                                 | Data Type   | Feature Type         |
|----------------------|-----------------------------------------------------------------------------|-------------|----------------------|
| `user`               | Unique identifier of the user who performed the trajectory.                 | `int` / `str`| Metadata |
| `trajectory_id`      | Unique identifier for the trajectory within each user.                      | `int`        | Metadata |
| `start_time`          | Timestamp of when the trajectory started.                                  | `datetime`  | Temporal             |
| `end_time`          | Timestamp of when the trajectory ended.                                  | `datetime`  | Temporal             |
| `duration_s`          | Total duration of the trajectory in seconds.                               | `float`     | Temporal             |
| `start_hour`          | Hour of the day when the trajectory began (0–23).                          | `int`       | Temporal             |
| `weekday`             | Day of the week when the trajectory began (0 = Monday, 6 = Sunday).        | `int`       | Temporal             |
| `distance_m`          | Total distance traveled in meters.                                         | `float`     | Spatial              |
| `straightness_ratio`  | Ratio of straight-line to actual distance (1 = perfectly straight).        | `float`     | Spatial / Efficiency |
| `avg_speed_mps`       | Average speed in meters per second.                                        | `float`     | Movement             |
| `speed_std`           | Standard deviation of speeds during the trajectory.                        | `float`     | Variability          |
| `speed_entropy`       | Entropy of the speed distribution.                                         | `float`     | Variability          |
| `speed_peak_ratio`    | Ratio of average to maximum speed.                                         | `float`     | Movement / Load      |
| `movement_ratio`      | Proportion of time spent moving vs total time.                             | `float`     | Behavioral           |
| `load_factor`         | Ratio of average hourly activity to the peak hour (peak hour factor).                         | `float`     | Behavioral           |
| `time_entropy`        | Entropy of hourly activity distribution.                                   | `float`     | Temporal Behavior    |
| `vcr`                 | Velocity Change Rate (speed jumps per km).                                 | `float`     | Movement Dynamics    |
| `sr`                  | Stop Rate (number of stops per km).                                        | `float`     | Stop Behavior        |
| `hcr`                 | Heading Change Rate (sharp direction changes per km).                      | `float`     | Route Behavior       |
| `num_points`          | Number of GPS points in the trajectory.                                    | `int`       | Metadata             |
| `coordinates`         | Line geometry of the trajectory.                                           | `LineString`| Spatial Geometry     |
| `centroid`            | Geometric center of the trajectory path.                                   | `Point`     | Spatial Geometry     |
| `label`               | Most frequent mode of transport in the trajectory.                         | `int` / `str`| Transport Mode      |
| `labels`              | List of all transport mode labels observed in the trajectory.              | `list`      | Transport Mode       |



In [95]:
import pandas as pd
import numpy as np
from shapely.geometry import LineString
import math
from scipy.stats import entropy
from geopy.distance import great_circle

def compute_entropy(values, bins=10):
    if len(values) < 2:
        return 0
    hist, _ = np.histogram(values, bins=bins)
    probs = hist / np.sum(hist)
    return entropy(probs, base=2)

def compute_heading(lat1, lon1, lat2, lon2):
    dLon = math.radians(lon2 - lon1)
    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)
    x = math.sin(dLon) * math.cos(lat2)
    y = math.cos(lat1) * math.sin(lat2) - math.sin(lat1) * math.cos(lat2) * math.cos(dLon)
    bearing = math.atan2(x, y)
    bearing = math.degrees(bearing)
    return (bearing + 360) % 360

In [None]:
def total_distance(latitudes, longitudes):
    dist = 0.0
    for i in range(1, len(latitudes)):
        coord1 = (latitudes[i-1], longitudes[i-1])
        coord2 = (latitudes[i], longitudes[i])
        dist += great_circle(coord1, coord2).meters
    return dist

def aggregate_activity(group, vcr_thresh=1.0, stop_thresh=0.5, hcr_thresh=30):
    if len(group) < 2:
        return None

    group = group.sort_values('time')
    latitudes = group['lat'].values
    longitudes = group['lon'].values
    times = pd.to_datetime(group['time']).values.astype('datetime64[s]').astype(np.int64)
    times_seconds = times

    start_lat = latitudes[0]
    start_lon = longitudes[0]
    end_lat = latitudes[-1]
    end_lon = longitudes[-1]

    distance = total_distance(latitudes, longitudes)
    line = LineString(list(zip(latitudes, longitudes)))
    displacement = great_circle((latitudes[0], longitudes[0]), (latitudes[-1], longitudes[-1])).meters
    straightness = displacement / distance if distance > 0 else 0

    speeds = []
    time_diffs = []
    for i in range(1, len(times_seconds)):
        dt = times_seconds[i] - times_seconds[i-1]
        if dt > 0:
            d = great_circle((latitudes[i-1], longitudes[i-1]), (latitudes[i], longitudes[i])).meters
            speeds.append(d / dt)
            time_diffs.append(dt)
        else:
            speeds.append(0)
            time_diffs.append(0)

    speeds = np.array(speeds)
    time_diffs = np.array(time_diffs)
    duration = np.sum(time_diffs)

    # VCR, SR, HCR
    speed_changes = 0
    stops = 0
    heading_changes = 0
    prev_heading = None
    prev_speed = None

    for i in range(len(speeds)):
        speed = speeds[i]
        if prev_speed is not None and abs(speed - prev_speed) > vcr_thresh:
            speed_changes += 1
        if speed < stop_thresh:
            stops += 1

        if i < len(latitudes) - 1:
            heading = compute_heading(latitudes[i], longitudes[i], latitudes[i+1], longitudes[i+1])
            if prev_heading is not None:
                delta_heading = abs(heading - prev_heading)
                if delta_heading > hcr_thresh:
                    heading_changes += 1
            prev_heading = heading

        prev_speed = speed

    norm_dist = distance / 1000 if distance > 0 else 1

    speed_std = np.std(speeds) if speeds.size > 0 else 0
    speed_entropy = compute_entropy(speeds, bins=10)
    avg_speed = np.mean(speeds) if speeds.size > 0 else 0
    max_speed = np.max(speeds) if speeds.size > 0 else 0
    speed_peak_ratio = avg_speed / max_speed if max_speed > 0 else 0

    moving_time = np.sum(time_diffs[speeds >= stop_thresh]) if len(time_diffs) > 0 else 0
    movement_ratio = moving_time / duration if duration > 0 else 0

    group['hour'] = group['time'].dt.hour
    hourly_counts = group.groupby('hour').size()
    if len(hourly_counts) > 0 and hourly_counts.max() > 0:
        load_factor = hourly_counts.mean() / hourly_counts.max()
    else:
        load_factor = 0
    time_entropy = compute_entropy(group['hour'], bins=24)

    start_time = group['time'].min()
    start_hour = start_time.hour
    weekday = start_time.weekday()

    return pd.Series({
        'user': group['user'].iloc[0],
        'new_trajectory_id': group['new_trajectory_id'].iloc[0],
        'prev_label':  group['prev_label'].iloc[0],
        'label': group['label'].mode().iloc[0],
        'start_time': start_time,
        'duration_s': duration,
        'start_hour': start_hour,
        'weekday': weekday,
        'distance_m': distance,
        'straightness_ratio': straightness,
        'avg_speed_mps': avg_speed,
        'speed_std': speed_std,
        'speed_entropy': speed_entropy,
        'speed_peak_ratio': speed_peak_ratio,
        'movement_ratio': movement_ratio,
        'load_factor': load_factor,
        'time_entropy': time_entropy,
        'vcr': speed_changes / norm_dist,
        'sr': stops / norm_dist,
        'hcr': heading_changes / norm_dist,
        'num_points': len(group),
        'start_lat': start_lat,
        'start_lon': start_lon,
        'end_lat': end_lat,
        'end_lon': end_lon
        # 'coordinates': line,
        # 'centroid': line.centroid,
    })

aggregated_df = df.groupby(['user', 'new_trajectory_id']).apply(aggregate_activity).dropna().reset_index(drop=True)

In [None]:
end_times = df.groupby(['user', 'new_trajectory_id'])['time'].max().reset_index()
end_times = end_times.rename(columns={'time': 'end_time'})
aggregated_df_merged = aggregated_df.merge(end_times, on=['user', 'new_trajectory_id'], how='left')
aggregated_df_merged.count()

In [None]:
aggregated_df_merged['end_hour'] = aggregated_df_merged['end_time'].transform(lambda x: x.hour)

In [None]:
aggregated_df_merged.to_csv('/content/drive/MyDrive/all-new-labels.csv', index=False)

In [None]:
aggregated_df_merged.columns

Index(['user', 'new_trajectory_id', 'prev_label', 'label', 'start_time',
       'duration_s', 'start_hour', 'weekday', 'distance_m',
       'straightness_ratio', 'avg_speed_mps', 'speed_std', 'speed_entropy',
       'speed_peak_ratio', 'movement_ratio', 'load_factor', 'time_entropy',
       'vcr', 'sr', 'hcr', 'num_points', 'start_lat', 'start_lon', 'end_lat',
       'end_lon', 'end_time', 'end_hour'],
      dtype='object')

In [None]:
cols = set(aggregated_df_merged.columns) - set(['coordinates'])
aggregated_df_merged = aggregated_df_merged[list(cols)]

# Adding location information

For each trajectory, the centroid's coordinates are used to fetch the associated country and city.

In [None]:
from geopy.geocoders import Nominatim
import time

trajectory_cities = {}
geolocator = Nominatim(user_agent="<user-agent>")

centroids = aggregated_df[['trajectory_id', 'centroid']].values
for trajectory_id, centroid in centroids:
  location = geolocator.reverse(centroid[7:-1].split(" "),  language="en")
  if location:
      country = location.raw['address']['country']
      city =  location.raw['address'].get('city') or location.raw['address'].get('town')
      loc = [country, city]
      print(f"Trajectory: {trajectory_id}. Location: {loc}")
      time.sleep(0.5)
      trajectory_cities[trajectory_id] = loc
  else:
      print(f"NOT FOUND: Trajectory: {trajectory_id}. Location: {location}")

In [None]:
import json

with open("trajectory_cities.json", "w") as file:
  json.dump(trajectory_cities, file, indent=4)

In [None]:
content = {}

with open("trajectory_cities.json") as json_data:
  d = json.load(json_data)
  for traj, loc in d.items():
    content[traj] = loc


fmt_content = []
for traj_id, loc in content.items():
  fmt_content.append({"trajectory_id": traj_id, "country": loc[0], "city": loc[1]})

locations = pd.DataFrame(fmt_content)
locations.to_csv("/content/drive/MyDrive/locations.csv")

In [None]:
all_df = pd.merge(locations, aggregated_df_merged, on='trajectory_id', how="right")
all_df.to_csv("/content/drive/MyDrive/all.csv", index=False)

# Subgroup Discovery



In [56]:
!pip install pysubgroup

Collecting pysubgroup
  Downloading pysubgroup-0.8.0-py3-none-any.whl.metadata (11 kB)
Downloading pysubgroup-0.8.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.5/70.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pysubgroup
Successfully installed pysubgroup-0.8.0


In [57]:
import pysubgroup as ps
import pandas as pd

In [None]:
# all_df = pd.read_csv("/content/drive/MyDrive/all-new-labels.csv")
# home_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/home-addresses.csv')
# merged = all_df.merge(home_df, how='left')
# merged.to_csv("/content/drive/MyDrive/Colab Notebooks/agg-all-home.csv", index=False)

## Combining home addresses

In [None]:
from numpy import radians, cos, sin, sqrt

df = merged

def haversine(lat1, lon1, lat2, lon2):
    R = 6371000
    phi1, phi2 = radians(lat1), radians(lat2)
    d_phi = radians(lat2 - lat1)
    d_lambda = radians(lon2 - lon1)

    a = sin(d_phi/2)**2 + cos(phi1) * cos(phi2) * sin(d_lambda/2)**2
    return 2 * R * np.arcsin(sqrt(a))

df['start_distance_to_home'] = haversine(df['start_lat'], df['start_lon'], df['home_lat'], df['home_lon'])

In [138]:
# df.to_csv("/content/drive/MyDrive/Colab Notebooks/agg-all-with-dist-home.csv", index=False)
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/agg-all-with-dist-home.csv")

to_use = [
    'user', 'duration_s', 'start_hour', 'end_hour', 'start_distance_to_home',
    'weekday', 'distance_m', 'straightness_ratio',
    'avg_speed_mps', 'speed_std', 'speed_entropy', 'speed_peak_ratio',
    'movement_ratio', 'load_factor', 'time_entropy',
    'vcr', 'sr', 'hcr', 'label', 'prev_label', 'new_trajectory_id'
]
df_sd = df[to_use]

replacements = dict(zip([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0],['walk', 'bike', 'bus', 'car', 'subway', 'train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']))
df_sd = df_sd.replace({'label': replacements})

replacements = dict(zip([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0],['walk', 'bike', 'bus', 'car', 'subway', 'train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']))
df_sd = df_sd.replace({'prev_label': replacements})

replacements = dict(zip([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0],['monday', 'tuesday', 'wednesday', 'thursday', 'friday','saturday', 'sunday']))
df_sd = df_sd.replace({'weekday': replacements})

pd.set_option('display.max_colwidth', None)

## Discretize continous variables

In [140]:
df = df_sd
n_bins = 4
discretized_features = []
continuous_features = [
    'duration_s', 'distance_m', 'straightness_ratio',
    'avg_speed_mps', 'speed_entropy', 'load_factor', 'time_entropy', 'start_distance_to_home',
    'vcr', 'sr', 'hcr', 'movement_ratio',
]

for col in continuous_features:
    if col not in df.columns:
        continue
    try:
        series = pd.to_numeric(df[col], errors='coerce')
        if series.nunique() < n_bins:
            continue
        bin_col = f'{col}_bin'
        df[bin_col] = pd.qcut(series, q=n_bins, labels=[f'{col}_Q{i+1}' for i in range(n_bins)])
        df[bin_col] = df[bin_col].astype('category')
        discretized_features.append(bin_col)
    except Exception as e:
        print(f"{col} — {e}")

categorical_features = []
if 'weekday' in df.columns:
    df['weekday'] = df['weekday'].astype('category')
    categorical_features.append('weekday')

if 'start_hour' in df.columns:
    df['start_hour_bin'] = pd.cut(df['start_hour'], bins=[0, 6, 12, 18, 24],
                                  labels=['night', 'morning', 'afternoon', 'evening'])
    df['start_hour_bin'] = df['start_hour_bin'].astype('category')
    categorical_features.append('start_hour_bin')


if 'end_hour' in df.columns:
    df['end_hour_bin'] = pd.cut(df['end_hour'], bins=[0, 6, 12, 18, 24],
                                  labels=['night', 'morning', 'afternoon', 'evening'])
    df['end_hour_bin'] = df['end_hour_bin'].astype('category')
    categorical_features.append('end_hour_bin')

if 'label' in df.columns:
    categorical_features.append('label')
if 'prev_label' in df.columns:
    categorical_features.append('prev_label')

load_factor — Bin edges must be unique: Index([0.3343195266272189, 0.7081526974951831, 1.0, 1.0, 1.0], dtype='float64', name='load_factor').
You can drop duplicate edges by setting the 'duplicates' kwarg
time_entropy — Bin edges must be unique: Index([0.0, 0.0, 0.0, 0.8181297009368202, 3.7783492877839375], dtype='float64', name='time_entropy').
You can drop duplicate edges by setting the 'duplicates' kwarg


In [120]:
df_cols = [
    'weekday','movement_ratio', 'vcr', 'sr', 'hcr', 'vcr_bin', 'sr_bin', 'hcr_bin',
    'label', 'prev_label', 'duration_s_bin', 'distance_m_bin',
    'straightness_ratio_bin', 'avg_speed_mps_bin', 'speed_entropy_bin',
    'start_hour_bin', 'start_distance_to_home_bin', 'end_hour_bin',
    'user'
]
df_disc = df[df_cols]
df_disc['label_change'] = df_disc['label'] + "->" + df_disc['prev_label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_disc['label_change'] = df_disc['label'] + "->" + df_disc['prev_label']


## Running SD

In [54]:
from IPython.display import display

In [121]:
def run(df_disc, target, ignore, qf, size=10):
  searchSpace_Nominal = ps.create_nominal_selectors(
      df_disc, ignore=ignore
  )
  searchSpace_Numeric = ps.create_numeric_selectors(
      df_disc, ignore=ignore
  )
  search_space = searchSpace_Nominal + searchSpace_Numeric

  task = ps.SubgroupDiscoveryTask(
      data=df_disc,
      target=target,
      search_space=search_space,
      result_set_size=size,
      depth=4,
      qf=qf,
      constraints=[ps.constraints.MinSupportConstraint(30)]

  )

  result = ps.BeamSearch().execute(task)
  return result

#### Stop Rate (sr)

**Without walk**

Chosen subgroups:

1. label=='subway' AND weekday=='tuesday' (id: 0)
2. duration_s_bin=='duration_s_Q1' AND start_hour_bin=='night' (id: 4)
3. start_distance_to_home_bin=='start_distance_to_home_Q4' AND weekday=='wednesday' (id: 13)


**With walk**

Chosen subgroups:

1. duration_s_bin=='duration_s_Q1' AND label=='walk' AND start_hour_bin=='night' AND weekday=='sunday'	(id: 0)
2. label=='walk' AND start_hour_bin=='morning' AND weekday=='friday' (id: 7)
3. duration_s_bin=='duration_s_Q1' AND label=='walk'	 AND weekday=='wednesday' (id: 11)

In [122]:
target_col = "sr"
target = ps.NumericTarget([target_col])
ignore = [target_col]
ignore += ['user', 'distance_m_bin', 'label_change', 'vcr', 'hcr', 'vcr_bin', 'hcr_bin', 'sr_bin', 'prev_label', 'movement_ratio', 'end_hour_bin', 'avg_speed_mps_bin', 'straightness_ratio_bin', 'speed_entropy_bin']

qf = ps.StandardQFNumeric(0.5)
df_disc_n = df_disc[df_disc['label'] != 'walk']
result = run(df_disc_n, target, ignore, qf, 20)

display(result.to_dataframe().head(20))
result.to_dataframe().iloc[[0, 4, 13]][['subgroup', 'size_sg', 'mean_sg', 'mean_dataset']]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,835.586753,label=='subway' AND weekday=='tuesday',43,2132,150.431502,23.005717,665.837672,123.98335,2.219663,5.644172,4331.284797,4331.284797,0.0,0.0,6.538875,0.393266
1,829.766577,duration_s_bin=='duration_s_Q1' AND label=='subway',79,2132,116.361751,23.005717,527.134538,123.98335,1.319605,5.644172,4331.284797,4331.284797,0.0,0.0,5.057949,0.2338
2,676.949539,label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night',32,2132,142.67462,23.005717,752.688231,123.98335,1.034668,5.644172,4331.284797,4331.284797,0.0,0.0,6.201703,0.183316
3,630.742117,start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday',50,2132,112.206123,23.005717,605.688568,123.98335,14.532606,5.644172,4331.284797,4331.284797,0.0,0.0,4.877315,2.574799
4,606.060661,duration_s_bin=='duration_s_Q1' AND start_hour_bin=='night',102,2132,83.014665,23.005717,465.856554,123.98335,2.898622,5.644172,4331.284797,4331.284797,0.0,0.0,3.608436,0.51356
5,550.701973,duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q2',65,2132,91.31189,23.005717,540.742301,123.98335,4.832703,5.644172,4331.284797,4331.284797,0.0,0.0,3.969096,0.856229
6,456.463035,label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2',93,2132,70.338729,23.005717,455.087206,123.98335,1.23989,5.644172,4331.284797,4331.284797,0.0,0.0,3.057446,0.219676
7,449.182997,start_hour_bin=='night' AND weekday=='tuesday',68,2132,77.477157,23.005717,519.935073,123.98335,11.730251,5.644172,4331.284797,4331.284797,0.0,0.0,3.367735,2.078294
8,439.357637,label=='subway' AND start_hour_bin=='night',104,2132,66.088278,23.005717,446.129039,123.98335,1.195069,5.644172,4331.284797,4331.284797,0.0,0.0,2.872689,0.211735
9,346.361334,start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night',168,2132,49.728074,23.005717,338.611979,123.98335,10.307133,5.644172,4331.284797,4331.284797,0.0,0.0,2.161553,1.826155


Unnamed: 0,subgroup,size_sg,mean_sg,mean_dataset
0,label=='subway' AND weekday=='tuesday',43,150.431502,23.005717
4,duration_s_bin=='duration_s_Q1' AND start_hour_bin=='night',102,83.014665,23.005717
13,start_distance_to_home_bin=='start_distance_to_home_Q4' AND weekday=='wednesday',78,57.829263,23.005717


In [134]:
target_col = "sr"
target = ps.NumericTarget([target_col])
ignore = [target_col]
ignore += ['user', 'distance_m_bin', 'label_change', 'vcr', 'hcr', 'vcr_bin', 'hcr_bin', 'sr_bin', 'prev_label', 'movement_ratio', 'end_hour_bin', 'avg_speed_mps_bin', 'straightness_ratio_bin', 'speed_entropy_bin']

qf = ps.StandardQFNumeric(0.5)
result = run(df_disc, target, ignore, qf, 20)

display(result.to_dataframe().head(20))
result.to_dataframe().iloc[[0, 7, 11]][['subgroup', 'size_sg', 'mean_sg', 'mean_dataset']]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,2984.511641,duration_s_bin=='duration_s_Q1' AND label=='walk' AND start_hour_bin=='night' AND weekday=='sunday',33,4238,591.968249,72.431459,2785.43279,414.590144,40.292105,14.322585,16328.975104,16328.975104,0.0,0.0,8.172806,2.813187
1,2923.60652,duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q1' AND weekday=='sunday',34,4238,573.82584,72.431459,2745.890459,414.590144,44.06618,14.322585,16328.975104,16328.975104,0.0,0.0,7.922329,3.076692
2,2632.147917,label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q4' AND start_hour_bin=='morning' AND weekday=='friday',32,4238,537.733869,72.431459,2642.996884,414.590144,33.560043,14.322585,15247.32465,16328.975104,0.0,0.0,7.424038,2.343155
3,2296.207911,label=='walk',2106,4238,122.467394,72.431459,570.397849,414.590144,33.515607,14.322585,16328.975104,16328.975104,0.0,0.0,1.690804,2.340053
4,2253.203685,label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q1' AND weekday=='sunday',68,4238,345.672541,72.431459,1962.801457,414.590144,28.294622,14.322585,16328.975104,16328.975104,0.0,0.0,4.772409,1.975525
5,2221.539729,start_distance_to_home_bin=='start_distance_to_home_Q1' AND start_hour_bin=='night' AND weekday=='sunday',52,4238,380.503589,72.431459,2238.671613,414.590144,17.994735,14.322585,16328.975104,16328.975104,0.0,0.0,5.253292,1.256389
6,2217.192879,duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday',33,4238,458.395198,72.431459,1051.664229,414.590144,57.83585,14.322585,4563.858477,16328.975104,0.0,0.0,6.328676,4.038087
7,2204.40628,label=='walk' AND start_hour_bin=='morning' AND weekday=='friday',114,4238,278.893023,72.431459,1496.857976,414.590144,32.749657,14.322585,15247.32465,16328.975104,0.0,0.0,3.850441,2.286574
8,2140.738538,duration_s_bin=='duration_s_Q1' AND start_hour_bin=='night' AND weekday=='sunday',56,4238,358.499679,72.431459,2157.19957,414.590144,14.069364,14.322585,16328.975104,16328.975104,0.0,0.0,4.949502,0.98232
9,2130.132691,duration_s_bin=='duration_s_Q1' AND label=='walk' AND start_hour_bin=='night',251,4238,206.884242,72.431459,1110.820323,414.590144,35.984295,14.322585,16328.975104,16328.975104,0.0,0.0,2.856276,2.512416


Unnamed: 0,subgroup,size_sg,mean_sg,mean_dataset
0,duration_s_bin=='duration_s_Q1' AND label=='walk' AND start_hour_bin=='night' AND weekday=='sunday',33,591.968249,72.431459
7,label=='walk' AND start_hour_bin=='morning' AND weekday=='friday',114,278.893023,72.431459
11,duration_s_bin=='duration_s_Q1' AND label=='walk',765,147.734597,72.431459


### Velocity Change Rate (vcr)

**Without walk**

Chosen subgroups:

1. label=='bike' AND start_distance_to_home_bin=='start_distance_to_home_Q1' AND start_hour_bin=='morning' (id: 2)
2. label=='bike' AND weekday=='sunday' (id: 14)
3. duration_s_bin=='duration_s_Q2' AND label=='bus' AND start_distance_to_home_bin=='start_distance_to_home_Q2' (id: 19)


**With walk**

Chosen subgroups:

1. duration_s_bin=='duration_s_Q1' AND label=='walk' (id: 3)
2. prev_label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2' (id: 6)
3. label=='walk' AND prev_label=='bus' (id: 7)

In [107]:
target_col = "vcr"
target = ps.NumericTarget([target_col])
ignore = [target_col]

ignore += ['label_change', 'vcr', 'sr', 'hcr', 'hcr_bin', 'sr_bin', 'vcr_bin', 'end_hour_bin',  'speed_entropy_bin', 'movement_ratio', 'avg_speed_mps_bin', 'distance_m_bin', 'straightness_ratio_bin']

qf=ps.StandardQFNumeric(0.5)
df_disc_n = df_disc[df_disc['label'] != 'walk']
result = run(df_disc_n, target, ignore, qf, size=20)

display(result.to_dataframe().head(20))
result.to_dataframe().iloc[[2, 14, 19]][['subgroup', 'size_sg', 'mean_sg', 'mean_dataset']]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,223.966291,label=='bike',207,2132,38.841283,23.27455,51.344709,25.701306,27.058421,19.702125,372.073028,372.073028,0.0,0.0,1.668831,1.373376
1,215.477389,label=='bike' AND start_distance_to_home_bin=='start_distance_to_home_Q1' AND start_hour_bin=='morning',37,2132,58.698814,23.27455,80.694253,25.701306,37.814485,19.702125,372.073028,372.073028,0.0,0.0,2.522017,1.91931
2,209.665047,label=='bike' AND start_distance_to_home_bin=='start_distance_to_home_Q1',121,2132,42.335009,23.27455,54.723932,25.701306,27.047563,19.702125,372.073028,372.073028,0.0,0.0,1.81894,1.372825
3,205.318356,label=='bike' AND prev_label=='walk' AND start_hour_bin=='morning',44,2132,54.227457,23.27455,78.669667,25.701306,30.184807,19.702125,372.073028,372.073028,1.239912,0.0,2.329904,1.532058
4,198.963246,label=='bike' AND start_hour_bin=='morning',68,2132,47.402387,23.27455,65.2482,25.701306,32.299793,19.702125,372.073028,372.073028,0.0,0.0,2.036662,1.639407
5,191.531313,duration_s_bin=='duration_s_Q3' AND label=='bike' AND prev_label=='walk',44,2132,52.148982,23.27455,76.976279,25.701306,27.103365,19.702125,372.073028,372.073028,1.831417,0.0,2.240601,1.375657
6,185.85694,duration_s_bin=='duration_s_Q3' AND label=='bike',60,2132,47.268578,23.27455,66.995272,25.701306,27.496445,19.702125,372.073028,372.073028,1.831417,0.0,2.030913,1.395608
7,173.051414,label=='bus' AND prev_label=='walk' AND user: [126.0:163.0[,260,2132,34.006743,23.27455,25.696646,25.701306,30.294106,19.702125,153.650511,372.073028,0.0,0.0,1.461113,1.537606
8,171.586544,label=='bike' AND prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q1',84,2132,41.996177,23.27455,62.51533,25.701306,24.462096,19.702125,372.073028,372.073028,1.001035,0.0,1.804382,1.241597
9,168.523472,label=='bike' AND prev_label=='walk',145,2132,37.269662,23.27455,52.788325,25.701306,25.076327,19.702125,372.073028,372.073028,0.089548,0.0,1.601305,1.272773


Unnamed: 0,subgroup,size_sg,mean_sg,mean_dataset
2,label=='bike' AND start_distance_to_home_bin=='start_distance_to_home_Q1',121,42.335009,23.27455
14,duration_s_bin=='duration_s_Q2' AND label=='bus',307,32.638924,23.27455
19,label=='bike' AND weekday=='sunday',36,49.146884,23.27455


In [109]:
target_col = "vcr"
target = ps.NumericTarget([target_col])
ignore = [target_col]

ignore += ['user', 'label_change', 'vcr', 'sr', 'hcr', 'hcr_bin', 'sr_bin', 'vcr_bin', 'end_hour_bin',  'speed_entropy_bin', 'movement_ratio', 'avg_speed_mps_bin', 'distance_m_bin', 'straightness_ratio_bin']

qf=ps.StandardQFNumeric(0.5)
result = run(df_disc, target, ignore, qf, size=20)

display(result.to_dataframe().head(20))
result.to_dataframe().iloc[[3, 6, 7]][['subgroup', 'size_sg', 'mean_sg', 'mean_dataset']]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,223.966291,label=='bike',207,2132,38.841283,23.27455,51.344709,25.701306,27.058421,19.702125,372.073028,372.073028,0.0,0.0,1.668831,1.373376
1,215.477389,label=='bike' AND start_distance_to_home_bin=='start_distance_to_home_Q1' AND start_hour_bin=='morning',37,2132,58.698814,23.27455,80.694253,25.701306,37.814485,19.702125,372.073028,372.073028,0.0,0.0,2.522017,1.91931
2,209.665047,label=='bike' AND start_distance_to_home_bin=='start_distance_to_home_Q1',121,2132,42.335009,23.27455,54.723932,25.701306,27.047563,19.702125,372.073028,372.073028,0.0,0.0,1.81894,1.372825
3,205.318356,label=='bike' AND prev_label=='walk' AND start_hour_bin=='morning',44,2132,54.227457,23.27455,78.669667,25.701306,30.184807,19.702125,372.073028,372.073028,1.239912,0.0,2.329904,1.532058
4,198.963246,label=='bike' AND start_hour_bin=='morning',68,2132,47.402387,23.27455,65.2482,25.701306,32.299793,19.702125,372.073028,372.073028,0.0,0.0,2.036662,1.639407
5,191.531313,duration_s_bin=='duration_s_Q3' AND label=='bike' AND prev_label=='walk',44,2132,52.148982,23.27455,76.976279,25.701306,27.103365,19.702125,372.073028,372.073028,1.831417,0.0,2.240601,1.375657
6,185.85694,duration_s_bin=='duration_s_Q3' AND label=='bike',60,2132,47.268578,23.27455,66.995272,25.701306,27.496445,19.702125,372.073028,372.073028,1.831417,0.0,2.030913,1.395608
7,171.586544,label=='bike' AND prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q1',84,2132,41.996177,23.27455,62.51533,25.701306,24.462096,19.702125,372.073028,372.073028,1.001035,0.0,1.804382,1.241597
8,168.523472,label=='bike' AND prev_label=='walk',145,2132,37.269662,23.27455,52.788325,25.701306,25.076327,19.702125,372.073028,372.073028,0.089548,0.0,1.601305,1.272773
9,165.944806,duration_s_bin=='duration_s_Q3' AND label=='bike' AND prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q1',31,2132,53.079117,23.27455,83.093166,25.701306,26.779722,19.702125,372.073028,372.073028,1.831417,0.0,2.280565,1.35923


Unnamed: 0,subgroup,size_sg,mean_sg,mean_dataset
3,label=='bike' AND prev_label=='walk' AND start_hour_bin=='morning',44,54.227457,23.27455
6,duration_s_bin=='duration_s_Q3' AND label=='bike',60,47.268578,23.27455
7,label=='bike' AND prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q1',84,41.996177,23.27455


### Heading Change Rate (hcr)


**Without walk**

Chosen subgroups:

1. label=='subway' AND weekday=='tuesday'	(id: 0)
2. label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night' (id: 4)
3. start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night'	(id: 19)


**With walk**

Chosen subgroups:

1. label=='walk' AND start_hour_bin=='morning'	(id: 6)
2. duration_s_bin=='duration_s_Q1' AND label=='walk' AND prev_label=='bike'	(id: 5)
3. duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday' (id: 2)

In [112]:
target_col = "hcr"
target = ps.NumericTarget([target_col])
ignore = [target_col]

ignore += ['user', 'label_change', 'hcr_bin', 'vcr', 'sr', 'sr_bin', 'vcr_bin', 'movement_ratio', 'avg_speed_mps_bin', 'speed_entropy_bin', 'end_hour_bin',  'straightness_ratio_bin', 'distance_m_bin']

qf=ps.StandardQFNumeric(0.5)
df_disc_n = df_disc[df_disc['label'] != 'walk']
result = run(df_disc_n, target, ignore, qf, 20)

display(result.to_dataframe())
result.to_dataframe().iloc[[0, 4, 19]][['subgroup', 'size_sg', 'mean_sg', 'mean_dataset']]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,637.409838,label=='subway' AND weekday=='tuesday',43,2132,123.280452,26.076363,564.903446,97.676783,4.149315,11.163248,3664.93329,3664.93329,0.0,0.0,4.727671,0.371694
1,616.544241,prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday',39,2132,124.802454,26.076363,576.679347,97.676783,16.444922,11.163248,3664.93329,3664.93329,0.227723,0.0,4.786038,1.473131
2,562.319847,start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday',50,2132,105.600398,26.076363,510.757197,97.676783,20.031766,11.163248,3664.93329,3664.93329,0.227723,0.0,4.04966,1.794439
3,558.08033,duration_s_bin=='duration_s_Q1' AND label=='subway',79,2132,88.865306,26.076363,427.038769,97.676783,4.477867,11.163248,3664.93329,3664.93329,0.0,0.0,3.407887,0.401126
4,543.374456,label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night',32,2132,122.132303,26.076363,636.526585,97.676783,3.327998,11.163248,3664.93329,3664.93329,0.227723,0.0,4.68364,0.298121
5,508.307633,label=='subway' AND prev_label=='walk' AND weekday=='tuesday',38,2132,108.534749,26.076363,585.257838,97.676783,3.687242,11.163248,3664.93329,3664.93329,0.0,0.0,4.162189,0.330302
6,466.206237,label=='subway' AND prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q2',55,2132,88.939599,26.076363,488.440525,97.676783,3.459814,11.163248,3664.93329,3664.93329,0.0,0.0,3.410736,0.309929
7,437.73347,duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q2',65,2132,80.370518,26.076363,452.070704,97.676783,10.23647,11.163248,3664.93329,3664.93329,0.0,0.0,3.082121,0.91698
8,422.831377,prev_label=='walk' AND start_hour_bin=='night' AND weekday=='tuesday',58,2132,81.596843,26.076363,475.285951,97.676783,14.708565,11.163248,3664.93329,3664.93329,0.046923,0.0,3.12915,1.317588
9,417.307554,duration_s_bin=='duration_s_Q1' AND prev_label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q2',55,2132,82.346102,26.076363,487.966633,97.676783,8.738152,11.163248,3664.93329,3664.93329,0.0,0.0,3.157883,0.782761


Unnamed: 0,subgroup,size_sg,mean_sg,mean_dataset
0,label=='subway' AND weekday=='tuesday',43,123.280452,26.076363
4,label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night',32,122.132303,26.076363
19,start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='night',168,49.065742,26.076363


In [123]:
target_col = "hcr"
target = ps.NumericTarget([target_col])
ignore = [target_col]

ignore += ['user', 'label_change', 'hcr_bin', 'vcr', 'sr', 'sr_bin', 'vcr_bin', 'movement_ratio', 'avg_speed_mps_bin', 'speed_entropy_bin', 'end_hour_bin',  'straightness_ratio_bin', 'distance_m_bin']

qf=ps.StandardQFNumeric(0.5)
result = run(df_disc, target, ignore, qf, 20)

display(result.to_dataframe())
result.to_dataframe().iloc[[6, 5, 2]][['subgroup', 'size_sg', 'mean_sg', 'mean_dataset']]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,2691.980702,label=='walk',2106,4238,142.681185,84.02109,308.506549,235.573513,95.500147,32.889492,9751.195997,9751.195997,0.0,0.0,1.698159,2.903667
1,2213.672702,label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q2',544,4238,178.931525,84.02109,280.069781,235.573513,120.093356,32.889492,4301.56776,9751.195997,0.0,0.0,2.129603,3.65142
2,2187.734321,duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday',33,4238,464.856753,84.02109,918.228868,235.573513,181.979565,32.889492,4301.56776,9751.195997,0.0,0.0,5.53262,5.533061
3,2076.789325,duration_s_bin=='duration_s_Q1' AND label=='walk',765,4238,159.107567,84.02109,313.215025,235.573513,99.065136,32.889492,5442.991701,9751.195997,0.0,0.0,1.893662,3.01206
4,2054.349451,label=='walk' AND prev_label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND start_hour_bin=='morning',33,4238,441.637427,84.02109,743.518603,235.573513,239.730951,32.889492,4301.56776,9751.195997,8.649085,0.0,5.256269,7.288983
5,1975.394545,duration_s_bin=='duration_s_Q1' AND label=='walk' AND prev_label=='bike',105,4238,276.799859,84.02109,574.759989,235.573513,185.957995,32.889492,5442.991701,9751.195997,0.0,0.0,3.294409,5.654025
6,1956.942502,label=='walk' AND start_hour_bin=='morning',785,4238,153.867363,84.02109,404.440835,235.573513,99.17558,32.889492,9751.195997,9751.195997,0.0,0.0,1.831295,3.015418
7,1943.746509,label=='walk' AND prev_label=='subway' AND start_distance_to_home_bin=='start_distance_to_home_Q2',63,4238,328.910132,84.02109,570.31667,235.573513,162.829209,32.889492,4301.56776,9751.195997,4.24637,0.0,3.914614,4.950797
8,1909.964324,duration_s_bin=='duration_s_Q1' AND prev_label=='bike',109,4238,266.962494,84.02109,566.362857,235.573513,185.867623,32.889492,5442.991701,9751.195997,0.0,0.0,3.177327,5.651277
9,1879.254058,duration_s_bin=='duration_s_Q1' AND label=='walk' AND start_distance_to_home_bin=='start_distance_to_home_Q2',222,4238,210.148364,84.02109,364.295349,235.573513,133.008072,32.889492,4301.56776,9751.195997,0.0,0.0,2.501138,4.04409


Unnamed: 0,subgroup,size_sg,mean_sg,mean_dataset
6,label=='walk' AND start_hour_bin=='morning',785,153.867363,84.02109
5,duration_s_bin=='duration_s_Q1' AND label=='walk' AND prev_label=='bike',105,276.799859,84.02109
2,duration_s_bin=='duration_s_Q1' AND start_distance_to_home_bin=='start_distance_to_home_Q2' AND weekday=='tuesday',33,464.856753,84.02109


# Analysis

## Plotting routes for some subgroups

In [144]:
a = ps.EqualitySelector('duration_s_bin', 'duration_s_Q1')
b = ps.EqualitySelector('start_hour_bin', 'night')
conj = ps.Conjunction([a, b])
covered_df = df[conj.covers(df)]
trajectory_ids = list(map(lambda x: x[:-2], covered_df['new_trajectory_id'].unique()))

In [159]:
all_df = df #pd.read_csv("/content/drive/MyDrive/geolive.csv")
trajectories_all = all_df[all_df['trajectory_id'].isin(trajectory_ids)]

In [165]:
traj_collection = mpd.TrajectoryCollection(trajectories_all, "trajectory_id", t="time", x="lon", y="lat")
len(traj_collection.trajectories)
traj_collection.explore(column="trajectory_id", cmap="plasma", tiles="CartoDB positron")