In [None]:
import pandas as pd
import geopandas as gpd
# Import necessary geometric objects from shapely module
from shapely.geometry import Point, LineString, Polygon
import geopy
from geopy.geocoders import Nominatim
import skmob
import fiona
import matplotlib.pyplot as plt

### Geographic Partitioning
* We have a few options for partitioning mobile data spatially
    * __Option 1__: Call a free geocoding API (e.g. Nominatim) to get the relevant OpenStreetMap information. From here we can find the bounding box coordinates of a particular POI
    * __Option 2__: Partition by federal and/or state designations, e.g. census tracts, zip codes, etc.
    * __Option 3__: Type in a bounding box manually. For example, [geojson.io](https://geojson.io) could be a good option.
    * __Option 4__: Obtain a KMZ file from the [Damage Assessment Toolkit](https://apps.dat.noaa.gov/stormdamage/damageviewer/).

#### Geocoding Example

In [None]:
# Nominatim geocoder client; user_agent is the application identifier passed to the service.
locator = Nominatim(user_agent="Untitled-1")

In [None]:
# Geocode the point of interest from a free-text address.
location = locator.geocode("University of Washington, Seattle, WA, USA")
# geocode() returns None when nothing matches; fail with a clear message here rather
# than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Nominatim returned no result for the requested address")
location.raw

In [None]:
# The raw result carries the bounding box as strings; convert each corner value to float.
bbox = [float(coord) for coord in location.raw['boundingbox']]
bbox

In [None]:
from shapely.geometry import box

# bbox is unpacked in (miny, maxy, minx, maxx) order, while box() takes
# (minx, miny, maxx, maxy) positionally.
miny, maxy, minx, maxx = bbox
poly = box(minx, miny, maxx, maxy)
poly

#### Census Tract Example
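* The convex hull of a census tract's geometry gives a simple polygon enclosing the tract. Note that this is generally tighter than the rectangular bounding box, which shapely exposes as `.envelope`.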

In [None]:
# Census 2020 cartographic boundary file of tracts (500k resolution) for state FIPS 40;
# the zipped shapefile is read directly from the URL.
OK_url = "https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_40_tract_500k.zip"
OK_tracts = gpd.read_file(OK_url)
# Preview the tract geometries
OK_tracts.geometry.head()

In [None]:
# Convex hull of the tract geometry stored under index label 0
OK_tracts.geometry[0].convex_hull

In [None]:
#IL_url = "https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_17_tract_500k.zip"
#IL_tracts = gpd.read_file(IL_url)
#IL_tracts.head()

#### Now let's read in our mobile data
* IL first

In [None]:
#IL_data = pd.read_csv('IL_1000_April06-08.csv')
#IL_gpd = gpd.GeoDataFrame(IL_data, crs='EPSG:4269', geometry=gpd.points_from_xy(IL_data['lon'], IL_data['lat']))
#aea_proj_str = '+proj=aea +lat_1=38.00 +lat_2=42.00 +lat_0=40.11 +lon_0=-88.228'
#IL_gpd = IL_gpd.to_crs(aea_proj_str)

In [None]:
# Exclude points outside of Illinois
# IL_gpd_j = gpd.sjoin(IL_gpd, IL_tracts, how='inner', predicate='within')

In [None]:
#print(IL_gpd.shape[0])
#print(IL_gpd_j.shape[0])

## Let's look at Tulsa
* Mobile data between May 2nd and 22nd, 2020
* Tornado event on May 15th, 2020

In [None]:
# Read the mobile pings and build point geometries (EPSG:4269) from the lon/lat columns
OK_data = pd.read_csv('data/mobile/OK_damage_May02-22.csv')
OK_gpd = gpd.GeoDataFrame(
    OK_data,
    crs='EPSG:4269',
    geometry=gpd.points_from_xy(OK_data['lon'], OK_data['lat']),
)

# Exclude points outside of Oklahoma: an inner join keeps only the pings that fall
# within a tract polygon (a left join keeps every ping and therefore filters nothing).
OK_gpd_j = gpd.sjoin(OK_gpd, OK_tracts, how='inner', predicate='within')

# Create datetime column using timestamps (interpreted as seconds since the epoch)
OK_gpd_j['datetime'] = pd.to_datetime(OK_gpd_j['timestamp'], unit='s')

# Keep first 13 columns; copy so that adding 'date' below writes to an independent
# frame rather than to a slice of the joined one.
# NOTE(review): the positional cut-off assumes 'datetime' sits within the first 13 columns — confirm.
OK_gpd_j = OK_gpd_j.iloc[:, :13].copy()
# Create a date column
OK_gpd_j['date'] = OK_gpd_j['datetime'].dt.date
OK_gpd_j.head()

In [None]:
# Build per-user trajectories, then clean them in two passes:
#   1. filter with a maximum speed of 400 km/h,
#   2. compress points lying within 200 m of each other in the same trajectory.
tdf = skmob.TrajDataFrame(
    OK_gpd_j,
    latitude='lat',
    longitude='lon',
    datetime='datetime',
    user_id='uid',
)
f_tdf = skmob.preprocessing.filtering.filter(tdf, max_speed_kmh=400, include_loops=False)
fc_tdf = skmob.preprocessing.compression.compress(f_tdf, spatial_radius_km=0.2)

# Report how many points are left after each stage
for stage_label, stage_tdf in (
    ('Original number of points: ', tdf),
    ('Number of points after filtering: ', f_tdf),
    ('Number of points after compression: ', fc_tdf),
):
    print(stage_label, len(stage_tdf))

In [None]:
# Rebuild a GeoDataFrame from the cleaned trajectory points (longitude is in 'lng' here)
cleaned_points = gpd.points_from_xy(fc_tdf['lng'], fc_tdf['lat'])
damage_area_pts = gpd.GeoDataFrame(fc_tdf, geometry=cleaned_points, crs='EPSG:4269')
# Make sure 'datetime' is a pandas datetime column
damage_area_pts['datetime'] = pd.to_datetime(
    damage_area_pts['datetime'], format='%Y-%m-%d %H:%M:%S'
)
damage_area_pts.head()

### Let's check how many unique users passed by the bounding box area in our timeframe

In [None]:
# Number of distinct user ids (a missing id, if any, counts as one value)
damage_area_pts['uid'].nunique(dropna=False)

#### Can play with the data using .explore(), but need to remove the 'datetime' and 'date' columns (make sure to add them back after)

In [None]:
# Plot a copy without the 'datetime' and 'date' columns. The drop is NOT done in place:
# the cells below still read damage_area_pts['datetime'], so the columns must stay on
# the original frame for the notebook to run top to bottom.
damage_area_pts.drop(columns=['datetime', 'date']).explore()

In [None]:
#gdf_tulsa['datetime'] = gdf_tulsa['datetime'].dt.strftime('%Y%m%d%H%M%S')
# Create datetime column using timestamps
# OK_gpd_j['datetime'] = pd.to_datetime(OK_gpd_j['timestamp'], unit='s')

In [None]:
# Parse 'datetime' (a no-op when the column is already datetime64). The format uses the
# '%Y-%m-%d' date layout; the previous '%Y:%m:%d' pattern cannot match dashed dates.
damage_area_pts['datetime'] = pd.to_datetime(damage_area_pts['datetime'], format='%Y-%m-%d %H:%M:%S')
# Order the points by user and then chronologically. A plain sort keeps 'uid' as an
# ordinary column, whereas groupby('uid').apply(...) can also push it into the index,
# which makes later groupby('uid') calls ambiguous.
damage_area_pts = damage_area_pts.sort_values(['uid', 'datetime'])

# Make date column a string ('YYYY-MM-DD', so string comparison follows calendar order)
damage_area_pts['date'] = damage_area_pts['datetime'].dt.date.astype(str)

# Make hour column a string
damage_area_pts['hour'] = damage_area_pts['datetime'].dt.hour.astype(str)

# Separate by date: before, on, and after the day of the tornado
TORNADO_DATE = '2020-05-15'
damage_area_pre = damage_area_pts[damage_area_pts['date'] < TORNADO_DATE]
damage_area_dur = damage_area_pts[damage_area_pts['date'] == TORNADO_DATE]
damage_area_post = damage_area_pts[damage_area_pts['date'] > TORNADO_DATE]

# Wrap each period as a TrajDataFrame for the skmob measures used below
damage_area_pre_tdf = skmob.TrajDataFrame(damage_area_pre, latitude='lat', longitude='lng', datetime='datetime', user_id='uid')
damage_area_dur_tdf = skmob.TrajDataFrame(damage_area_dur, latitude='lat', longitude='lng', datetime='datetime', user_id='uid')
damage_area_post_tdf = skmob.TrajDataFrame(damage_area_post, latitude='lat', longitude='lng', datetime='datetime', user_id='uid')

### First metric of interest: Radius of Gyration
* This is essentially a measure of the extent of a user's travel
* Formally, it is the root-mean-square distance of a user's recorded locations from their center of mass: $r_g = \sqrt{\frac{1}{N}\sum_{i=1}^{N} \operatorname{dist}(r_i, r_{cm})^2}$

In [None]:
from skmob.measures.individual import jump_lengths, radius_of_gyration, home_location

# Radius of gyration per user, computed separately inside every (date, hour) group
# of the pre-event, event-day and post-event trajectories.
damage_area_pre_tdf_rg, damage_area_dur_tdf_rg, damage_area_post_tdf_rg = (
    period_tdf.groupby(['date', 'hour']).apply(lambda bucket: radius_of_gyration(bucket))
    for period_tdf in (damage_area_pre_tdf, damage_area_dur_tdf, damage_area_post_tdf)
)

In [None]:
def _summarise_rg(rg):
    """Summarise per-user radii of gyration for each (date, hour) group.

    `rg` is indexed by (date, hour, ...) and has 'uid' and 'radius_of_gyration' columns.
    Returns a frame indexed by (date, hour) with the mean radius ('radius_of_gyration'),
    its standard deviation ('std') and the number of users in the group ('uid_count').
    """
    by_hour = rg.groupby(level=[0, 1])
    # Aggregate only the metric column: averaging 'uid' is meaningless and fails
    # outright when the ids are not numeric.
    summary = by_hour[['radius_of_gyration']].mean()
    summary['std'] = by_hour['radius_of_gyration'].std()
    summary['uid_count'] = by_hour['uid'].count()
    return summary


damage_area_pre_tdf_rg_df = _summarise_rg(damage_area_pre_tdf_rg)
damage_area_dur_tdf_rg_df = _summarise_rg(damage_area_dur_tdf_rg)
damage_area_post_tdf_rg_df = _summarise_rg(damage_area_post_tdf_rg)

In [None]:
# Move the index levels of each summary back into ordinary columns
damage_area_pre_tdf_rg_df, damage_area_dur_tdf_rg_df, damage_area_post_tdf_rg_df = (
    summary.reset_index()
    for summary in (
        damage_area_pre_tdf_rg_df,
        damage_area_dur_tdf_rg_df,
        damage_area_post_tdf_rg_df,
    )
)

In [None]:
def _add_hourly_datetime(summary):
    """Return `summary` with a 'datetime' column built from 'date' and 'hour', in time order.

    'hour' is a string, so rows grouped on it come out as '0', '1', '10', '11', ... '2', ...
    Sorting on the parsed timestamp puts them in chronological order, which is what a
    line plot over 'datetime' needs.
    """
    stamped = summary.copy()
    stamped['datetime'] = pd.to_datetime(stamped['date'] + ' ' + stamped['hour'] + ':00:00')
    return stamped.sort_values('datetime').reset_index(drop=True)


# Create datetime column
damage_area_pre_tdf_rg_df = _add_hourly_datetime(damage_area_pre_tdf_rg_df)
damage_area_dur_tdf_rg_df = _add_hourly_datetime(damage_area_dur_tdf_rg_df)
damage_area_post_tdf_rg_df = _add_hourly_datetime(damage_area_post_tdf_rg_df)

In [None]:
# Order the pre-event summary by hour of day. 'hour' holds strings, so sort on the
# integer value to get 0, 1, 2, ... instead of the lexicographic '0', '1', '10', ...
damage_area_pre_tdf_rg_df_hour = damage_area_pre_tdf_rg_df.sort_values(
    by=['hour'], key=lambda hours: hours.astype(int)
)

In [None]:
# Plot the hourly mean radius of gyration of the three periods on one time axis (top),
# with the number of users contributing to each hour underneath (bottom).
rg_periods = [
    (damage_area_pre_tdf_rg_df, 'Before May 15th'),
    (damage_area_dur_tdf_rg_df, 'May 15th'),
    (damage_area_post_tdf_rg_df, 'After May 15th'),
]

fig, ax = plt.subplots(2, 1, figsize=(15, 10), sharex=True)
for period_df, period_label in rg_periods:
    period_df.plot(x='datetime', y='radius_of_gyration', ax=ax[0], label=period_label)
    # Shade one standard deviation either side of the mean
    ax[0].fill_between(
        period_df['datetime'],
        period_df['radius_of_gyration'] - period_df['std'],
        period_df['radius_of_gyration'] + period_df['std'],
        alpha=0.3,
    )

# Number of unique users in each hour
for period_df, _ in rg_periods:
    period_df.plot(x='datetime', y='uid_count', color='black', label='', alpha=0.3, ax=ax[1])

ax[0].set_title('Radius of Gyration')
# skmob reports the radius of gyration in kilometers, not meters
ax[0].set_ylabel('Radius of Gyration (km)')
ax[0].set_ylim((0, None))
ax[1].set_xlabel('Date')
ax[1].set_ylabel('Number of unique users')
ax[1].set_ylim((0, None))
ax[1].legend()

Ok, there doesn't seem to be a clear spike. Besides, this might not be the best metric to analyze in a constrained geographic box. It's likely that many of the users traveling through the tornado area are commuters passing through.

### Second metric: Visits per time unit (we will group by hour)
* This is a clear indicator of whether travel spiked on the timeframe of interest

In [None]:
from skmob.measures.collective import visits_per_time_unit


def _hourly_visits(period_tdf):
    """Apply visits_per_time_unit (1-hour unit) inside each (date, hour) group of `period_tdf`."""
    return period_tdf.groupby(['date', 'hour']).apply(
        lambda bucket: visits_per_time_unit(bucket, time_unit='1h')
    )


damage_area_pre_tdf_vpt = _hourly_visits(damage_area_pre_tdf)
damage_area_dur_tdf_vpt = _hourly_visits(damage_area_dur_tdf)
damage_area_post_tdf_vpt = _hourly_visits(damage_area_post_tdf)

In [None]:
# Move the index levels of each visit table back into ordinary columns
damage_area_pre_tdf_vpt, damage_area_dur_tdf_vpt, damage_area_post_tdf_vpt = (
    visits.reset_index()
    for visits in (
        damage_area_pre_tdf_vpt,
        damage_area_dur_tdf_vpt,
        damage_area_post_tdf_vpt,
    )
)

In [None]:
# Plot the number of visits per time unit for the three periods on the same axes
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for period_vpt, period_label in (
    (damage_area_pre_tdf_vpt, 'Before May 15th'),
    (damage_area_dur_tdf_vpt, 'During May 15th'),
    (damage_area_post_tdf_vpt, 'After May 15th'),
):
    # The rows come from a groupby on (date, hour) where 'hour' is a string, so they
    # are not in time order; sort first so the line is drawn chronologically.
    period_vpt.sort_values('datetime').plot(x='datetime', y='n_visits', ax=ax, label=period_label)

ax.set_title('Visits per time unit')
ax.set_xlabel('Date')
ax.set_ylabel('Visits per time unit')
ax.set_ylim((0, None))
ax.legend()

Again, the trend is not obvious on the daily level, but let's aggregate further by hour

In [None]:
def _hourly_visit_profile(vpt):
    """Mean ('n_visits') and standard deviation ('std') of n_visits per hour of day.

    Only 'n_visits' is aggregated; the other columns of `vpt` are strings or timestamps
    and have no meaningful mean.
    """
    by_hour_of_day = vpt.groupby(vpt['datetime'].dt.hour)['n_visits']
    return by_hour_of_day.agg(n_visits='mean', std='std')


# Average the visit counts by hour of day for each period
damage_area_pre_tdf_vpt_df = _hourly_visit_profile(damage_area_pre_tdf_vpt)
damage_area_dur_tdf_vpt_df = _hourly_visit_profile(damage_area_dur_tdf_vpt)
damage_area_post_tdf_vpt_df = _hourly_visit_profile(damage_area_post_tdf_vpt)

# Plot the hourly profiles on the same axes, as we did with the radius of gyration
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for profile, profile_label in (
    (damage_area_pre_tdf_vpt_df, 'Before May 15th'),
    (damage_area_dur_tdf_vpt_df, 'During May 15th'),
    (damage_area_post_tdf_vpt_df, 'After May 15th'),
):
    profile.plot(y='n_visits', ax=ax, label=profile_label)
    # Fill the area between the mean and the standard deviation
    ax.fill_between(
        profile.index,
        profile['n_visits'] - profile['std'],
        profile['n_visits'] + profile['std'],
        alpha=0.2,
    )

# Mark the tornado on the hour-of-day axis. Clock times must be converted to decimal
# hours: 13:37 is 13 + 37/60 = 13.62, not 13.37.
# NOTE(review): the end time is read as 13:49 from the original value 13.49 — confirm.
# NOTE(review): 'datetime' was parsed from epoch seconds with no timezone, so these hours
# are presumably UTC — confirm the tornado times are expressed in the same timezone.
tornado_start_hour = 13 + 37 / 60
tornado_end_hour = 13 + 49 / 60
ax.axvline(x=tornado_start_hour, color='black', linestyle='--', label='Tornado begins')
ax.axvline(x=tornado_end_hour, color='black', linestyle='-', label='Tornado ends')

ax.legend()

ax.set_title('Visits per time unit')
ax.set_xlabel('Hour')
ax.set_ylim((0, None))

Wow! There appears to be a clear spike in visits per hour right after the tornado. This is definitely an indicator that something may be up.

### Location Frequency

In [None]:
from skmob.measures.individual import location_frequency


def _daily_location_frequency(period_tdf):
    """Apply location_frequency to each date of `period_tdf` and flatten the index into columns."""
    per_day = period_tdf.groupby(['date']).apply(
        lambda day: location_frequency(day, show_progress=True)
    )
    return per_day.reset_index()


damage_area_pre_tdf_lf = _daily_location_frequency(damage_area_pre_tdf)
damage_area_dur_tdf_lf = _daily_location_frequency(damage_area_dur_tdf)
damage_area_post_tdf_lf = _daily_location_frequency(damage_area_post_tdf)

## Detecting "stay locations" in the bounding box. 
* To qualify, a user needs to spend a certain amount of time within a certain radius (the two parameters of the stay detection algorithm)
* We can further cluster the detected locations by spatial proximity.

In [None]:
from skmob.preprocessing import detection
from skmob.preprocessing import clustering


def _clustered_stays(period_tdf):
    """Detect stay locations in `period_tdf` and cluster them; returns (stays, clustered stays)."""
    as_tdf = skmob.TrajDataFrame(
        period_tdf, longitude='lng', latitude='lat', datetime='datetime', user_id='uid'
    )
    stays = detection.stay_locations(as_tdf)
    return stays, clustering.cluster(stays)


# Pre-event and event-day stay locations (library default parameters)
pre_stdf, pre_stdf_c = _clustered_stays(damage_area_pre_tdf)
dur_stdf, dur_stdf_c = _clustered_stays(damage_area_dur_tdf)

In [None]:
# Preview the clustered stay locations for the event day
dur_stdf_c.head()

In [None]:
def _suffixed_lf(lf, suffix):
    """Shorten 'location_frequency' to 'lf' and append `suffix` to every column name."""
    return lf.rename(columns={'location_frequency': 'lf'}).add_suffix(suffix)


# Put the pre-event ('1') and event-day ('2') location frequencies side by side.
# The column names are derived from the frames themselves, so this works whether or not
# a 'geometry' column has been added to them.
# NOTE(review): rows are paired purely by position in the two tables — confirm that is intended.
ahh1 = pd.concat(
    [_suffixed_lf(damage_area_pre_tdf_lf, '1'), _suffixed_lf(damage_area_dur_tdf_lf, '2')],
    axis=1,
)

In [None]:
import numpy as np
tolerance = 1e-5

# Two rows describe the same location only when BOTH coordinates agree within the
# tolerance (np.any would also accept rows where just the latitude or just the longitude
# happens to match). Rows with a missing coordinate never match.
same_location = np.isclose(
    ahh1[['lat1', 'lng1']].to_numpy(dtype=float),
    ahh1[['lat2', 'lng2']].to_numpy(dtype=float),
    atol=tolerance,
).all(axis=1)
ahh1[same_location]

In [None]:
# Mean event-day location frequency for each distinct (lat, lng) pair
damage_area_dur_tdf_lf.groupby(['lat', 'lng'])['location_frequency'].mean()

In [None]:
# FIXME(review): this cell cannot run as written — `.apply` is called on the list ['date']
# instead of on the groupby, and np.isclose receives only one of its two required arrays.
# The intended comparison cannot be recovered from this cell, so the code is left unchanged.
damage_area_pre_tdf_lf.groupby(['date'].apply(lambda x: np.all(np.isclose([x.lat, x.lng, ]))))

In [None]:
# Row-wise check that the test and predicted (lat, lng, metric) triples agree within
# `tolerance`, followed by the fraction of rows that match.
# NOTE(review): metric_tot and metric_name are not defined in the visible part of this
# notebook — confirm where they come from before running this cell.
metric_match = metric_tot.apply(lambda x: np.all(np.isclose([x.test_lat, x.test_lng, x[f'test_{metric_name.lower()}']], 
            [x.pred_lat, x.pred_lng, x[f'pred_{metric_name.lower()}']], atol=tolerance)), axis=1)
metric_perc = np.count_nonzero(metric_match) / len(metric_match)

In [None]:
# Builds a groupby over 'date' without aggregating; the cell only displays the groupby object.
damage_area_pre_tdf_lf.groupby(['date'])

In [None]:
# Turn into geodataframes, using the same CRS as the point data they were derived from
damage_area_pre_tdf_lf_gdf = gpd.GeoDataFrame(damage_area_pre_tdf_lf, crs='EPSG:4269', geometry=gpd.points_from_xy(damage_area_pre_tdf_lf['lng'], damage_area_pre_tdf_lf['lat']))
damage_area_dur_tdf_lf_gdf = gpd.GeoDataFrame(damage_area_dur_tdf_lf, crs='EPSG:4269', geometry=gpd.points_from_xy(damage_area_dur_tdf_lf['lng'], damage_area_dur_tdf_lf['lat']))
damage_area_post_tdf_lf_gdf = gpd.GeoDataFrame(damage_area_post_tdf_lf, crs='EPSG:4269', geometry=gpd.points_from_xy(damage_area_post_tdf_lf['lng'], damage_area_post_tdf_lf['lat']))

# One panel per period, coloured by location frequency on a shared 0-1 scale
lf_panels = [
    (damage_area_pre_tdf_lf_gdf, 'Location frequency: before May 15th'),
    (damage_area_dur_tdf_lf_gdf, 'Location frequency: May 15th'),
    (damage_area_post_tdf_lf_gdf, 'Location frequency: after May 15th'),
]
fig, axs = plt.subplots(3, 1, figsize=(10, 10))
for panel_ax, (panel_gdf, panel_title) in zip(axs, lf_panels):
    panel_gdf.plot(column='location_frequency', ax=panel_ax, legend=True, cmap='Reds', markersize=0.7, vmin=0, vmax=1)
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
# Same per-date analysis as for location frequency, now with random location entropy
from skmob.measures.collective import random_location_entropy


def _daily_random_location_entropy(period_tdf):
    """Apply random_location_entropy to each date of `period_tdf` and flatten the index into columns."""
    per_day = period_tdf.groupby(['date']).apply(
        lambda day: random_location_entropy(day, show_progress=True)
    )
    return per_day.reset_index()


damage_area_pre_tdf_rle = _daily_random_location_entropy(damage_area_pre_tdf)
damage_area_dur_tdf_rle = _daily_random_location_entropy(damage_area_dur_tdf)
damage_area_post_tdf_rle = _daily_random_location_entropy(damage_area_post_tdf)

In [None]:
# Attach point geometries built from the lng/lat columns to each entropy table
pre_rle_points = gpd.points_from_xy(damage_area_pre_tdf_rle['lng'], damage_area_pre_tdf_rle['lat'])
dur_rle_points = gpd.points_from_xy(damage_area_dur_tdf_rle['lng'], damage_area_dur_tdf_rle['lat'])
post_rle_points = gpd.points_from_xy(damage_area_post_tdf_rle['lng'], damage_area_post_tdf_rle['lat'])

damage_area_pre_tdf_rle_gdf = gpd.GeoDataFrame(damage_area_pre_tdf_rle, geometry=pre_rle_points)
damage_area_dur_tdf_rle_gdf = gpd.GeoDataFrame(damage_area_dur_tdf_rle, geometry=dur_rle_points)
damage_area_post_tdf_rle_gdf = gpd.GeoDataFrame(damage_area_post_tdf_rle, geometry=post_rle_points)

In [None]:
# Plot random location entropy for the post-event period, coloured by entropy value
damage_area_post_tdf_rle_gdf.plot(column='random_location_entropy', legend=True)

In [None]:
# One panel per period, coloured by random location entropy
rle_panels = [
    (damage_area_pre_tdf_rle_gdf, 'Random location entropy: before May 15th'),
    (damage_area_dur_tdf_rle_gdf, 'Random location entropy: May 15th'),
    (damage_area_post_tdf_rle_gdf, 'Random location entropy: after May 15th'),
]

# Random location entropy is log2 of the number of distinct visitors, so it is not
# bounded by 1; take the shared colour scale from the data instead of clipping at 1.
rle_max = pd.concat([panel_gdf['random_location_entropy'] for panel_gdf, _ in rle_panels]).max()
# Fall back to 1 when there is no positive value to scale by
rle_vmax = rle_max if rle_max > 0 else 1

fig, axs = plt.subplots(3, 1, figsize=(10, 10))
for panel_ax, (panel_gdf, panel_title) in zip(axs, rle_panels):
    panel_gdf.plot(column='random_location_entropy', ax=panel_ax, legend=True, cmap='Reds', markersize=0.7, vmin=0, vmax=rle_vmax)
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
from skmob.measures.collective import visits_per_time_unit

# Recomputes the visit tables exactly as the earlier "visits per time unit" cell does.
# Note that this overwrites the reset_index() versions of these three variables.
damage_area_pre_tdf_vpt, damage_area_dur_tdf_vpt, damage_area_post_tdf_vpt = (
    period_tdf.groupby(['date', 'hour']).apply(
        lambda bucket: visits_per_time_unit(bucket, time_unit='1h')
    )
    for period_tdf in (damage_area_pre_tdf, damage_area_dur_tdf, damage_area_post_tdf)
)

### Let's look at the comparison sites

In [None]:
# Load comparison sites
comp1 = pd.read_csv('data/mobile/OK_comp1_May02-22.csv')

# Build point geometries (EPSG:4269) from the lon/lat columns
comp1_gpd = gpd.GeoDataFrame(
    comp1,
    crs='EPSG:4269',
    geometry=gpd.points_from_xy(comp1['lon'], comp1['lat']),
)

# Exclude points outside of Oklahoma: an inner join keeps only the pings that fall
# within a tract polygon (a left join keeps every ping and therefore filters nothing).
comp1_gpd = gpd.sjoin(comp1_gpd, OK_tracts, how='inner', predicate='within')

# Create datetime column using timestamps (interpreted as seconds since the epoch)
comp1_gpd['datetime'] = pd.to_datetime(comp1_gpd['timestamp'], unit='s')

# Keep first 13 columns; copy so that adding 'date' below writes to an independent
# frame rather than to a slice of the joined one.
# NOTE(review): the positional cut-off assumes 'datetime' sits within the first 13 columns — confirm.
comp1_gpd = comp1_gpd.iloc[:, :13].copy()
# Create a date column
comp1_gpd['date'] = comp1_gpd['datetime'].dt.date

In [None]:
# Same preprocessing as for the damage area, applied to the comparison site:
# per-user trajectories, a 400 km/h maximum-speed filter, then compression of points
# within 200 m of each other in the same trajectory.
# Note that tdf, f_tdf and fc_tdf are reassigned here.
tdf = skmob.TrajDataFrame(
    comp1_gpd,
    latitude='lat',
    longitude='lon',
    datetime='datetime',
    user_id='uid',
)
f_tdf = skmob.preprocessing.filtering.filter(tdf, max_speed_kmh=400, include_loops=False)
fc_tdf = skmob.preprocessing.compression.compress(f_tdf, spatial_radius_km=0.2)

# Let's see how many points we eliminated
n_original, n_filtered, n_compressed = len(tdf), len(f_tdf), len(fc_tdf)
print('Original number of points: ', n_original)
print('Number of points after filtering: ', n_filtered)
print('Number of points after compression: ', n_compressed)