In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path of the trajectory dataset to analyse. Point this at another parquet
# file (e.g. 'rome.parquet') to run the same notebook on a different dataset.
DATASET_PATH = 'geolife.parquet'
df = pd.read_parquet(DATASET_PATH)

# DataFrame.info() prints its summary and returns None, so it is called
# directly: wrapping it in display() would additionally render "None".
df.info()

In [None]:
# Keep only the columns needed for the temporal statistics and order the
# samples chronologically. Selecting with a column list and sorting both
# return new objects, so `df` itself is left untouched.
view = (
    df.loc[:, ['user', 'traj_id', 'time']]
    .sort_values(by='time')
    .reset_index(drop=True)
)

display(view)
# info() prints its summary and returns None; calling it bare avoids the
# stray "None" that display(view.info()) would render.
view.info()

#### Computing trajectory-wide statistics

In [None]:
# One row per trajectory, keyed by (user, traj_id). sort=False keeps the
# groups in order of first appearance instead of sorting the keys.
traj_keys = ['user', 'traj_id']
gb = view.groupby(traj_keys, sort=False)
times = gb['time']

traj_info = pd.DataFrame()
traj_info['start'] = times.min()
traj_info['end'] = times.max()
traj_info['duration'] = traj_info['end'] - traj_info['start']
traj_info['num_samples'] = times.size()

# Time elapsed between each sample and the preceding row of the same
# trajectory (NaT for the first sample of every trajectory). The deltas are
# grouped directly by the key columns rather than being written back into
# `view` and read through `gb`, which was built before that column existed.
sampling_deltas = view['time'] - times.shift(1)
traj_info['avg_sampling'] = (
    sampling_deltas
    .groupby([view[key] for key in traj_keys], sort=False)
    .mean()
)

# Drop the per-sample data that is no longer needed. Re-running this cell
# therefore requires re-running the cell that builds `view` first.
del view, times, sampling_deltas

display(traj_info)
# info() prints its summary and returns None; no display() wrapper needed.
traj_info.info()

In [None]:
# Turn the (user, traj_id) index levels into ordinary columns.
traj_info = traj_info.reset_index(drop=False)
display(traj_info)

# Order each user's trajectories by start time, so that the previous row
# within a user group is the trajectory that started immediately before.
traj_info = traj_info.sort_values(by=['user', 'start'])

# `gb` groups this exact `traj_info` object; do not rebind `traj_info`
# after this point while `gb` is still in use.
gb = traj_info.groupby(['user'], sort=False)

# Gap = start of this trajectory minus the end of the same user's previous
# trajectory (NaT for each user's first trajectory).
traj_info['gap_trajectory'] = traj_info['start'] - gb['end'].shift(1)

display(traj_info)
# info() prints its summary and returns None; no display() wrapper needed.
traj_info.info()

#### Computing user-wide statistics

In [None]:
# Group the per-trajectory table by user. The grouping is rebuilt here so
# this cell depends only on `traj_info` (with its 'gap_trajectory' column),
# not on a groupby object left over from an earlier cell.
gb = traj_info.groupby(['user'], sort=False)

# One row per user.
stats_user = pd.DataFrame()
stats_user['begin'] = gb['start'].min()
stats_user['end'] = gb['end'].max()
stats_user['timespan'] = stats_user['end'] - stats_user['begin']
stats_user['num_trajectories'] = gb['traj_id'].size()
stats_user['avg_duration_trajectories'] = gb['duration'].mean()
stats_user['avg_num_samples_trajectories'] = gb['num_samples'].mean()
stats_user['avg_sampling_trajectories'] = gb['avg_sampling'].mean()
# Timespan divided by trajectory count, i.e. the average time per trajectory.
stats_user['frequency_trajectories'] = stats_user['timespan'] / stats_user['num_trajectories']
stats_user['avg_gap_trajectories'] = gb['gap_trajectory'].mean()

# Printed before the fill below, so the non-null counts show how many users
# have undefined averages. info() prints and returns None, hence no display().
stats_user.info()

# Take into account those users who have just 1 trajectory, or all the
# trajectories with just 1 sample: their averages are NaT and are replaced
# with a zero Timedelta.
# NOTE(review): these zeros take part in any mean/std computed over these
# columns afterwards - confirm that is the intended treatment.
zero_delta = pd.Timedelta(0)
for column in ('avg_sampling_trajectories', 'avg_gap_trajectories'):
    stats_user[column] = stats_user[column].fillna(zero_delta)

#### Computing dataset-wide statistics

In [None]:
# info() prints its summary and returns None; calling it bare avoids the
# stray "None" that display(stats_user.info()) would render.
stats_user.info()


def _mean_std(column, decimals=None):
    """Return (mean, std) of a `stats_user` column across users.

    When `decimals` is given, both values are rounded to that many digits.
    """
    series = stats_user[column]
    mean, std = series.mean(), series.std()
    if decimals is not None:
        mean, std = round(mean, decimals), round(std, decimals)
    return mean, std


def _num_users_spanning(days):
    """Count the users whose timespan is at least `days` days."""
    return int((stats_user['timespan'] >= pd.Timedelta(days=days)).sum())


num_users = len(stats_user)
num_trajectories = stats_user['num_trajectories'].sum()

avg_timespan, std_timespan = _mean_std('timespan')
avg_num_trajs, std_num_trajs = _mean_std('num_trajectories', decimals=2)
avg_duration_trajs, std_duration_trajs = _mean_std('avg_duration_trajectories')
avg_num_samples_trajs, std_num_samples_trajs = _mean_std('avg_num_samples_trajectories', decimals=2)
avg_sampling_trajs, std_sampling_trajs = _mean_std('avg_sampling_trajectories')
avg_freq_trajs, std_freq_trajs = _mean_std('frequency_trajectories')
avg_gap_trajs, std_gap_trajs = _mean_std('avg_gap_trajectories')

# Thresholds are inclusive (>=): 28 days, 7 days and 1 day.
num_notable_users_month = _num_users_spanning(28)
num_notable_users_week = _num_users_spanning(7)
num_notable_users_day = _num_users_spanning(1)

# Adjacent f-string literals are concatenated into a single report string.
print(
    f"Number of users: {num_users}\n"
    f"Number of trajectories: {num_trajectories}\n"
    f"Time spanned by users: mean => {avg_timespan}, std => {std_timespan}\n\n"
    f"Number of trajectories per user: mean => {avg_num_trajs}, std => {std_num_trajs}\n"
    f"Duration of trajectories: mean => {avg_duration_trajs}, std => {std_duration_trajs}\n"
    f"Average number of samples per trajectory: mean => {avg_num_samples_trajs}, std => {std_num_samples_trajs}\n"
    f"Average sampling rate trajectories: mean => {avg_sampling_trajs}, std => {std_sampling_trajs}\n"
    f"Average frequency of trajectories per user:  mean => {avg_freq_trajs}, std => {std_freq_trajs}\n"
    f"Average gap between trajectories:  mean => {avg_gap_trajs}, std => {std_gap_trajs}\n"
    f"Number of users with data spanning more than: 4 weeks => {num_notable_users_month}, 1 week => {num_notable_users_week}, 1 day => {num_notable_users_day}"
)