In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

%matplotlib inline

# Import data
df = pd.read_csv("data/all_data_3.csv");

# Set user groups from enabled features

# Abbreviations:
# - BG: User got no explanations (base group).
# - AE: User got explanations about the routing algorithm (alogorithm explanations).
# - NE: User got explanations depending on the active navigation (navigation explanations).
# - GE: User got both types of explanations (grouped explanations).

df.loc[(df['LowAccuracyCounter'].isna()), 'LowAccuracyCounter'] = 0
df.loc[(df['MaxLowAccuracy'].isna()), 'MaxLowAccuracy'] = 0
df.loc[(df['NumberOfUnnormalRoutes'].isna()), 'NumberOfUnnormalRoutes'] = 0
df.loc[(df['OffRoutePerTenKilometers'].isna()), 'OffRoutePerTenKilometers'] = 0
df.loc[(df['DistanceTraveledInMeters'].isna()), 'DistanceTraveledInMeters'] = 0
df.loc[(df['NumberOfTimesSeenUserCountExplanationShort'].isna()), 'NumberOfTimesSeenUserCountExplanationShort'] = 0
df.loc[(df['NumberOfTimesSeenUserCountExplanationLong'].isna()), 'NumberOfTimesSeenUserCountExplanationLong'] = 0
df.loc[(df['NumberOfTimesSeenCollaborativeRoutingExplanation'].isna()), 'NumberOfTimesSeenCollaborativeRoutingExplanation'] = 0

df["LowAccuracyPerKilometer"] = df["LowAccuracyCounter"] / (df["DistanceTraveledInMeters"] / 1000)
df.loc[(df['LowAccuracyPerKilometer'].isna()), 'LowAccuracyPerKilometer'] = 0

df.loc[(df['GpsQuality'] == False) & (df['TrafficVolume'] == False) & (df['RouteExplanation'] == False) & (df['UserCount'] == False), 'task'] = "BG"
df.loc[(df['GpsQuality'] == False) & (df['TrafficVolume'] == False) & (df['RouteExplanation'] == True) & (df['UserCount'] == True), 'task'] = "AE"
df.loc[(df['GpsQuality'] == True) & (df['TrafficVolume'] == True) & ((df['MaxLowAccuracy'] > 0) | (df['NumberOfUnnormalRoutes'] > 0)) & (df['RouteExplanation'] == False) & (df['UserCount'] == False), 'task'] = "NE"
df.loc[(df['GpsQuality'] == True) & (df['TrafficVolume'] == True) & ((df['MaxLowAccuracy'] > 0) | (df['NumberOfUnnormalRoutes'] > 0)) & (df['RouteExplanation'] == True) & (df['UserCount'] == True), 'task'] = "GE"
df.loc[(df['GpsQuality'] == True) & (df['TrafficVolume'] == True) & ((df['MaxLowAccuracy'] <= 0) & (df['NumberOfUnnormalRoutes'] <= 0)) & (df['RouteExplanation'] == False) & (df['UserCount'] == False), 'task'] = "BG"
df.loc[(df['GpsQuality'] == True) & (df['TrafficVolume'] == True) & ((df['MaxLowAccuracy'] <= 0) & (df['NumberOfUnnormalRoutes'] <= 0)) & (df['RouteExplanation'] == True) & (df['UserCount'] == True), 'task'] = "AE"

df.loc[(df['task'] == "BG"), 'group_named'] = "Gruppe 1"
df.loc[(df['task'] == "AE"), 'group_named'] = "Gruppe 2"
df.loc[(df['task'] == "NE"), 'group_named'] = "Gruppe 3"
df.loc[(df['task'] == "GE"), 'group_named'] = "Gruppe 4"

df = df[df['task'].notna()]

# Remove fields with redundant information
df = df.drop(columns=['GpsQuality', 'TrafficVolume', 'RouteExplanation', 'UserCount'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41540 entries, 0 to 41539
Data columns (total 17 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   DeviceId                                          41540 non-null  object 
 1   numberOfRoutes                                    41540 non-null  int64  
 2   InitialTrafficVolume                              41540 non-null  object 
 3   DistanceTraveledInMeters                          41540 non-null  int64  
 4   OffRoutePerTenKilometers                          41540 non-null  float64
 5   LowAccuracyCounter                                41540 non-null  int64  
 6   Rating                                            696 non-null    float64
 7   NumberOfTimesSeenUserCountExplanationShort        41540 non-null  float64
 8   NumberOfTimesSeenUserCountExplanationLong         41540 non-null  float64
 9   NumberOfTimesSeen

In [2]:
# Filter for unusable data
filtered_data = df

filtered_data = filtered_data[filtered_data['DistanceTraveledInMeters']>5000]
filtered_data = filtered_data[(filtered_data['task'] == "AE") | (filtered_data['task'] == "GE")]

filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7153 entries, 1 to 41344
Data columns (total 17 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   DeviceId                                          7153 non-null   object 
 1   numberOfRoutes                                    7153 non-null   int64  
 2   InitialTrafficVolume                              7153 non-null   object 
 3   DistanceTraveledInMeters                          7153 non-null   int64  
 4   OffRoutePerTenKilometers                          7153 non-null   float64
 5   LowAccuracyCounter                                7153 non-null   int64  
 6   Rating                                            308 non-null    float64
 7   NumberOfTimesSeenUserCountExplanationShort        7153 non-null   float64
 8   NumberOfTimesSeenUserCountExplanationLong         7153 non-null   float64
 9   NumberOfTimesSeenC

In [3]:

filtered_data.loc[(filtered_data['NumberOfTimesSeenUserCountExplanationShort'] > 4), 'NumberOfTimesSeenUserCountExplanationShort'] = 5
filtered_data.loc[(filtered_data['NumberOfTimesSeenUserCountExplanationLong'] > 4), 'NumberOfTimesSeenUserCountExplanationLong'] = 5
filtered_data.loc[(filtered_data['NumberOfTimesSeenCollaborativeRoutingExplanation'] > 4), 'NumberOfTimesSeenCollaborativeRoutingExplanation'] = 5

filtered_data.loc[(filtered_data['NumberOfTimesSeenUserCountExplanationLong'] != 0), 'UserCountExplanationLong'] = True
filtered_data.loc[(filtered_data['NumberOfTimesSeenUserCountExplanationLong'] == 0), 'UserCountExplanationLong'] = False

filtered_data.loc[(filtered_data['NumberOfTimesSeenCollaborativeRoutingExplanation'] != 0), 'CollaborativeRoutingExplanation'] = True
filtered_data.loc[(filtered_data['NumberOfTimesSeenCollaborativeRoutingExplanation'] == 0), 'CollaborativeRoutingExplanation'] = False

In [4]:
g = filtered_data.groupby(['DeviceId', 'NumberOfTimesSeenUserCountExplanationShort', 'NumberOfTimesSeenUserCountExplanationLong', 'NumberOfTimesSeenCollaborativeRoutingExplanation']).count()
data = g.reset_index()

In [5]:
g = data.groupby(['NumberOfTimesSeenUserCountExplanationShort']).count()
g.reset_index().head()

Unnamed: 0,NumberOfTimesSeenUserCountExplanationShort,DeviceId,NumberOfTimesSeenUserCountExplanationLong,NumberOfTimesSeenCollaborativeRoutingExplanation,numberOfRoutes,InitialTrafficVolume,DistanceTraveledInMeters,OffRoutePerTenKilometers,LowAccuracyCounter,Rating,ActualDrivingTime,EstimatedDrivingTime,MaxLowAccuracy,NumberOfUnnormalRoutes,LowAccuracyPerKilometer,task,group_named,UserCountExplanationLong,CollaborativeRoutingExplanation
0,0.0,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723,1723
1,1.0,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28
2,2.0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
3,3.0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
4,4.0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3


In [6]:
g = data.groupby(['NumberOfTimesSeenUserCountExplanationLong']).count()
g.reset_index().head()

Unnamed: 0,NumberOfTimesSeenUserCountExplanationLong,DeviceId,NumberOfTimesSeenUserCountExplanationShort,NumberOfTimesSeenCollaborativeRoutingExplanation,numberOfRoutes,InitialTrafficVolume,DistanceTraveledInMeters,OffRoutePerTenKilometers,LowAccuracyCounter,Rating,ActualDrivingTime,EstimatedDrivingTime,MaxLowAccuracy,NumberOfUnnormalRoutes,LowAccuracyPerKilometer,task,group_named,UserCountExplanationLong,CollaborativeRoutingExplanation
0,0.0,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665
1,1.0,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78
2,2.0,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
3,3.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
4,4.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [7]:
g = data.groupby(['NumberOfTimesSeenCollaborativeRoutingExplanation']).count()
g.reset_index().head()

Unnamed: 0,NumberOfTimesSeenCollaborativeRoutingExplanation,DeviceId,NumberOfTimesSeenUserCountExplanationShort,NumberOfTimesSeenUserCountExplanationLong,numberOfRoutes,InitialTrafficVolume,DistanceTraveledInMeters,OffRoutePerTenKilometers,LowAccuracyCounter,Rating,ActualDrivingTime,EstimatedDrivingTime,MaxLowAccuracy,NumberOfUnnormalRoutes,LowAccuracyPerKilometer,task,group_named,UserCountExplanationLong,CollaborativeRoutingExplanation
0,0.0,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412
1,1.0,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254
2,2.0,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68
3,3.0,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19
4,4.0,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10


In [13]:
deviceIds = filtered_data["DeviceId"].to_numpy()

deviceIds

array(['6f9f3afd-06c1-480f-b8fd-a7b726266367',
       'fabc99d3-b91b-42b5-b068-d6d90c3233f9',
       'e7146c0e-d87c-40e4-a1d4-44fe276ad1aa', ...,
       '538a53ef-091e-4e81-a2d7-b972b14ba99e',
       '17d7e377-d767-4c04-bc32-ebac9f7591bc',
       'c4f79fbc-52a1-4bd9-9dad-93fa8cb2f22d'], dtype=object)

In [59]:
explanations = pd.read_csv("data/explanations.csv");

explanations.count()

DeviceId                   1748
Explanation.explanation    1748
Explanation.durationMs     1748
dtype: int64

In [60]:
explanations = explanations[explanations.DeviceId.isin(deviceIds)]

explanations.count()

DeviceId                   542
Explanation.explanation    542
Explanation.durationMs     542
dtype: int64

In [61]:
explanations.loc[(explanations['Explanation.durationMs'] > 1500), 'Read'] = True
explanations.loc[(explanations['Explanation.durationMs'] <= 1500), 'Read'] = False

In [63]:
g = explanations.groupby(['Explanation.explanation', 'Read']).count()
test = g.reset_index().head()

test

Unnamed: 0,Explanation.explanation,Read,DeviceId,Explanation.durationMs
0,ROUTE_PREVIEW_EXPLANATION_LONG,True,13,13
1,USER_COUNT_EXPLANATION_LONG,False,3,3
2,USER_COUNT_EXPLANATION_LONG,True,125,125
3,USER_COUNT_EXPLANATION_SHORT,False,23,23
4,USER_COUNT_EXPLANATION_SHORT,True,378,378
