In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from rich import print
import warnings
warnings.filterwarnings("ignore")

Import the data

In [2]:
# Load the structured log export, using its LineId column as the row index.
logPath = 'HealthApp_2k.log_structured.csv'
data = pd.read_csv(logPath, index_col='LineId')

Convert the `Time` column to datetime, extract the day of the month into a `Date` column, and sort the dataframe by time

In [3]:
# Layout of the raw Time strings, as a strptime pattern.
timeFormat = '%Y%m%d-%H:%M:%S:%f'

# Parse the timestamps once, store the day-of-month alongside them in `Date`,
# and order the rows chronologically (ascending is the sort_values default).
parsedTime = pd.to_datetime(data['Time'], format=timeFormat)
data = data.assign(Time=parsedTime, Date=parsedTime.dt.day).sort_values('Time')

Checking for missing values

In [4]:
# Number of missing entries in each column; shown as the cell output.
missingPerColumn = data.isna().sum()
missingPerColumn

Time             0
Component        0
Pid              0
Content          0
EventId          0
EventTemplate    0
Date             0
dtype: int64

There are no missing values in the dataset

Calculating the time span covered in the dataset

In [5]:
# Summarise the time span covered by the logs and print it with rich markup.
timeFrame = data['Time'].to_list()

# Take the earliest/latest timestamp directly instead of the first/last row,
# so the result is correct even if `data` is not sorted by Time.
firstStamp = data['Time'].min()
lastStamp = data['Time'].max()
totalTimeForLogs = lastStamp - firstStamp

# Calendar dates (YYYY-MM-DD) of the first and last entries; later cells use
# these in their subplot titles.
StartDate = firstStamp.strftime('%Y-%m-%d')
endDate = lastStamp.strftime('%Y-%m-%d')

# Human-readable start/end timestamps for the printed summary.
startTime = firstStamp.strftime('%Y-%m-%d %H:%M:%S')
endTime = lastStamp.strftime('%Y-%m-%d %H:%M:%S')

# Timedelta.seconds holds only the part of the span below one whole day, so
# it is broken down into hours/minutes/seconds next to Timedelta.days.
days = totalTimeForLogs.days
hours, remainder = divmod(totalTimeForLogs.seconds, 3600)
minutes, seconds = divmod(remainder, 60)

# Distinct day-of-month values and years, comma separated. The years are
# joined with a separator so that several years do not run together.
daysCovered = ','.join(map(str, data['Date'].unique()))
yearsCovered = ','.join(map(str, data['Time'].dt.year.unique()))

print(f"Total Time considered in the logs is [bold]{days}[/bold] days [bold]{hours}[/bold] hours [bold]{minutes}[/bold] minutes and [bold]{seconds}[/bold] seconds\nStarting at [bold]{startTime}[/bold] and Ending at [bold]{endTime}[/bold]")
print(f"Dates covered in the logs [bold]{daysCovered}[/bold] in the year {yearsCovered}")


## Steps Analysis

Steps are counted at EventId `E22`, which logs the total detailed steps in the Content column in the form `getTodayTotalDetailSteps = <*>##<*>##<*>##<*>##<*>##<*>`. Looking at the values, we can deduce that they start in the format `<Unix epoch time>##<steps>`...

So we use the `splitContent` function to extract the step count and add it to the steps dataframe as a new column called `steps`

In [6]:
# Keep only the E22 rows and renumber them from 0.
isStepEvent = data['EventId'].eq('E22')
steps = data.loc[isStepEvent].reset_index(drop=True)

def splitContent(row):
    """Return the second '##'-separated field of ``row`` as an int.

    Raises IndexError when ``row`` contains no '##' separator and ValueError
    when the second field is not an integer literal.
    """
    fields = row.split('##')
    stepField = fields[1]
    return int(stepField)

# Parse the step count out of every Content string into a new `steps` column.
steps['steps'] = steps['Content'].map(splitContent)

Here we use plotly to create line plots of the steps taken over the time considered. There are three plots: the first shows the cumulative steps over the whole dataset, and the other two show the steps taken on each of the dates covered in the dataset. In the cumulative plot we can see when the steps were reset to `0` after midnight

In [7]:
# Step count figure: one full-width plot of every reading on the top row and
# one plot per calendar date (first and last date of the logs) underneath.
stepFigSub = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"colspan": 2}, None], [{}, {}]],
    subplot_titles=['Steps Over Total Time', f'Steps On {StartDate}', f'Steps On {endDate}'],
)

# Select each day's rows once and take BOTH x and y from that selection.
# Pairing the full Time column with a filtered steps column misaligns the
# points. Filtering on the same date strings used in the subplot titles also
# keeps each title and its data in agreement.
stepDay = steps["Time"].dt.strftime('%Y-%m-%d')
stepsOnStart = steps[stepDay == StartDate]
stepsOnEnd = steps[stepDay == endDate]

overallSteps = go.Scatter(x=steps["Time"], y=steps["steps"], mode='lines+markers', showlegend=False)
stepsDay1 = go.Scatter(x=stepsOnStart["Time"], y=stepsOnStart["steps"], mode='lines+markers', showlegend=False)
stepsDay2 = go.Scatter(x=stepsOnEnd["Time"], y=stepsOnEnd["steps"], mode='lines+markers', showlegend=False)

stepFigSub.add_trace(overallSteps, row=1, col=1)
stepFigSub.add_trace(stepsDay1, row=2, col=1)
stepFigSub.add_trace(stepsDay2, row=2, col=2)

# Annotate the first reading whose step count is 0 (the counter reset).
# The annotation is skipped when no such reading exists, instead of raising.
resetTimes = steps.loc[steps['steps'] == 0, 'Time']
if not resetTimes.empty:
    resetTime = resetTimes.iloc[0]
    stepFigSub.add_annotation(
        x=resetTime,
        y=1,
        text=f"Steps Reset at {resetTime.strftime('%Y-%m-%d %H:%M:%S')}",
        showarrow=True,
        arrowhead=1,
        ax=20,
        ay=-20,
        xref="x",
        yref="y",
        row=1,
        col=1
    )

# One shared style for all six axes: no grid, black mirrored border.
axisStyle = dict(showgrid=False, linecolor='black', linewidth=2, mirror=True)

stepFigSub.update_layout(
    height=800,
    width=1200,
    title_text="Step Count Over Time",
    title_font_size=16,

    xaxis=axisStyle,
    yaxis=axisStyle,
    xaxis2=axisStyle,
    yaxis2=axisStyle,
    xaxis3=axisStyle,
    yaxis3=axisStyle,

    font=dict(family="Arial, sans-serif", size=12, color="black"),

    plot_bgcolor='white',
    paper_bgcolor='white',
)

stepFigSub.show()

Using a group-by, we find the last step count recorded on each date and plot these values as a bar chart. As we can see, no bar is visible for the second date because its last recorded step count is zero

In [8]:
# Last step reading logged for each value of the Date column.
lastStepsByDate = steps.groupby('Date')['steps'].last()
stepsPerDate = lastStepsByDate.reset_index()

# One bar per date, labelled with its step count.
fig = px.bar(stepsPerDate, x="Date", y="steps", text="steps")

# Layout options gathered in one mapping and applied in a single call.
barLayout = dict(
    height=600,
    width=1200,
    title='Steps Per Date',
    xaxis_title='Date',
    yaxis_title='Steps',
    legend=dict(title='Steps', x=1, y=1),
    title_font_size=16,
    font=dict(family="Arial, sans-serif", size=12, color="black"),
    xaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    plot_bgcolor='white',
    paper_bgcolor='white',
)
fig.update_layout(**barLayout)

fig.show()

## Calorie Analysis

Calories are counted at EventId `E4`, which logs the total calories (with cache) in the Content column in the form `calculateCaloriesWithCache totalCalories=<*>`. Looking at the values, we can extract the calorie value from the string

So we use the `splitCalories` function to extract the total calories and add it to the calories dataframe as a new column called `calories`

In [9]:
def splitCalories(row):
    """Return the text after the last '=' in ``row`` as an int.

    When ``row`` has no '=', the whole string is converted. Raises ValueError
    if that text is not an integer literal.
    """
    calorieText = row.rsplit('=', 1)[-1]
    return int(calorieText)

# Keep only the E4 rows, renumber them from 0, and parse the calorie total
# out of each Content string into a new `calories` column.
isCalorieEvent = data['EventId'].eq('E4')
calories = data.loc[isCalorieEvent].reset_index(drop=True)
calories['calories'] = calories['Content'].map(splitCalories)

Plot of the total calories over the time period. One issue visible in the plot below is that the total calorie count is abnormally high for an individual: the values are above $1 \times 10^5$, whereas the normal daily calorie intake of a human is 1200-3000. We can also see that the value is not accumulated over the whole timeframe of the logs, as the totalCalories value is reset to 0 at midnight

In [10]:
# Calorie figure: one full-width plot of every reading on the top row and one
# plot per calendar date (first and last date of the logs) underneath.
calorieFigSub = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"colspan": 2}, None], [{}, {}]],
    subplot_titles=['Calories Over Total Time', f'calories On {StartDate}', f'calories On {endDate}'],
)

# Select each day's rows once and take BOTH x and y from that selection.
# Pairing the full Time column with a filtered calories column misaligns the
# points. Filtering on the same date strings used in the subplot titles also
# keeps each title and its data in agreement.
calorieDay = calories["Time"].dt.strftime('%Y-%m-%d')
caloriesOnStart = calories[calorieDay == StartDate]
caloriesOnEnd = calories[calorieDay == endDate]

# Named for what it holds, rather than reusing the step figure's trace name.
overallCalories = go.Scatter(x=calories["Time"], y=calories["calories"], mode='lines+markers', showlegend=False)
caloriesDay1 = go.Scatter(x=caloriesOnStart["Time"], y=caloriesOnStart["calories"], mode='lines+markers', showlegend=False)
caloriesDay2 = go.Scatter(x=caloriesOnEnd["Time"], y=caloriesOnEnd["calories"], mode='lines+markers', showlegend=False)

calorieFigSub.add_trace(overallCalories, row=1, col=1)
calorieFigSub.add_trace(caloriesDay1, row=2, col=1)
calorieFigSub.add_trace(caloriesDay2, row=2, col=2)

# One shared style for all six axes: no grid, black mirrored border.
axisStyle = dict(showgrid=False, linecolor='black', linewidth=2, mirror=True)

calorieFigSub.update_layout(
    height=800,
    width=1200,
    title_text="Calorie Count Over Time",
    title_font_size=16,

    xaxis=axisStyle,
    yaxis=axisStyle,
    xaxis2=axisStyle,
    yaxis2=axisStyle,
    xaxis3=axisStyle,
    yaxis3=axisStyle,

    font=dict(family="Arial, sans-serif", size=12, color="black"),

    plot_bgcolor='white',
    paper_bgcolor='white',
)

calorieFigSub.show()

## Activity Times

To get a measure of the user's active time, we look at the times when a standup was logged. A standup event is logged under event id `E42` in the format `onStandStepChanged <*>`, which shows that the step count changed when the user stood up. The `splitActive` function is used to extract the standup step change count of the day, which is added as a column called `standUp`

In [11]:
# Keep only the E42 rows. An explicit copy is taken because a column is added
# to `active` afterwards; assigning into a bare slice of `data` is chained
# assignment and triggers pandas' SettingWithCopyWarning.
active = data[data['EventId']=='E42'].copy()

def splitActive(row):
    """Return the text after the last space in ``row`` as an int.

    When ``row`` has no space, the whole string is converted. Raises
    ValueError if that text is not an integer literal.
    """
    lastToken = row.rsplit(' ', 1)[-1]
    return int(lastToken)
    
# Parse the trailing number of every Content string into a `standUp` column.
active['standUp'] = active['Content'].map(splitActive)

We create histograms measuring the number of times a user stood up during each time interval of 10 minutes

In [12]:
# Standup histograms: all E42 events on the top row, then one panel for the
# rows whose Date is 23 and one for the rows whose Date is 24.
panelTitles = ['Standup Count Over Total Time', f'Standup Count on {StartDate}', f'Standup Count {endDate}']
panelSpecs = [[{"colspan": 2}, None], [{}, {}]]
activeFig = make_subplots(rows=2, cols=2, specs=panelSpecs, subplot_titles=panelTitles)


def standupHistogram(frame):
    """Histogram trace over the Time column of ``frame`` (nbinsx=20)."""
    return go.Histogram(x=frame["Time"], showlegend=False, nbinsx=20)


overallActive = standupHistogram(active)
activeDay1 = standupHistogram(active[active["Date"] == 23])
activeDay2 = standupHistogram(active[active["Date"] == 24])

# Place each trace in its subplot cell.
for trace, (gridRow, gridCol) in zip(
    (overallActive, activeDay1, activeDay2), ((1, 1), (2, 1), (2, 2))
):
    activeFig.add_trace(trace, row=gridRow, col=gridCol)

# Layout options gathered in one mapping and applied in a single call.
activeLayout = dict(
    height=800,
    width=1200,
    title_text="Standup Count Over Time",
    title_font_size=16,
    xaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    xaxis2=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis2=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    xaxis3=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis3=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    font=dict(family="Arial, sans-serif", size=12, color="black"),
    plot_bgcolor='white',
    paper_bgcolor='white',
)
activeFig.update_layout(**activeLayout)

activeFig.show()

## Screen Stats

The intent changes for turning the screen on and off are recorded under event ids `E41` and `E40` respectively. We use this information to work out when the screen was on and when it was off

In [13]:
# Build a per-row screen on/off status from the E41 (on) / E40 (off) events,
# plus the interval each row covers until the next log line.
screenData = data.copy()

# The status at any row is whatever the most recent on/off event set, so map
# the two events to 1/0 and carry that value forward over all other rows.
# A running sum of +1/-1 clipped at 0 is not a state: an "off" seen before any
# "on" leaves the following "on" at 0, and repeated "on" events exceed 1.
# Rows before the first on/off event have no known state and are treated as
# off -- NOTE(review): confirm that default is acceptable.
screenEventState = screenData['EventId'].map({'E41': 1, 'E40': 0})
screenData['screenStatus'] = screenEventState.ffill().fillna(0).astype(int)

# Constant label so every row lands on a single timeline lane.
screenData['yVal'] = 'Screen Status'
screenData['ScreenStatusCat'] = screenData['screenStatus'].map({1: 'Screen On', 0: 'Screen Off'})

# Timestamp of the following row; the last row has none and gets NaT.
screenData['timeShifted'] = screenData['Time'].shift(-1)

# Minutes between each row and the following one.
screenData['duration'] = (screenData['timeShifted'] - screenData['Time']).dt.total_seconds() / 60

The plot below shows the screen status over time, with 1 representing that the screen was on and 0 representing that the screen was off

In [14]:
# Line plot of the numeric screen status against time.
statusFig = px.line(screenData, x='Time', y='screenStatus')

# Layout options gathered in one mapping and applied in a single call.
statusLayout = dict(
    height=400,
    width=1200,
    title_text='Screen Status Over Time',
    title_font_size=16,
    xaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    font=dict(family="Arial, sans-serif", size=12, color="black"),
    plot_bgcolor='white',
    paper_bgcolor='white',
)
statusFig.update_layout(**statusLayout)

statusFig.show()


Below is the timeline view of the screen status, with screen off shown in blue and screen on shown in red

In [15]:
# Timeline of screen status: one bar per log row, from its timestamp to the
# next row's, all on a single lane.
# ScreenStatusCat is categorical, so a continuous colour scale has no effect
# on it. The colours are pinned with an explicit discrete map so that
# "Screen Off" is always blue and "Screen On" is always red.
timelineFig = px.timeline(
    screenData,
    x_start='Time',
    x_end='timeShifted',
    y='yVal',
    color='ScreenStatusCat',
    color_discrete_map={'Screen Off': 'blue', 'Screen On': 'red'},
)

timelineFig.update_layout(
    height=400,
    width=1200,
    title_text='Event Timeline of Screen Status',
    title_font_size=16,
    xaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True, title_text=''),
    font=dict(family="Arial, sans-serif", size=12, color="black"),
    plot_bgcolor='white',
    paper_bgcolor='white',
)

timelineFig.show()

### Component Analysis

We create a histogram to see which component was used the most

In [16]:
# Occurrences of each Component value as a two-column frame.
componentCount = (
    data["Component"]
    .value_counts()
    .rename_axis('Component')
    .reset_index(name='Frequency')
)

# One bar per component, labelled with its count.
componentBars = go.Bar(
    x=componentCount['Component'],
    y=componentCount['Frequency'],
    showlegend=False,
    text=componentCount['Frequency'],
)
componentHistogram = go.Figure(componentBars)

# Layout options gathered in one mapping and applied in a single call.
componentLayout = dict(
    height=400,
    width=1200,
    title_text='Histogram for Component',
    title_font_size=16,
    xaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True, title_text=''),
    font=dict(family="Arial, sans-serif", size=12, color="black"),
    plot_bgcolor='white',
    paper_bgcolor='white',
)
componentHistogram.update_layout(**componentLayout)

componentHistogram.show()

We create a histogram to see which event id was invoked the most

In [17]:
# Occurrences of each EventId value as a two-column frame.
eventIdCount = (
    data["EventId"]
    .value_counts()
    .rename_axis('EventId')
    .reset_index(name='Frequency')
)

# One bar per event id, labelled with its count.
eventIdBars = go.Bar(
    x=eventIdCount['EventId'],
    y=eventIdCount['Frequency'],
    showlegend=False,
    text=eventIdCount['Frequency'],
)
eventIdHistogram = go.Figure(eventIdBars)

# Layout options gathered in one mapping and applied in a single call.
eventIdLayout = dict(
    height=400,
    width=1200,
    title_text='Histogram for EventId',
    title_font_size=16,
    xaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True),
    yaxis=dict(showgrid=False, linecolor='black', linewidth=2, mirror=True, title_text=''),
    font=dict(family="Arial, sans-serif", size=12, color="black"),
    plot_bgcolor='white',
    paper_bgcolor='white',
)
eventIdHistogram.update_layout(**eventIdLayout)

eventIdHistogram.show()