In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# visualization defaults: a figure-size tuple plus the seaborn grid theme
figsize = (8, 6)
sns.set_style('whitegrid')

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the Parquet file; forwarded to ``pq.read_table``.
    columns : list of str, optional
        Forwarded to ``pq.read_table``. ``None`` (the default) applies no
        column selection.
    use_threads : bool, default True
        Forwarded to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame
        The table converted with ``strings_to_categorical=True``.

    Raises
    ------
    Exception
        Whatever ``pq.read_table`` or ``to_pandas`` raises is propagated
        unchanged. Earlier versions printed the error and returned ``None``,
        which only postponed the failure to the first use of the result.
    """
    table = pq.read_table(path, columns=columns, use_threads=use_threads)
    return table.to_pandas(strings_to_categorical=True)

In [3]:
# Read the periods dataset, pass it through the project's `downcast` helper,
# and discard three columns that the cells below do not reference.
df = (
    downcast(load_df('datasets/periods.parquet.gzip'))
    .drop(columns=['network_status', 'screen_on', 'boundary'])
)

In [4]:
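# Disabled experiment, kept for reference: a running total of `time_diff`,
# blanked on the first row of every run of equal `period` values.
# df['period_acc'] = df['time_diff'].cumsum()
# df.loc[df['period'] != df['period'].shift(), 'period_acc'] = None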

In [5]:
# Number of rows sharing each row's `period` value.
# Read this column as df['size']; `df.size` is the DataFrame attribute, not the column.
df['size'] = df['period'].groupby(df['period']).transform('size')

In [6]:
# Sign flag for each sample: +1 when `change` is strictly positive, -1 otherwise.
# Rows with a missing (NaN) or zero `change` land in the -1 branch, exactly as
# the previous row-wise lambda did; the comparison is now vectorized.
df['direction'] = np.where(df['change'] > 0, 1, -1)

In [7]:
# Largest absolute accumulated change within each period, repeated on every row of that period.
df['max_change'] = (
    df['change_acc']
    .abs()
    .groupby(df['period'])
    .transform('max')
)

In [8]:
# Largest accumulated time within each period, repeated on every row of that period.
df['max_time'] = (
    df['time_acc']
    .groupby(df['period'])
    .transform('max')
)

In [9]:
# Rate per period: max_change divided by max_time / 60, rounded to 4 decimals.
# `time_acc` looks like seconds (it tracks the timestamp gaps), so this is change per minute.
df['ppm'] = (df['max_change'] / (df['max_time'] / 60)).round(4)

In [10]:
# Preview the first five rows together with the derived columns.
df.head(5)

Unnamed: 0,device_id,timestamp,battery_level,change,period,change_acc,time_diff,time_acc,size,direction,max_change,max_time,ppm
0,1,2017-10-15 18:36:46,99,,1,,,,11,-1,11.0,3258.0,0.2026
1,1,2017-10-15 18:41:54,98,-1.0,1,-1.0,308.0,308.0,11,-1,11.0,3258.0,0.2026
2,1,2017-10-15 18:46:54,97,-1.0,1,-2.0,300.0,608.0,11,-1,11.0,3258.0,0.2026
3,1,2017-10-15 18:50:35,96,-1.0,1,-3.0,221.0,829.0,11,-1,11.0,3258.0,0.2026
4,1,2017-10-15 18:54:14,95,-1.0,1,-4.0,219.0,1048.0,11,-1,11.0,3258.0,0.2026
