In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Plot defaults for this notebook.
# `figsize` is only defined here; presumably it is passed as the matplotlib
# `figsize=` argument (width, height) by later cells -- TODO confirm.
figsize = (8, 6)
sns.set_style('whitegrid')

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    The file is read with ``pyarrow.parquet.read_table`` and converted with
    ``Table.to_pandas(strings_to_categorical=True)``.

    Parameters
    ----------
    path
        Location of the Parquet file, forwarded to ``pq.read_table``.
    columns : optional
        Forwarded as ``columns=`` to ``pq.read_table``.
    use_threads : bool, default True
        Forwarded as ``use_threads=`` to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. If reading or converting raises any ``Exception``,
        the exception is printed and ``None`` is returned instead, so callers
        should check the result before using it.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas(strings_to_categorical=True)
    except Exception as err:
        # Best-effort load: report the problem and hand back None rather
        # than propagating the error.
        print(err)
        return None
    return frame

In [3]:
# Load the periods dataset and run it through `downcast` from the local
# `utils` module (presumably shrinks numeric dtypes -- confirm in utils).
df = downcast(load_df('datasets/periods.parquet.gzip'))

In [4]:
# Tag every row with the number of rows that share its `period` value.
df['size'] = df['period'].groupby(df['period']).transform('size')

In [5]:
# Running total of `time_diff` across the whole frame, in row order.
# NOTE(review): this cumsum is NOT grouped by `period`; if a per-period
# accumulation was intended, it would need a groupby -- confirm intent.
df['period_acc'] = df['time_diff'].cumsum()
# Blank out (set to missing) the first row of every run of equal `period`
# values, i.e. each row whose `period` differs from the previous row's.
# The very first row is always included because `shift()` yields NaN there.
df.loc[df['period'] != df['period'].shift(), 'period_acc'] = None

In [6]:
# Keep only rows whose period has at least 50 rows, as an independent copy
# with a fresh 0..n-1 index.
gf = (
    df.loc[df['size'].ge(50)]
      .copy()
      .reset_index(drop=True)
)

In [7]:
# Print the index range, column dtypes and memory usage of the filtered frame.
gf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4543502 entries, 0 to 4543501
Data columns (total 13 columns):
device_id         uint16
timestamp         datetime64[ns]
battery_level     uint8
network_status    category
screen_on         uint8
change            float32
boundary          uint8
period            uint32
change_acc        float32
time_diff         float32
time_acc          float32
size              int64
period_acc        float32
dtypes: category(1), datetime64[ns](1), float32(5), int64(1), uint16(1), uint32(1), uint8(3)
memory usage: 199.3 MB
