In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq


def load_df(path, columns=None, nthreads=4, strings_to_categorical=True):
    """Read a parquet file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the parquet file; forwarded to ``pq.read_table``.
    columns
        Optional subset of column names to read; forwarded to
        ``pq.read_table``. ``None`` is passed through unchanged.
    nthreads
        Forwarded to ``pq.read_table`` as ``nthreads``.
    strings_to_categorical
        Forwarded to ``Table.to_pandas``.

    Returns
    -------
    The object returned by ``table.to_pandas(...)``.

    Raises
    ------
    Exception
        Any error raised while reading or converting the table is printed
        and then re-raised.
    """
    try:
        table = pq.read_table(path, columns=columns, nthreads=nthreads)
        return table.to_pandas(strings_to_categorical=strings_to_categorical)
    except Exception as e:
        print(e)
        # Propagate instead of implicitly returning None: callers chain
        # DataFrame methods on the result, so a swallowed error would only
        # resurface later as an unrelated AttributeError on NoneType.
        raise


# Only these columns are read from the parquet file.
cols = ['device_id', 'timestamp', 'battery_state', 'battery_level']

# Order the samples by device and then by time, and renumber the rows from 0.
df = (
    load_df('../src/samples.parquet', cols)
    .sort_values(by=['device_id', 'timestamp'])
    .reset_index(drop=True)
)

# Downcast the columns matched by the 'int' selector (downcast='unsigned'
# asks pandas for the smallest unsigned integer dtype that fits the values),
# then write the converted columns back over the originals.
df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
df[converted_int.columns] = converted_int

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2854918 entries, 0 to 2854917
Data columns (total 4 columns):
device_id        uint16
timestamp        datetime64[ns]
battery_state    category
battery_level    int8
dtypes: category(1), datetime64[ns](1), int8(1), uint16(1)
memory usage: 32.7 MB


In [2]:
# additional features
# A row starts a new device's run when its device_id differs from the row
# above it (the first row always qualifies, since shift() leaves it empty).
starts_new_device = df['device_id'].ne(df['device_id'].shift())

# Time elapsed since the previous row, blanked out wherever the previous row
# belongs to a different device so gaps never span two devices.
df['time_diff'] = df['timestamp'].diff().mask(starts_new_device)

In [4]:
# Show the first 50 rows recorded for device 2330.
is_device_2330 = df['device_id'] == 2330
df.loc[is_device_2330].head(50)

Unnamed: 0,device_id,timestamp,battery_state,battery_level,time_diff
2113676,2330,2017-11-01 00:39:27,discharging,97,NaT
2113677,2330,2017-11-01 01:45:16,discharging,96,01:05:49
2113678,2330,2017-11-01 02:41:14,discharging,95,00:55:58
2113679,2330,2017-11-01 04:13:24,discharging,94,01:32:10
2113680,2330,2017-11-01 05:51:28,discharging,93,01:38:04
2113681,2330,2017-11-01 07:15:24,discharging,92,01:23:56
2113682,2330,2017-11-01 08:13:33,discharging,91,00:58:09
2113683,2330,2017-11-01 08:52:37,discharging,90,00:39:04
2113684,2330,2017-11-01 09:21:18,discharging,89,00:28:41
2113685,2330,2017-11-01 10:16:56,discharging,88,00:55:38
