In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Notebook-wide plotting setup: a shared figure-size tuple (presumably passed
# to later plotting calls -- confirm) and seaborn's white grid theme.
figsize = (8, 6)
sns.set_style('whitegrid')


def load_df(path, columns=None, nthreads=4, strings_to_categorical=True):
    """Read a parquet file into a pandas DataFrame.

    Args:
        path: location of the parquet file, forwarded to ``pq.read_table``.
        columns: optional list of column names to read; ``None`` is forwarded
            unchanged to ``pq.read_table``.
        nthreads: forwarded to ``pq.read_table``.
            NOTE(review): newer pyarrow releases appear to replace this
            keyword with ``use_threads`` -- confirm against the installed
            version.
        strings_to_categorical: forwarded to ``Table.to_pandas``.

    Returns:
        The converted DataFrame, or ``None`` when reading or converting
        raised an exception (the exception is printed, not re-raised).
    """
    frame = None
    try:
        arrow_table = pq.read_table(path, columns=columns, nthreads=nthreads)
        frame = arrow_table.to_pandas(strings_to_categorical=strings_to_categorical)
    except Exception as err:
        # Best-effort load: report the problem and fall through to return None.
        print(err)
    return frame

        
def prepare(path):
    """Load the battery-sample columns from ``path`` and shrink integer dtypes.

    Reads a fixed set of columns through ``load_df``, resets the index to a
    fresh 0..n-1 range, and downcasts the selected integer columns to the
    smallest unsigned dtype that can hold their values.

    Args:
        path: location of the parquet file, passed straight to ``load_df``.

    Returns:
        The loaded DataFrame with a fresh RangeIndex and downcast integer
        columns.

    Raises:
        RuntimeError: if ``load_df`` returned ``None`` (it prints the
            underlying error and returns ``None`` when the read fails).
    """
    cols = ['device_id', 'timestamp', 'battery_level', 'charger', 'health', 'voltage', 'temperature',
            'capacity', 'charge_counter', 'current_average', 'current_now', 'energy_counter']

    # sorted by [device_id, timestamp]
    df = load_df(path, cols)

    # load_df reports failures by printing and returning None; stop here with
    # a clear message instead of an AttributeError on None further down.
    if df is None:
        raise RuntimeError(
            'could not load {!r}: load_df returned None '
            '(see the error it printed)'.format(path))

    df = df.reset_index(drop=True)

    # downcast int types
    # downcast='unsigned' only shrinks columns whose values are all
    # non-negative; columns containing negative values keep their dtype.
    df_int = df.select_dtypes(include=['int'])
    converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')

    df[converted_int.columns] = converted_int

    return df
    
# Load the battery samples through prepare() and print the per-column dtypes
# and memory usage of the resulting frame.
df = prepare('../src/samples_battery.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4587326 entries, 0 to 4587325
Data columns (total 12 columns):
device_id          uint16
timestamp          datetime64[ns]
battery_level      uint8
charger            category
health             category
voltage            float32
temperature        float32
capacity           uint8
charge_counter     int64
current_average    int64
current_now        int64
energy_counter     int64
dtypes: category(2), datetime64[ns](1), float32(2), int64(4), uint16(1), uint8(2)
memory usage: 236.2 MB


In [4]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,device_id,timestamp,battery_level,charger,health,voltage,temperature,capacity,charge_counter,current_average,current_now,energy_counter
0,1,2017-10-15 18:36:46,99,unplugged,good,4.2,32.0,0,0,0,-224,-1
1,1,2017-10-15 18:41:54,98,unplugged,good,4.16,32.599998,0,0,0,-341,-1
2,1,2017-10-15 18:46:54,97,unplugged,good,4.17,32.599998,0,0,0,-590,-1
3,1,2017-10-15 18:50:35,96,unplugged,good,4.14,33.799999,0,0,0,-521,-1
4,1,2017-10-15 18:54:14,95,unplugged,good,4.15,34.299999,0,0,0,-245,-1


In [3]:
# additional features: summary statistics for the device id and the
# remaining numeric battery readings
facts = [
    'device_id',
    'voltage',
    'temperature',
    'capacity',
    'charge_counter',
    'current_average',
    'current_now',
    'energy_counter',
]
df.loc[:, facts].describe()

Unnamed: 0,device_id,voltage,temperature,capacity,charge_counter,current_average,current_now,energy_counter
count,4587326.0,4587326.0,4587326.0,4587326.0,4587326.0,4587326.0,4587326.0,4587326.0
mean,1548.31,13.4492,28.78005,0.0,184687.7,5088.521,86.60199,384715.4
std,920.7455,194.3314,5.770491,0.0,1885188.0,123088.7,652.3913,84669990.0
min,1.0,0.0,-30.0,0.0,-1661971.0,-2535468.0,-24620.0,-1.0
25%,763.0,3.84,25.0,0.0,0.0,0.0,-1.0,-1.0
50%,1532.0,4.03,28.8,0.0,0.0,0.0,0.0,-1.0
75%,2334.0,4.2,32.7,0.0,0.0,0.0,0.0,-1.0
max,3253.0,4415.1,66.0,0.0,32767980.0,2751000.0,19290.0,30558810000.0


In [11]:
# Frequency of each distinct current_now reading, most common first.
df['current_now'].value_counts()

 0        2383118
-1         366217
-2         235673
 1         113880
 2           5540
-3           4247
 12          3793
 13          3453
-9           3266
-10          3071
-7           2887
-12          2868
-11          2742
-8           2682
-6           2552
-5           2498
-13          2299
-14          1952
-4           1807
 3           1749
 14          1735
-27          1689
-28          1609
-15          1530
-24          1434
-16          1425
 11          1414
 9           1378
-25          1367
-23          1306
           ...   
-2737           1
-2733           1
 6300           1
-2777           1
-7730           1
 7300           1
-2828           1
-2823           1
 10430          1
 18610          1
 11390          1
 5380           1
-2805           1
 5390           1
 15480          1
 4550           1
-23290          1
 8640           1
 8190           1
-2800           1
 17690          1
 10410          1
 9490           1
 17680          1
-5980     