In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
sns.set_style('whitegrid')
# Default figure size as a plain tuple. It is only assigned here (not applied
# to any matplotlib setting), so plotting calls have to pass it explicitly.
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the Parquet file; handed to ``pq.read_table`` as-is.
    columns
        Optional subset of column names to read. ``None`` is passed through
        unchanged.
    use_threads
        Forwarded to ``pq.read_table``.

    Returns
    -------
    The table converted with ``to_pandas()``, or ``None`` when reading or
    converting raised an exception. In that case the exception is printed
    rather than re-raised, so callers must check the result for ``None``.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas()
    except Exception as err:
        # Failures are only reported on stdout; the caller receives None.
        print(err)
        return None
    return frame

def downcast_ints(df):
    """Shrink integer columns of ``df`` to the smallest unsigned dtype that fits.

    Columns are picked with ``select_dtypes(include=['int'])`` and each one is
    passed through ``pd.to_numeric(..., downcast='unsigned')``. A column that
    cannot be represented as unsigned (for example one holding negative
    values) keeps its current dtype.

    The frame is modified in place and the same object is returned.
    """
    int_columns = df.select_dtypes(include=['int']).columns
    for column in int_columns:
        # to_numeric leaves the column untouched when no unsigned type fits.
        df[column] = pd.to_numeric(df[column], downcast='unsigned')

    return df
        
def prepare(path,cols):
    """Load the requested columns of a Parquet file and compact integer dtypes.

    Parameters
    ----------
    path
        Parquet file to read; forwarded to ``load_df``.
    cols
        Column names to read; forwarded to ``load_df`` as its ``columns``
        argument.

    Returns
    -------
    A DataFrame with a fresh 0..n-1 index (the previous index is dropped) whose
    integer columns have been passed through ``downcast_ints``.

    Raises
    ------
    RuntimeError
        If ``load_df`` returned ``None`` instead of a DataFrame.
    """
    df = load_df(path,cols)
    if df is None:
        # load_df reports read failures by printing and returning None; stop
        # here with a clear message instead of failing on the next attribute
        # access with "'NoneType' object has no attribute 'reset_index'".
        raise RuntimeError(
            "could not load %r (see the error printed above)" % (path,)
        )

    df = df.reset_index(drop=True)
    df = downcast_ints(df)

    return df


In [6]:
# Columns to read from the battery details Parquet file.
cols = [
    'id', 'sample_id', 'charger', 'health', 'voltage', 'temperature',
    'capacity', 'charge_counter', 'current_average', 'current_now',
    'energy_counter',
]
df = prepare('parquet_files/battery_details.parquet', cols)

# fix unsigned int: prepare() downcasts integer columns to unsigned dtypes
# where the values allow it; store these columns as signed 32-bit integers.
# Each column is assigned on its own, so no full copy of the frame is made.
df['id'] = df['id'].astype(np.int32)
df['sample_id'] = df['sample_id'].astype(np.int32)
df['capacity'] = df['capacity'].astype(np.int32)

# Upper-case the label columns with the vectorised string accessor. Unlike a
# per-row lambda calling x.upper(), this leaves missing values as missing
# instead of raising AttributeError.
df['charger'] = df['charger'].str.upper()
df['health'] = df['health'].str.upper()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11957118 entries, 0 to 11957117
Data columns (total 11 columns):
id                 int32
sample_id          int32
charger            object
health             object
voltage            float32
temperature        float32
capacity           int32
charge_counter     int64
current_average    int64
current_now        int64
energy_counter     int64
dtypes: float32(2), int32(3), int64(4), object(2)
memory usage: 775.4+ MB


In [5]:
# Preview the first 100 rows of the loaded frame.
# NOTE(review): the saved output below shows lower/mixed-case `charger` and
# `health` values although the load cell upper-cases them, and this cell's
# execution count precedes that cell's -- the output appears stale; re-run
# after the load cell to refresh it.
df.head(100)

Unnamed: 0,id,sample_id,charger,health,voltage,temperature,capacity,charge_counter,current_average,current_now,energy_counter
0,1,1,unplugged,Good,4.05,28.500000,0,0,0,-229,-1
1,2,2,unplugged,Good,4.05,28.500000,0,0,0,-229,-1
2,3,3,unplugged,Good,4.04,29.000000,0,0,0,-248,-1
3,4,4,unplugged,Good,3.97,29.900000,0,0,0,-313,-1
4,5,5,unplugged,Good,4.03,29.200001,0,0,0,-234,-1
5,6,6,ac,Good,4.20,27.299999,0,0,0,518,-1
6,7,7,ac,Good,4.22,26.000000,0,0,0,538,-1
7,8,8,ac,Good,4.22,25.000000,0,0,0,480,-1
8,9,9,ac,Good,4.24,25.200001,0,0,0,528,-1
9,10,10,ac,Good,4.27,24.100000,0,0,0,555,-1


In [8]:
# Disabled exploratory code: number of rows per distinct value of each column.
# Kept for reference only; nothing in this cell is executed.
#dfx1 = df.groupby(['charger'])['charger'].count()
#dfx2 = df.groupby(['health'])['health'].count()
#dfx3 = df.groupby(['capacity'])['capacity'].count()
#dfx4 = df.groupby(['charge_counter'])['charge_counter'].count()
#dfx5 = df.groupby(['current_average'])['current_average'].count()
