In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# plotting defaults: default figure size (width, height) and the seaborn grid theme
figsize = (8, 6)
sns.set_style('whitegrid')

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    path : str
        Location of the parquet file, passed straight to ``pq.read_table``.
    columns : list of str, optional
        Subset of columns to read; ``None`` is forwarded unchanged.
    use_threads : bool, default True
        Forwarded to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame
        Result of calling ``to_pandas()`` on the table that was read.

    Raises
    ------
    Exception
        Any error raised while reading or converting is propagated to the
        caller. Previously the error was printed and ``None`` was returned,
        which only moved the failure to the first use of the result.
    """
    table = pq.read_table(path, columns=columns, use_threads=use_threads)
    return table.to_pandas()

def downcast_ints(df, downcast='unsigned'):
    """Shrink the integer columns of ``df`` to a smaller dtype where possible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact. It is modified in place and also returned.
    downcast : {'unsigned', 'signed', 'integer'}, default 'unsigned'
        Target dtype family, forwarded to ``pandas.to_numeric``. The default
        keeps the historical behaviour. Pass 'signed' (or 'integer') to get
        signed results, which also lets columns with negative values shrink.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # only columns matched by the 'int' dtype alias are considered
    int_columns = df.select_dtypes(include=['int']).columns

    # converting column by column is a no-op when there are no integer columns
    for column in int_columns:
        df[column] = pd.to_numeric(df[column], downcast=downcast)

    return df
        
def prepare(path, cols):
    """Load the requested columns of a parquet file and compact the result.

    The file is read with ``load_df``, the index is replaced by a fresh
    default one, and the frame is then passed through ``downcast_ints``.
    """
    frame = load_df(path, cols).reset_index(drop=True)
    return downcast_ints(frame)


In [3]:
cols = [
    'sample_id', 'charger', 'health', 'voltage', 'temperature',
    'charge_counter', 'current_average', 'current_now', 'energy_counter',
]
df = prepare('1-parquet-files/battery_details.parquet', cols)

# fix unsigned int: store sample_id as a signed 32-bit integer
df['sample_id'] = df['sample_id'].astype(np.int32)

# upper-case the text columns with the vectorised string accessor;
# unlike a per-element lambda it leaves missing values as missing instead of raising
df['charger'] = df['charger'].str.upper()
df['health'] = df['health'].str.upper()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23600501 entries, 0 to 23600500
Data columns (total 9 columns):
sample_id          int32
charger            object
health             object
voltage            float32
temperature        float32
charge_counter     int64
current_average    int64
current_now        int64
energy_counter     int64
dtypes: float32(2), int32(1), int64(4), object(2)
memory usage: 1.3+ GB


In [4]:
# preview the first 100 rows
df.iloc[:100]

Unnamed: 0,sample_id,charger,health,voltage,temperature,charge_counter,current_average,current_now,energy_counter
0,1,UNPLUGGED,GOOD,4.05,28.500000,0,0,-229,-1
1,2,UNPLUGGED,GOOD,4.05,28.500000,0,0,-229,-1
2,3,UNPLUGGED,GOOD,4.04,29.000000,0,0,-248,-1
3,4,UNPLUGGED,GOOD,3.97,29.900000,0,0,-313,-1
4,5,UNPLUGGED,GOOD,4.03,29.200001,0,0,-234,-1
...,...,...,...,...,...,...,...,...,...
95,96,AC,GOOD,3.93,27.400000,0,0,530,-1
96,97,AC,GOOD,3.93,27.400000,0,0,527,-1
97,98,AC,GOOD,3.93,27.400000,0,0,266,-1
98,99,AC,GOOD,3.94,27.400000,0,0,519,-1


In [5]:
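# Disabled exploratory value counts, kept for reference only.
# NOTE: 'capacity' is not among the columns loaded above, so the dfx3 line
# would raise a KeyError if it were re-enabled as written.
#dfx1 = df.groupby(['charger'])['charger'].count()
#dfx2 = df.groupby(['health'])['health'].count()
#dfx3 = df.groupby(['capacity'])['capacity'].count()
#dfx4 = df.groupby(['charge_counter'])['charge_counter'].count()
#dfx5 = df.groupby(['current_average'])['current_average'].count()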


In [6]:
# write the cleaned frame uncompressed; None is the value pandas documents for
# "no compression" (the string 'none' is not among the documented options)
df.to_parquet('2-datasets/battery_details.parquet', compression=None)