In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
# set seaborn's global plot style to 'whitegrid'
sns.set_style('whitegrid')
# NOTE(review): plain module-level variable; nothing in the visible cells reads it.
# Presumably meant to be passed as figsize= to plotting calls — confirm.
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the parquet file; passed unchanged to ``pq.read_table``.
    columns : list of str, optional
        Columns to read. ``None`` (the default) is forwarded as-is.
    use_threads : bool, default True
        Forwarded unchanged to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame
        Result of ``Table.to_pandas()`` on the table that was read.

    Raises
    ------
    Exception
        Any error raised while reading or converting the file is printed and
        then re-raised unchanged.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        return table.to_pandas()
    except Exception as e:
        # Keep the short message visible in the cell output, but do not
        # swallow the error: returning None here would only surface later as
        # an unrelated AttributeError in the caller.
        print(e)
        raise

def downcast_ints(df, downcast='unsigned'):
    """Downcast the integer columns of ``df`` to a smaller dtype, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    downcast : str, default 'unsigned'
        Forwarded to ``pd.to_numeric(..., downcast=...)``. With the default,
        columns that contain negative values keep their original dtype; pass
        ``'integer'`` to get the smallest signed dtype instead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Only columns matched by select_dtypes(include=['int']) are touched;
    # floats, objects and everything else are left as they are.
    int_frame = df.select_dtypes(include=['int'])
    shrunk = int_frame.apply(pd.to_numeric, downcast=downcast)
    df[shrunk.columns] = shrunk

    return df
        
def prepare(path,cols):
    """Load the requested columns of a parquet file and shrink its integer columns.

    The file is read with ``load_df``, the index is replaced by a fresh
    0..n-1 range (the old index is dropped), and the result is passed through
    ``downcast_ints`` before being returned.
    """
    loaded = load_df(path,cols)
    reindexed = loaded.reset_index(drop=True)
    return downcast_ints(reindexed)


In [3]:
# Columns of the battery-details file that this notebook works with.
cols = ['id','sample_id','charger','health','voltage','temperature','capacity','charge_counter','current_average','current_now','energy_counter']
df = prepare('parquet_files/battery_details.parquet',cols)

# prepare() downcasts non-negative integer columns to unsigned dtypes;
# switch these three back to signed int32.
for col in ('id', 'sample_id', 'capacity'):
    df[col] = df[col].astype(np.int32)

# Normalise the labels to upper case. The vectorised .str accessor avoids a
# Python-level call per row and lets missing values pass through as NaN
# instead of raising AttributeError.
for col in ('charger', 'health'):
    df[col] = df[col].str.upper()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11957118 entries, 0 to 11957117
Data columns (total 11 columns):
id                 int32
sample_id          int32
charger            object
health             object
voltage            float32
temperature        float32
capacity           int32
charge_counter     int64
current_average    int64
current_now        int64
energy_counter     int64
dtypes: float32(2), int32(3), int64(4), object(2)
memory usage: 775.4+ MB


In [12]:
# NOTE(review): the saved output under this cell starts at index 11957018, i.e. the
# last 100 of the 11957118 rows — what tail(100) would show, not head(100). The
# execution count (In [12]) is also out of sequence; presumably stale output,
# re-run on a fresh kernel to confirm.
df.head(100)

Unnamed: 0,id,sample_id,charger,health,voltage,temperature,capacity,charge_counter,current_average,current_now,energy_counter
11957018,12042921,12090650,UNPLUGGED,GOOD,3.54,30.900000,0,0,-691,1,-1
11957019,12042922,12090651,UNPLUGGED,GOOD,3.95,35.400002,0,0,0,120,-1
11957020,12042923,12090652,UNPLUGGED,GOOD,4.03,40.000000,0,0,0,-554,-1
11957021,12042924,12090653,UNPLUGGED,GOOD,3.96,40.799999,0,0,0,-388,-1
11957022,12042925,12090654,UNPLUGGED,GOOD,3.96,39.799999,0,0,0,-981,-1
11957023,12042926,12090655,UNPLUGGED,GOOD,4.15,33.599998,0,0,0,-367,-1
11957024,12042927,12090656,UNPLUGGED,GOOD,4.08,34.500000,0,0,0,-537,-1
11957025,12042928,12090657,UNPLUGGED,GOOD,4.19,36.200001,0,0,0,-103,-1
11957026,12042929,12090658,UNPLUGGED,GOOD,3.92,36.099998,0,0,0,-858,-1
11957027,12042930,12090659,UNPLUGGED,GOOD,4.11,37.000000,0,0,0,-539,-1


In [7]:
#dfx1 = df.groupby(['charger'])['charger'].count()
#dfx2 = df.groupby(['health'])['health'].count()
#dfx3 = df.groupby(['capacity'])['capacity'].count()
#dfx4 = df.groupby(['charge_counter'])['charge_counter'].count()
#dfx5 = df.groupby(['current_average'])['current_average'].count()
