In [44]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

%matplotlib inline

# configure visualizations: apply seaborn's 'whitegrid' style to the plots drawn below
sns.set_style('whitegrid')
# NOTE(review): figsize is not referenced by any cell visible in this part of the notebook;
# presumably intended as a default (width, height) for figures -- confirm or remove.
figsize=(8,6)

In [45]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file with pyarrow and convert it to a pandas DataFrame.

    Parameters
    ----------
    path : location handed straight to ``pq.read_table``.
    columns : optional list of column names to read; ``None`` reads them all.
    use_threads : forwarded unchanged to ``pq.read_table``.

    Returns
    -------
    The resulting DataFrame, or ``None`` when reading/converting raised any
    exception (the exception is printed, not re-raised).
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas()
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(err)
        return None
    return frame

def downcast_ints(df):
    """Shrink every integer column of ``df`` to the smallest unsigned dtype that fits.

    Columns are selected with ``np.integer`` so that integer columns of every
    width (int8 ... int64, uint8 ... uint64) are considered; the string alias
    ``'int'`` only matches the platform's default integer width and would skip
    the others.

    Columns containing negative values cannot be represented as unsigned and
    are left with their original dtype by ``pd.to_numeric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int_cols = df.select_dtypes(include=[np.integer]).columns
    if len(int_cols) == 0:
        # Nothing to convert; avoid a pointless apply/assignment round trip.
        return df

    df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='unsigned')

    return df
        
def prepare(path,cols):
    """Load selected columns of a Parquet file and normalise the resulting frame.

    Steps: read ``cols`` from ``path`` via ``load_df``, replace the index with a
    fresh 0..n-1 range (the old index is discarded), then pass the frame
    through ``downcast_ints``.

    Returns the frame produced by ``downcast_ints``.
    """
    loaded = load_df(path, cols)
    # NOTE: load_df returns None when reading fails, in which case the next
    # line raises AttributeError.
    reindexed = loaded.reset_index(drop=True)
    return downcast_ints(reindexed)

In [46]:
def pieOthers(dfVars,dfVals,threshold):
    """Draw a pie chart of the largest categories, lumping the rest into "Others".

    Parameters
    ----------
    dfVars : sequence or pandas.Series
        Category labels.
    dfVals : sequence or pandas.Series
        Numeric value for each category, paired with ``dfVars``.
    threshold : int
        Number of categories (largest values first) that get their own slice.
        The remaining categories are summed into a single "Others" slice.
        With ``threshold <= 0``, or when there are no remaining categories,
        every category is drawn individually and no "Others" slice is added.

    Returns
    -------
    None. The chart is drawn on the current pyplot figure and shown.
    """
    ranked = pd.DataFrame(
        data = {'var': dfVars, 'value' : dfVals},
        ).sort_values('value', ascending = False)

    if threshold > 0 and len(ranked) > threshold:
        # The remainder must come from the *sorted* frame: slicing the raw
        # input would add up whichever rows happen to sit after position
        # `threshold`, not the smallest categories.
        others = pd.DataFrame(data = {
            'var' : ['Others'],
            'value' : [ranked['value'].iloc[threshold:].sum()]
        })
        shown = pd.concat([ranked.iloc[:threshold], others], ignore_index=True)
    else:
        shown = ranked

    percent = 100.*shown['value']/shown['value'].sum()
    labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(shown['var'], percent)]

    colorsX = plt.cm.tab20.colors
    patches, texts = plt.pie(shown['value'], shadow=True, startangle=90, colors = colorsX)
    # Percentages live in the legend rather than on the wedges.
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
    
def pieOthers2(dfVars,dfVals,threshold):
    """Draw a pie chart with call-out labels, lumping small categories into "Others".

    Parameters
    ----------
    dfVars : sequence or pandas.Series
        Category labels.
    dfVals : sequence or pandas.Series
        Numeric value for each category, paired with ``dfVars``.
    threshold : int
        Number of categories (largest values first) that get their own slice.
        The remaining categories are summed into a single "Others" slice.
        With ``threshold <= 0``, or when there are no remaining categories,
        every category is drawn individually and no "Others" slice is added.

    Returns
    -------
    None. A new 7x5 figure is created, annotated and shown.
    """
    ranked = pd.DataFrame(
        data = {'var': dfVars, 'value' : dfVals},
        ).sort_values('value', ascending = False)

    if threshold > 0 and len(ranked) > threshold:
        # The remainder must come from the *sorted* frame: slicing the raw
        # input would add up whichever rows happen to sit after position
        # `threshold`, not the smallest categories.
        others = pd.DataFrame(data = {
            'var' : ['Others'],
            'value' : [ranked['value'].iloc[threshold:].sum()]
        })
        shown = pd.concat([ranked.iloc[:threshold], others], ignore_index=True)
    else:
        shown = ranked

    percent = 100.*shown['value']/shown['value'].sum()
    labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(shown['var'], percent)]

    fig = plt.figure(figsize=(7,5))
    # Keep the pie inside a reduced area so the call-out labels have room.
    gs1 = gridspec.GridSpec(1,1,
        left=0.1,right=0.7,
        bottom=0.1,top=0.7,
    )
    pie_ax = fig.add_subplot(gs1[0])

    colors = plt.cm.tab20.colors
    wedges, texts = pie_ax.pie(
        shown['value'],
        shadow=True,
        colors=colors,
        startangle=90,
    )

    for i, (wedge, label) in enumerate(zip(wedges, labels)):
        # Direction of the wedge's bisector on the unit circle.
        ang = (wedge.theta2 - wedge.theta1)/2. + wedge.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        # Labels go to whichever side the wedge points to; x == 0 is treated
        # as the right-hand side instead of failing a dictionary lookup.
        side = 1 if x >= 0 else -1
        arrowprops = dict(
            arrowstyle="-",
            connectionstyle="angle,angleA=0,angleB={}".format(ang),
            # Wrap around the palette so having more wedges than palette
            # entries does not raise IndexError.
            color=colors[i % len(colors)],
        )
        pie_ax.annotate(
            label, xy=(x, y), xytext=(1.35*side, 1.4*y),
            horizontalalignment="left" if side > 0 else "right",
            xycoords='data', textcoords='data',
            arrowprops=arrowprops, zorder=0, va="center",
        )
    pie_ax.axis('equal')

    plt.show()

In [62]:
# Columns read from the battery-details dataset.
cols1 = [
    'id',
    'sample_id',
    'charger',
    'health',
]
dfBatteryDetails = prepare('2-datasets/battery_details.parquet', cols1)

# Columns read from the samples dataset.
# A wider selection, currently disabled and kept for reference:
#   'id', 'device_id', 'timestamp', 'battery_level', 'network_status',
#   'screen_brightness', 'screen_on', 'timezone', 'country_code',
#   'period', 'size', 'direction', 'ppm'
cols2 = [
    'id',
    'timestamp',
    'battery_level',
    'period',
    'direction',
    'ppm',
]
dfSamples = prepare('2-datasets/samplesPPM.parquet', cols2)

In [63]:
# Inner join: keep only rows where samples.id matches battery_details.sample_id.
# Both frames carry an 'id' column, so pandas suffixes them as id_x / id_y.
df_inner = dfSamples.merge(
    dfBatteryDetails,
    how='inner',
    left_on='id',
    right_on='sample_id',
)

In [49]:
# Remove the identifier and per-sample columns, then collapse rows that are
# identical in the columns that remain.
df_inner_droppedDup = (
    df_inner
    .drop(columns=['id_x', 'id_y', 'sample_id', 'timestamp', 'battery_level'])
    .drop_duplicates()
)

In [50]:
# Preview the first 100 de-duplicated rows.
df_inner_droppedDup.iloc[:100]

Unnamed: 0,period,direction,ppm,charger,health
0,1,-1.0,0.2026,UNPLUGGED,GOOD
11,2,-1.0,0.1869,UNPLUGGED,GOOD
60,4,1.0,0.7848,AC,GOOD
107,5,-1.0,0.1449,UNPLUGGED,GOOD
118,6,1.0,0.5735,AC,GOOD
150,13,-1.0,0.1536,UNPLUGGED,GOOD
165,14,1.0,0.4161,AC,GOOD
177,15,-1.0,0.2050,UNPLUGGED,GOOD
239,16,1.0,0.6333,AC,GOOD
298,19,-1.0,0.2457,UNPLUGGED,GOOD


In [54]:
def batteryDetailsGroupBy(dataframe, var, direction):
    """Summarise ``ppm`` per (``var``, direction) group for one direction.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``var``, ``'direction'`` and ``'ppm'``.
    var : str
        Name of the column to group by together with ``'direction'``.
    direction : scalar
        Value of the ``'direction'`` column to keep in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``var``, ``direction``, ``average_ppm``, ``std`` and ``count``,
        sorted by ``average_ppm`` descending. Index labels are the row numbers
        of the full (all directions) summary.
    """
    ppm_by_group = dataframe.groupby([var, 'direction'])['ppm']
    summary = ppm_by_group.agg(average_ppm='mean', std='std', count='count').reset_index()
    ranked = summary.sort_values(['average_ppm'], ascending=False)
    keep = ranked['direction'] == direction
    return ranked[keep]

# One summary per grouping column and direction value (1 first, then -1).
dfChargerGBPos, dfChargerGBNeg = (
    batteryDetailsGroupBy(df_inner_droppedDup, 'charger', sign) for sign in (1, -1)
)

dfHealthGBPos, dfHealthGBNeg = (
    batteryDetailsGroupBy(df_inner_droppedDup, 'health', sign) for sign in (1, -1)
)

In [55]:
# Charger summary for direction == 1, followed by direction == -1.
print(dfChargerGBPos, dfChargerGBNeg, sep='\n')

     charger  direction  average_ppm       std   count
1         AC        1.0     0.691381  0.680846  131690
3  UNPLUGGED        1.0     0.544329  0.590443   16929
5        USB        1.0     0.497072  1.254475   11502
     charger  direction  average_ppm       std   count
0         AC       -1.0     0.457355  1.348891   16348
4        USB       -1.0     0.447726  1.253840    3658
2  UNPLUGGED       -1.0     0.317061  0.893696  133989


In [56]:
# Health summary for direction == 1, followed by direction == -1.
print(dfHealthGBPos, dfHealthGBNeg, sep='\n')

                 health  direction  average_ppm       std   count
11  UNSPECIFIED FAILURE        1.0     0.844088  0.325824      26
9               UNKNOWN        1.0     0.723302  0.905791    1050
3                  GOOD        1.0     0.661653  0.730813  158810
5          OVER VOLTAGE        1.0     0.601324  0.315470      55
7              OVERHEAT        1.0     0.535529  0.295972      42
1                  DEAD        1.0     0.478884  0.204937     138
                 health  direction  average_ppm       std   count
0                  DEAD       -1.0     0.706251  0.703242     214
6              OVERHEAT       -1.0     0.601542  0.647767     224
10  UNSPECIFIED FAILURE       -1.0     0.589336  0.635228      33
8               UNKNOWN       -1.0     0.495685  1.361943    1750
4          OVER VOLTAGE       -1.0     0.342498  0.269110     171
2                  GOOD       -1.0     0.332223  0.958499  151603


In [52]:
# Rows reported as UNPLUGGED while direction is 1.0.
# NOTE(review): the recorded output lists columns (size, voltage, temperature)
# that the load cell visible in this notebook does not request -- the output
# may come from an earlier run; re-run to confirm.
df_inner[df_inner['charger'].eq("UNPLUGGED") & df_inner['direction'].eq(1.0)]

Unnamed: 0,id_x,period,timestamp,battery_level,size,direction,ppm,id_y,sample_id,charger,health,voltage,temperature
837,750456,39,2017-10-21 02:29:42,72,69,1.0,0.4382,750335,750456,UNPLUGGED,GOOD,4.04,32.200001
1430,854766,62,2017-10-23 00:29:31,85,24,1.0,0.2762,854627,854766,UNPLUGGED,GOOD,4.10,31.500000
1631,879080,65,2017-10-23 15:02:03,59,48,1.0,0.8263,878941,879080,UNPLUGGED,GOOD,4.06,29.200001
1637,890091,69,2017-10-23 15:50:34,57,31,1.0,0.4604,889951,890091,UNPLUGGED,GOOD,3.86,31.400000
2107,1044830,111,2017-10-26 21:38:32,55,10,1.0,0.3232,1044611,1044830,UNPLUGGED,GOOD,3.85,42.700001
2108,1045627,111,2017-10-26 21:38:32,55,10,1.0,0.3232,1045408,1045627,UNPLUGGED,GOOD,3.85,42.700001
2163,1049415,116,2017-10-27 00:13:46,63,23,1.0,0.3714,1049196,1049415,UNPLUGGED,GOOD,3.83,39.200001
4141,1564711,218,2017-11-08 02:26:51,22,10,1.0,0.5281,1564314,1564711,UNPLUGGED,GOOD,3.85,32.000000
4219,1726999,223,2017-11-12 17:37:41,43,26,1.0,0.0583,1726583,1726999,UNPLUGGED,GOOD,3.68,28.000000
4459,1965442,270,2017-11-19 23:43:12,47,52,1.0,0.1025,1965010,1965442,UNPLUGGED,GOOD,3.81,19.500000


In [12]:
# Inspect every joined row of period 565256.
# NOTE(review): the recorded output shows voltage/temperature columns that the
# load cell visible in this notebook does not request -- the output may come
# from an earlier run; re-run to confirm.
df_inner[df_inner['period'].eq(565256)]

Unnamed: 0,id_x,period,direction,ppm,id_y,sample_id,charger,health,voltage,temperature
4595945,5472681,565256,1.0,6.0,5469206,5472681,AC,GOOD,3.61,30.0
4595946,5472680,565256,1.0,6.0,5469205,5472680,AC,GOOD,3.61,30.0
4595947,5472678,565256,1.0,6.0,5469203,5472678,AC,GOOD,3.61,30.0
4595948,5472676,565256,1.0,6.0,5469201,5472676,AC,GOOD,3.62,30.0
4595949,5472675,565256,1.0,6.0,5469200,5472675,AC,GOOD,3.62,30.0
4595950,5472673,565256,1.0,6.0,5469198,5472673,AC,GOOD,3.62,30.0
4595951,5472753,565256,1.0,6.0,5469278,5472753,AC,GOOD,3.62,30.0
4595952,5472752,565256,1.0,6.0,5469277,5472752,AC,GOOD,3.62,30.0
4595953,5472647,565256,1.0,6.0,5469172,5472647,AC,GOOD,3.62,31.0
4595954,5472646,565256,1.0,6.0,5469171,5472646,AC,GOOD,3.62,31.0


In [64]:
# Inspect every joined row of period 802384.
df_inner[df_inner['period'].eq(802384)]

Unnamed: 0,id_x,timestamp,battery_level,period,direction,ppm,id_y,sample_id,charger,health
6218042,7051617,2018-08-18 02:10:32,21,802384,-1.0,63.157902,7041614,7051617,USB,GOOD
6218043,7054136,2018-08-18 02:10:34,20,802384,-1.0,63.157902,7044133,7054136,USB,GOOD
6218044,7054139,2018-08-18 02:10:35,19,802384,-1.0,63.157902,7044136,7054139,USB,GOOD
6218045,7054143,2018-08-18 02:10:36,18,802384,-1.0,63.157902,7044140,7054143,USB,GOOD
6218046,7054145,2018-08-18 02:10:37,17,802384,-1.0,63.157902,7044142,7054145,USB,GOOD
6218047,7054146,2018-08-18 02:10:38,16,802384,-1.0,63.157902,7044143,7054146,USB,GOOD
6218048,7054149,2018-08-18 02:10:39,15,802384,-1.0,63.157902,7044146,7054149,USB,GOOD
6218049,7054151,2018-08-18 02:10:40,13,802384,-1.0,63.157902,7044148,7054151,USB,GOOD
6218050,7054153,2018-08-18 02:10:41,12,802384,-1.0,63.157902,7044150,7054153,USB,GOOD
6218051,7054134,2018-08-18 02:10:42,11,802384,-1.0,63.157902,7044131,7054134,USB,GOOD
