In [34]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

%matplotlib inline

# configure visualizations
# Apply seaborn's 'whitegrid' style to the figures drawn after this cell.
sns.set_style('whitegrid')
# Presumably a default (width, height) figure size; NOTE(review): it is not
# referenced by any of the cells visible here - confirm it is still needed.
figsize=(8,6)

In [35]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the parquet file; forwarded to ``pq.read_table``.
    columns : optional
        Forwarded to ``pq.read_table`` as ``columns``; ``None`` by default.
    use_threads : bool, optional
        Forwarded to ``pq.read_table`` as ``use_threads``; ``True`` by default.

    Returns
    -------
    The result of calling ``.to_pandas()`` on the table that was read.

    Raises
    ------
    Exception
        Whatever reading or converting the file raised. The message is still
        printed, but the error is no longer swallowed: returning ``None``
        only moved the failure to the first attribute access on the result,
        where it surfaced as an unrelated ``AttributeError``.
    """
    try:
        return pq.read_table(path, columns=columns, use_threads=use_threads).to_pandas()
    except Exception as e:
        # Keep the short message in the cell output, then propagate so the
        # notebook stops at the real cause instead of continuing with None.
        print(e)
        raise

def downcast_ints(df):
    """Shrink the integer columns of ``df`` to the smallest unsigned dtype.

    Columns picked by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')`` and written back into ``df``.
    The frame is modified in place and the same object is returned.
    """
    # Convert only the integer-typed columns; every other column is untouched.
    shrunk = (
        df.select_dtypes(include=['int'])
          .apply(pd.to_numeric, downcast='unsigned')
    )

    # Write the converted columns back under their original names.
    df[shrunk.columns] = shrunk
    return df
        
def prepare(path,cols):
    """Load the selected columns of a parquet file and normalise the frame.

    Parameters
    ----------
    path
        Location of the parquet file, handed to ``load_df``.
    cols
        Columns to read, handed to ``load_df`` as its ``columns`` argument.

    Returns
    -------
    The loaded DataFrame with a fresh 0..n-1 index (the old index is
    discarded) after it has been passed through ``downcast_ints``.

    Raises
    ------
    RuntimeError
        If ``load_df`` produced no frame for ``path``.
    """
    df = load_df(path,cols)
    if df is None:
        # A None result means the load failed; stop here with a message that
        # names the file rather than with an AttributeError on the next line.
        raise RuntimeError(
            "could not load {!r} (columns={!r})".format(path, cols)
        )

    df = df.reset_index(drop=True)
    df = downcast_ints(df)

    return df

In [36]:
def pieOthers(dfVars,dfVals,threshold):
    """Draw a pie chart of the largest categories with a percentage legend.

    Parameters
    ----------
    dfVars
        Category labels, one per value.
    dfVals
        Numeric values matching ``dfVars``; they do not need to be sorted.
    threshold : int
        When greater than 0, only the ``threshold`` largest categories get
        their own slice and everything else is merged into a single
        ``'Others'`` slice. When 0 or negative, every category is drawn.

    Returns
    -------
    None. The chart is drawn with pyplot and shown via ``plt.show()``.
    """
    df = pd.DataFrame(
        data = {'var': dfVars, 'value' : dfVals},
        ).sort_values('value', ascending = False)

    if threshold > 0 :
        df2 = df.iloc[:threshold].copy()
        # The remainder must come from the *sorted* frame: slicing the raw
        # input would add up whichever values happen to sit after position
        # `threshold` in the caller's order, not the ones left out above.
        remainder = df['value'].iloc[threshold:]
        if len(remainder) > 0:
            # Only add the slice when something was actually left out, so
            # the legend does not get a meaningless "Others - 0.00 %" entry.
            new_row = pd.DataFrame(data = {
                'var' : ['Others'],
                'value' : [remainder.sum()]
            })
            df2 = pd.concat([df2, new_row], ignore_index=True)
    else:
        df2 = df

    # Legend entries show each slice's share of the plotted total.
    percent = 100.*df2['value']/df2['value'].sum()
    labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(df2['var'], percent)]

    colorsX = plt.cm.tab20.colors
    patches, texts = plt.pie(df2['value'], shadow=True, startangle=90, colors = colorsX)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
    
def pieOthers2(dfVars,dfVals,threshold):
    """Draw a pie chart whose slices are labelled by call-out annotations.

    Parameters
    ----------
    dfVars
        Category labels, one per value.
    dfVals
        Numeric values matching ``dfVars``; they do not need to be sorted.
    threshold : int
        When greater than 0, only the ``threshold`` largest categories get
        their own slice and everything else is merged into a single
        ``'Others'`` slice. When 0 or negative, every category is drawn.

    Returns
    -------
    None. A new 7x5 figure is created, annotated and shown via ``plt.show()``.
    """
    df = pd.DataFrame(
        data = {'var': dfVars, 'value' : dfVals},
        ).sort_values('value', ascending = False)

    if threshold > 0 :
        df2 = df.iloc[:threshold].copy()
        # The remainder must come from the *sorted* frame: slicing the raw
        # input would add up whichever values happen to sit after position
        # `threshold` in the caller's order, not the ones left out above.
        remainder = df['value'].iloc[threshold:]
        if len(remainder) > 0:
            # Only add the slice when something was actually left out.
            new_row = pd.DataFrame(data = {
                'var' : ['Others'],
                'value' : [remainder.sum()]
            })
            df2 = pd.concat([df2, new_row], ignore_index=True)
    else:
        df2 = df

    # Annotation text shows each slice's share of the plotted total.
    percent = 100.*df2['value']/df2['value'].sum()
    labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(df2['var'], percent)]

    # The grid leaves margins on the right and top of the figure.
    fig=plt.figure(figsize=(7,5))
    gs1 = gridspec.GridSpec(1,1,
        left=0.1,right=0.7,
        bottom=0.1,top=0.7,
    )
    pie_ax=fig.add_subplot(gs1[0])

    colors = plt.cm.tab20.colors
    wedges, texts = pie_ax.pie(
        df2['value'],
        shadow=True,
        colors=colors,
        startangle=90,
    )
    kw = dict(xycoords='data', textcoords='data', arrowprops=dict(arrowstyle="-"), zorder=0, va="center")

    for i, p in enumerate(wedges):
        # Point at the middle of the wedge, on the unit circle.
        ang = (p.theta2 - p.theta1)/2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        # Treat x == 0 as the right-hand side; a {-1, 1} lookup on sign(x)
        # has no entry for 0 and would raise KeyError.
        side = -1 if x < 0 else 1
        horizontalalignment = "right" if side < 0 else "left"
        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
        # Wrap around the palette so more wedges than palette entries does
        # not raise IndexError.
        kw["arrowprops"].update({"connectionstyle": connectionstyle,"color":colors[i % len(colors)]})
        pie_ax.annotate(labels[i], xy=(x, y), xytext=(1.35*side, 1.4*y),
                     horizontalalignment=horizontalalignment, **kw)
    pie_ax.axis('equal')

    plt.show()

In [37]:
# Settings table: the join key plus the device settings read for this analysis.
cols1 = [
    'sample_id',
    'bluetooth_enabled',
    'location_enabled',
    'power_saver_enabled',
    'flashlight_enabled',
    'nfc_enabled',
    'developer_mode',
]
dfSettings = prepare('datasets/settings.parquet', cols1)

# Samples table: only the four columns below are read. A wider selection was
# previously kept here as commented-out code: 'id', 'device_id', 'timestamp',
# 'battery_level', 'network_status', 'screen_brightness', 'screen_on',
# 'timezone', 'country_code', 'period', 'size', 'direction', 'ppm'.
cols2 = ['id', 'period', 'direction', 'ppm']
dfSamples = prepare('datasets/samplesPPM.parquet', cols2)

In [38]:
# Inner join: keep only samples that have a settings row
# (samples.id == settings.sample_id).
df_inner = dfSamples.merge(
    dfSettings,
    left_on='id',
    right_on='sample_id',
    how='inner',
)

In [39]:
# Remove both join keys, then collapse rows that are identical across all
# remaining columns.
df_inner_droppedDup = (
    df_inner
    .drop(columns=['id', 'sample_id'])
    .drop_duplicates()
)

In [41]:
# Preview the first 100 rows of the de-duplicated frame.
df_inner_droppedDup.head(100)

Unnamed: 0,period,direction,ppm,bluetooth_enabled,location_enabled,power_saver_enabled,flashlight_enabled,nfc_enabled,developer_mode
0,1,-1.0,0.2026,0,1,0,0,0,0
11,2,-1.0,0.1869,0,1,0,0,0,0
60,4,1.0,0.7848,0,1,0,0,0,0
107,5,-1.0,0.1449,0,1,0,0,0,0
118,6,1.0,0.5735,0,1,0,0,0,0
150,13,-1.0,0.1536,0,1,0,0,0,0
165,14,1.0,0.4161,0,1,0,0,0,0
177,15,-1.0,0.2050,0,1,0,0,0,0
239,16,1.0,0.6333,0,1,0,0,0,0
298,19,-1.0,0.2457,0,1,0,0,0,0


In [42]:
def settingsGroupBy(dataframe, var, direction):
    """Summarise ``ppm`` per value of ``var`` for one ``direction``.

    The frame is grouped by ``var`` and ``'direction'``; for every group the
    mean, standard deviation and count of ``'ppm'`` are computed (columns
    ``average_ppm``, ``std`` and ``count``). The summary is ordered by
    ``average_ppm`` from highest to lowest, and only the rows whose
    ``'direction'`` equals ``direction`` are returned. Row labels are those
    assigned before the direction filter is applied.
    """
    ppm_by_group = dataframe.groupby([var, 'direction'])['ppm']
    stats = ppm_by_group.agg(average_ppm='mean', std='std', count='count')

    # Flatten the group keys into columns, then rank by mean ppm.
    ranked = stats.reset_index().sort_values(['average_ppm'], ascending=False)

    wanted = ranked['direction'] == direction
    return ranked.loc[wanted]

# For every setting, build the ppm summary for direction 1 ("Pos") and for
# direction -1 ("Neg"), in that order.
dfBluetoothGBPos, dfBluetoothGBNeg = (
    settingsGroupBy(df_inner_droppedDup, 'bluetooth_enabled', sign) for sign in (1, -1)
)

dfLocationGBPos, dfLocationGBNeg = (
    settingsGroupBy(df_inner_droppedDup, 'location_enabled', sign) for sign in (1, -1)
)

dfPowerSaverGBPos, dfPowerSaverGBNeg = (
    settingsGroupBy(df_inner_droppedDup, 'power_saver_enabled', sign) for sign in (1, -1)
)

dfFlashlightGBPos, dfFlashlightGBNeg = (
    settingsGroupBy(df_inner_droppedDup, 'flashlight_enabled', sign) for sign in (1, -1)
)

dfNfcGBPos, dfNfcGBNeg = (
    settingsGroupBy(df_inner_droppedDup, 'nfc_enabled', sign) for sign in (1, -1)
)

dfDeveloperGBPos, dfDeveloperGBNeg = (
    settingsGroupBy(df_inner_droppedDup, 'developer_mode', sign) for sign in (1, -1)
)


In [43]:
# Bluetooth: direction 1 summary first, then direction -1.
print(dfBluetoothGBPos)
print(dfBluetoothGBNeg)

   bluetooth_enabled  direction  average_ppm       std   count
1                  0        1.0     0.683141  0.764208  119917
3                  1        1.0     0.627560  0.609182   25786
   bluetooth_enabled  direction  average_ppm       std   count
0                  0       -1.0     0.330678  0.898402  123821
2                  1       -1.0     0.266102  1.130234   32581


In [44]:
# Location: direction 1 summary first, then direction -1.
print(dfLocationGBPos)
print(dfLocationGBNeg)

   location_enabled  direction  average_ppm       std  count
1                 0        1.0     0.687179  0.874465  76455
3                 1        1.0     0.657987  0.552951  69248
   location_enabled  direction  average_ppm       std  count
0                 0       -1.0     0.347414  1.011627  82541
2                 1       -1.0     0.283489  0.878749  73861


In [45]:
# Power saver: direction 1 summary first, then direction -1.
print(dfPowerSaverGBPos)
print(dfPowerSaverGBNeg)

   power_saver_enabled  direction  average_ppm       std   count
3                    1        1.0     0.733029  0.414390    9357
1                    0        1.0     0.669206  0.756479  136346
   power_saver_enabled  direction  average_ppm       std   count
2                    1       -1.0     0.350739  1.347074   16648
0                    0       -1.0     0.313233  0.892960  139754


In [46]:
# Flashlight: direction 1 summary first, then direction -1.
print(dfFlashlightGBPos)
print(dfFlashlightGBNeg)

   flashlight_enabled  direction  average_ppm       std   count
1                   0        1.0     0.673305  0.739446  145703
   flashlight_enabled  direction  average_ppm       std   count
0                   0       -1.0     0.317226  0.951724  156402


In [47]:
# NFC: direction 1 summary first, then direction -1.
print(dfNfcGBPos)
print(dfNfcGBNeg)

   nfc_enabled  direction  average_ppm       std   count
3            1        1.0     0.695315  0.425057   19718
1            0        1.0     0.669860  0.777171  125985
   nfc_enabled  direction  average_ppm       std   count
0            0       -1.0     0.327244  1.011395  134464
2            1       -1.0     0.255822  0.428262   21938


In [48]:
# Developer mode: direction 1 summary first, then direction -1.
print(dfDeveloperGBPos)
print(dfDeveloperGBNeg)

   developer_mode  direction  average_ppm       std   count
1               0        1.0     0.676007  0.765011  131367
3               1        1.0     0.648544  0.440103   14336
   developer_mode  direction  average_ppm       std   count
0               0       -1.0     0.318911  0.994141  137397
2               1       -1.0     0.305044  0.555807   19005
