In [35]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

%matplotlib inline

# configure visualizations
# Apply seaborn's 'whitegrid' style to figures drawn after this cell runs.
sns.set_style('whitegrid')
# Presumably a default (width, height) figure size following matplotlib's
# figsize convention. NOTE(review): the plotting helpers visible in this part
# of the notebook do not read it — confirm it is still used elsewhere.
figsize=(8,6)

In [36]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Parameters
    ----------
    path : forwarded unchanged to ``pq.read_table``.
    columns : optional column selection, forwarded to ``pq.read_table``
        (``None`` is passed through as-is).
    use_threads : forwarded to ``pq.read_table``.

    Returns
    -------
    The result of ``pq.read_table(...).to_pandas()``.

    Raises
    ------
    Whatever exception the read or the conversion raised. The error is still
    printed (now with the offending path), but it is re-raised instead of being
    swallowed: returning ``None`` only postponed the failure to the first
    attribute access on the missing frame, with a far less helpful message.
    """
    try:
        return pq.read_table(path, columns=columns, use_threads=use_threads).to_pandas()
    except Exception as e:
        print('Failed to load {0}: {1}'.format(path, e))
        raise

def downcast_ints(df):
    """Shrink the 'int' columns of ``df`` to the smallest unsigned type that fits.

    Columns are picked with ``select_dtypes(include=['int'])`` and each one is
    passed through ``pd.to_numeric(..., downcast='unsigned')``. The converted
    columns are written back into ``df`` itself, and that same object is
    returned.
    """
    int_only = df.select_dtypes(include=['int'])
    shrunk = int_only.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))

    # Write the converted columns back over the originals (mutates the argument).
    df[shrunk.columns] = shrunk
    return df
        
def prepare(path,cols):
    """Load ``cols`` from the Parquet file at ``path`` and tidy the result.

    The frame returned by ``load_df`` gets a fresh 0..n-1 index (the old index
    is discarded) and is then handed to ``downcast_ints``, whose return value
    is the result.
    """
    loaded = load_df(path, cols)
    renumbered = loaded.reset_index(drop=True)
    return downcast_ints(renumbered)

In [37]:
def _top_n_with_others(dfVars, dfVals, threshold):
    """Build the label/value table shared by ``pieOthers`` and ``pieOthers2``.

    The inputs become the 'var' and 'value' columns of a DataFrame that is
    sorted by 'value', largest first. With a positive ``threshold`` only the
    ``threshold`` largest rows are kept and every remaining row is summed into
    one trailing 'Others' row; with ``threshold <= 0`` all rows are returned.
    """
    ranked = pd.DataFrame(
        data={'var': dfVars, 'value': dfVals},
    ).sort_values('value', ascending=False)

    if threshold <= 0:
        return ranked

    top = ranked.iloc[:threshold]
    # Take the remainder from the *sorted* table so that 'Others' is the sum of
    # exactly the rows that were cut, whatever order the caller passed them in.
    remainder = ranked['value'].iloc[threshold:]
    if remainder.empty:
        # Nothing was cut: do not add an empty 'Others' wedge.
        return top

    others = pd.DataFrame(data={'var': ['Others'], 'value': [remainder.sum()]})
    return pd.concat([top, others], ignore_index=True)


def _percent_labels(table):
    """Return one '<var> - <share> %' label per row of ``table``."""
    share = 100. * table['value'] / table['value'].sum()
    return ['{0} - {1:1.2f} %'.format(name, pct) for name, pct in zip(table['var'], share)]


def pieOthers(dfVars,dfVals,threshold):
    """Draw a pie chart of the largest values with a percentage legend.

    Parameters
    ----------
    dfVars : labels, used as the 'var' column (presumably a Series or an
        equal-length sequence matching ``dfVals`` — confirm with callers).
    dfVals : numeric values, used as the 'value' column.
    threshold : number of largest entries shown individually; the rest are
        merged into an 'Others' slice. Values <= 0 show every entry.

    The chart is drawn on the current pyplot figure and shown immediately;
    nothing is returned.
    """
    table = _top_n_with_others(dfVars, dfVals, threshold)
    labels = _percent_labels(table)

    patches, _ = plt.pie(table['value'], shadow=True, startangle=90, colors=plt.cm.tab20.colors)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()


def pieOthers2(dfVars,dfVals,threshold):
    """Draw a pie chart whose slices are annotated with connector lines.

    Takes the same arguments as ``pieOthers`` and prepares the data the same
    way, but draws on a new 7x5 figure and labels each wedge with an annotation
    placed outside the pie instead of a legend. Nothing is returned.
    """
    table = _top_n_with_others(dfVars, dfVals, threshold)
    labels = _percent_labels(table)

    fig = plt.figure(figsize=(7,5))
    # Leave room on the right and top of the figure for the outside labels.
    gs1 = gridspec.GridSpec(1,1,
        left=0.1,right=0.7,
        bottom=0.1,top=0.7,
    )
    pie_ax = fig.add_subplot(gs1[0])

    colors = plt.cm.tab20.colors
    wedges, _ = pie_ax.pie(
        table['value'],
        shadow=True,
        colors=colors,
        startangle=90,
    )

    for i, wedge in enumerate(wedges):
        # Annotate at the wedge's mid-angle, on the unit circle.
        ang = (wedge.theta2 - wedge.theta1)/2. + wedge.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        # Choose the label side without a sign lookup, so x == 0 cannot raise.
        side = 1 if x >= 0 else -1
        horizontalalignment = "left" if side > 0 else "right"
        arrowprops = dict(
            arrowstyle="-",
            connectionstyle="angle,angleA=0,angleB={}".format(ang),
            # Wrap around the palette so more wedges than colours cannot raise.
            color=colors[i % len(colors)],
        )
        pie_ax.annotate(labels[i], xy=(x, y), xytext=(1.35*side, 1.4*y),
                        horizontalalignment=horizontalalignment,
                        xycoords='data', textcoords='data',
                        arrowprops=arrowprops, zorder=0, va="center")
    pie_ax.axis('equal')

    plt.show()

In [38]:
# Device metadata columns. 'id' is the key that the merge cell matches against
# the samples' 'device_id'.
cols1 = ['id','model', 'manufacturer', 'brand', 'product', 'os_version', 'codename']
dfDeviceDetails = prepare('datasets/devices.parquet',cols1)

# Wider column list for the samples file, kept commented out for reference;
# only the subset assigned below is actually loaded.
#cols2 = ['id','device_id', 'timestamp', 'battery_level', 'network_status', 'screen_brightness', 'screen_on', 'timezone', 'country_code', 'period', 'size', 'direction', 'ppm']
cols2 = ['id','device_id', 'period', 'direction', 'ppm']
dfSamples = prepare('datasets/samplesPPM.parquet',cols2)

In [39]:
# Attach device attributes to each sample; the inner join keeps only samples
# whose 'device_id' matches a device 'id'. Both inputs were loaded with an 'id'
# column, so pandas suffixes them as 'id_x' (samples) and 'id_y' (devices).
df_inner = pd.merge(dfSamples, dfDeviceDetails, left_on='device_id', right_on='id', how='inner')

In [40]:
# Remove the three identifier columns, then collapse rows that are identical in
# every remaining column (period, direction, ppm and the device attributes).
df_inner_droppedDup = df_inner.drop(['id_x', 'device_id', 'id_y'], axis=1).drop_duplicates()

In [41]:
# Preview the first 100 rows of the de-duplicated table.
df_inner_droppedDup.head(100)

Unnamed: 0,period,direction,ppm,model,manufacturer,brand,product,os_version,codename
0,1,-1.0,0.2026,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
11,2,-1.0,0.1869,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
60,4,1.0,0.7848,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
107,5,-1.0,0.1449,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
118,6,1.0,0.5735,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
150,13,-1.0,0.1536,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
165,14,1.0,0.4161,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
177,15,-1.0,0.2050,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
239,16,1.0,0.6333,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
298,19,-1.0,0.2457,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow


In [44]:
def devicesGroupBy(dataframe, var, direction, minCount):
    """Rank the groups of ``var`` by mean 'ppm' for one direction.

    ``dataframe`` is grouped by ``var`` and 'direction'; for every group the
    mean, standard deviation and count of 'ppm' are computed (columns
    'average_ppm', 'std' and 'count'). The table is sorted by 'average_ppm',
    highest first, and only rows whose 'direction' equals ``direction`` and
    whose 'count' is at least ``minCount`` are returned.
    """
    ppm_by_group = dataframe.groupby([var, 'direction'])['ppm']
    stats = ppm_by_group.agg(average_ppm='mean', std='std', count='count')
    ranked = stats.reset_index().sort_values(['average_ppm'], ascending=False)

    wanted_direction = ranked['direction'] == direction
    enough_samples = ranked['count'] >= minCount
    return ranked.loc[wanted_direction & enough_samples]

# Minimum number of samples a group needs before its statistics are reported.
# Named once here instead of repeating the literal in every call below.
MIN_GROUP_COUNT = 30

# One ranking per device attribute and per direction value (1 and -1).
dfModelGBPos = devicesGroupBy(df_inner_droppedDup, 'model', 1, MIN_GROUP_COUNT)
dfModelGBNeg = devicesGroupBy(df_inner_droppedDup, 'model', -1, MIN_GROUP_COUNT)

dfManufacturerGBPos = devicesGroupBy(df_inner_droppedDup, 'manufacturer', 1, MIN_GROUP_COUNT)
dfManufacturerGBNeg = devicesGroupBy(df_inner_droppedDup, 'manufacturer', -1, MIN_GROUP_COUNT)

dfBrandGBPos = devicesGroupBy(df_inner_droppedDup, 'brand', 1, MIN_GROUP_COUNT)
dfBrandGBNeg = devicesGroupBy(df_inner_droppedDup, 'brand', -1, MIN_GROUP_COUNT)

dfProductGBPos = devicesGroupBy(df_inner_droppedDup, 'product', 1, MIN_GROUP_COUNT)
dfProductGBNeg = devicesGroupBy(df_inner_droppedDup, 'product', -1, MIN_GROUP_COUNT)

dfOsVersionGBPos = devicesGroupBy(df_inner_droppedDup, 'os_version', 1, MIN_GROUP_COUNT)
dfOsVersionGBNeg = devicesGroupBy(df_inner_droppedDup, 'os_version', -1, MIN_GROUP_COUNT)

dfCodenameGBPos = devicesGroupBy(df_inner_droppedDup, 'codename', 1, MIN_GROUP_COUNT)
dfCodenameGBNeg = devicesGroupBy(df_inner_droppedDup, 'codename', -1, MIN_GROUP_COUNT)


In [45]:
# Per-model ranking for direction 1, followed by direction -1.
print(dfModelGBPos)
print(dfModelGBNeg)

                  model  direction  average_ppm        std  count
2060            LG-X210        1.0     6.291762  10.564640    193
758             C5-S508        1.0     2.834986   2.503366    251
2356           MITO T71        1.0     1.811050   0.278333     36
2717  POSITIVO BGH M840        1.0     1.649336   0.578855     91
226                  A1        1.0     1.632209   0.416943     33
...                 ...        ...          ...        ...    ...
2845               R50A        1.0     0.202071   0.101621     55
1599   ITEL IT1508 PLUS        1.0     0.199174   0.064392     38
661       BLADE V6 PLUS        1.0     0.186491   0.064600     35
969      ELEPHONE P8000        1.0     0.168060   0.056947    105
1794       LENOVO P1A42        1.0     0.150529   0.129557     58

[657 rows x 5 columns]
                    model  direction  average_ppm        std  count
2323       MICROMAX Q4202       -1.0     3.604729   8.455028    254
2871           RCT6513W87       -1.0     3.32256

In [46]:
# Inspect the individual rows recorded for the 'LG-X210' model.
df_inner_droppedDup.loc[df_inner_droppedDup['model'] == 'LG-X210']

Unnamed: 0,period,direction,ppm,model,manufacturer,brand,product,os_version,codename
6217873,802373,-1.0,0.567500,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6217885,802379,1.0,55.531898,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6217973,802380,-1.0,0.199800,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218007,802383,-1.0,3.816300,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218042,802384,-1.0,63.157902,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218062,802385,1.0,0.000000,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218075,802386,-1.0,0.093200,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218138,802387,1.0,51.000000,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218156,802389,1.0,38.947399,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop
6218205,802390,-1.0,0.098100,LG-X210,ALPS,LGE,M13G_GLOBAL_COM,5.1,Lollipop


In [26]:
#df_inner_droppedDup.loc[df_inner_droppedDup['model']=='1016']

In [47]:
# Per-manufacturer ranking for direction 1, followed by direction -1.
print(dfManufacturerGBPos)
print(dfManufacturerGBNeg)

               manufacturer  direction  average_ppm       std  count
311                   INSYS        1.0     2.530705  2.427332    294
550            POSITIVO BGH        1.0     1.649336  0.578855     91
199                EUROSTAR        1.0     1.630676  0.419291    134
112             CELKON_Q405        1.0     1.612990  0.358663     41
725                  VEGA-1        1.0     1.589865  0.461121     75
..                      ...        ...          ...       ...    ...
542  PLUS ONE JAPAN LIMITED        1.0     0.325161  0.164636     41
137                    CMDC        1.0     0.324062  0.111897    113
607                   SHARP        1.0     0.275359  0.122830     99
168                 DIGICEL        1.0     0.266593  0.111740     42
571                    R50A        1.0     0.202071  0.101621     55

[147 rows x 5 columns]
            manufacturer  direction  average_ppm       std  count
603             SERVICOM       -1.0     2.984075  7.046419     40
111          CEL

In [48]:
# Per-brand ranking for direction 1, followed by direction -1.
print(dfBrandGBPos)
print(dfBrandGBNeg)

           brand  direction  average_ppm       std  count
312        INSYS        1.0     2.834986  2.503366    251
195       EPAD-7        1.0     1.630676  0.419291    134
115  CELKON_Q405        1.0     1.612990  0.358663     41
723       VEGA-1        1.0     1.589865  0.461121     75
282     I-MOBILE        1.0     1.488354  0.444645     65
..           ...        ...          ...       ...    ...
559     POLAROID        1.0     0.307819  0.128363     57
37       ALLCALL        1.0     0.289649  0.180930    185
165      DIGICEL        1.0     0.266593  0.111740     42
619        SHARP        1.0     0.241747  0.049609     86
590         R50A        1.0     0.186433  0.078812     51

[148 rows x 5 columns]
                 brand  direction  average_ppm       std  count
616  SERVICOM SERVICOM       -1.0     3.058879  7.122425     39
114        CELKON_Q405       -1.0     2.037490  0.522809     40
591                RCA       -1.0     1.870828  7.035153    562
722             VEGA-1  

In [49]:
# Per-product ranking for direction 1, followed by direction -1.
print(dfProductGBPos)
print(dfProductGBNeg)

              product  direction  average_ppm        std  count
2611  M13G_GLOBAL_COM        1.0     6.736192  10.943134    176
861      CEDRIC_AMXLA        1.0     4.867926   9.064026     35
796           C5-S508        1.0     2.834986   2.503366    251
3723         SI7067SB        1.0     1.811050   0.278333     36
2671             M840        1.0     1.649336   0.578855     91
...               ...        ...          ...        ...    ...
3518             R50A        1.0     0.202071   0.101621     55
1974  ITEL_IT1508PLUS        1.0     0.199174   0.064392     38
3124   P635A32_MX_TEL        1.0     0.180019   0.058720     32
1176   ELEPHONE P8000        1.0     0.168060   0.056947    105
3236      PASSION_ROW        1.0     0.150529   0.129557     58

[674 rows x 5 columns]
                  product  direction  average_ppm        std  count
3447                Q4202       -1.0     3.604729   8.455028    254
3212              P895T20       -1.0     3.601461   4.869990     97
3537

In [50]:
# Per-OS-version ranking for direction 1, followed by direction -1.
print(dfOsVersionGBPos)
print(dfOsVersionGBNeg)

   os_version  direction  average_ppm       std  count
47      8.0.0        1.0     1.058752  0.214584     31
28        5.1        1.0     0.901270  1.854688  10010
7       4.1.2        1.0     0.832813  0.728974    406
26      5.0.2        1.0     0.805244  1.005590   3857
17      4.4.2        1.0     0.796635  0.976646   4095
38        7.0        1.0     0.760047  0.532656  44956
49      8.1.0        1.0     0.688099  0.395134     73
22        5.0        1.0     0.676369  1.325149   3645
42      7.1.1        1.0     0.655496  0.519029   9389
44      7.1.2        1.0     0.626902  0.344404   5010
32        6.0        1.0     0.605201  0.485120  15688
24      5.0.1        1.0     0.570982  0.333592   1399
34      6.0.1        1.0     0.570625  0.498260  32423
20      4.4.4        1.0     0.551542  0.276831   1311
13        4.3        1.0     0.540576  0.332758    161
11      4.2.2        1.0     0.539786  0.364200    726
5       4.1.1        1.0     0.531610  0.177109    157
30      5.

In [51]:
# Per-codename ranking for direction 1, followed by direction -1.
print(dfCodenameGBPos)
print(dfCodenameGBNeg)

       codename  direction  average_ppm       std  count
15         Oreo        1.0     0.798582  0.389326    104
5        KitKat        1.0     0.737198  0.867233   5406
13       Nougat        1.0     0.731644  0.519407  59472
7      Lollipop        1.0     0.715005  1.298902  27425
3    Jelly Bean        1.0     0.617375  0.496356   1467
9   Marshmallow        1.0     0.581663  0.494078  48171
       codename  direction  average_ppm       std  count
6      Lollipop       -1.0     0.448262  1.131093  30964
4        KitKat       -1.0     0.359189  0.704154   6699
2    Jelly Bean       -1.0     0.338592  0.731027   1924
8   Marshmallow       -1.0     0.297597  1.007097  59283
12       Nougat       -1.0     0.250731  0.594827  35180
14         Oreo       -1.0     0.222369  0.212070     59
