In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

%matplotlib inline

# configure visualizations
# use seaborn's 'whitegrid' style for the plots in this notebook
sns.set_style('whitegrid')
# default (width, height) for figures
# NOTE(review): no cell visible here references `figsize` — confirm it is still used
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file into a pandas DataFrame.

    Parameters
    ----------
    path : location of the parquet file, handed to ``pq.read_table``.
    columns : optional column selection, forwarded unchanged (``None`` by default).
    use_threads : forwarded unchanged to ``pq.read_table``.

    Returns
    -------
    The table converted with ``to_pandas()``. If reading or converting raises,
    the exception is printed and ``None`` is returned instead, so callers have
    to be prepared for a ``None`` result.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas()
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(err)
        return None
    return frame

def downcast_ints(df):
    """Shrink the integer columns of ``df`` to smaller unsigned dtypes where possible.

    The columns picked by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')`` and written back into ``df``.
    Columns that pandas cannot downcast to an unsigned type keep their dtype.

    The frame is modified in place and the same object is returned.
    """
    int_part = df.select_dtypes(include=['int'])
    shrunk = int_part.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
    # Assigning by column name swaps in the downcast columns on the caller's frame.
    df[shrunk.columns] = shrunk

    return df
        
def prepare(path,cols):
    """Load the requested parquet columns and return a cleaned-up DataFrame.

    Parameters
    ----------
    path : parquet file to read, passed to ``load_df``.
    cols : columns to read, passed to ``load_df``.

    Returns
    -------
    The loaded frame with a fresh 0..n-1 index (the old index is discarded)
    and its integer columns processed by ``downcast_ints``.

    Raises
    ------
    RuntimeError
        If ``load_df`` returned ``None``, i.e. the file could not be read.
    """
    df = load_df(path, cols)
    if df is None:
        # load_df prints the underlying error and returns None. Stop here with a
        # clear message instead of failing later with
        # "'NoneType' object has no attribute 'reset_index'".
        raise RuntimeError(
            "could not load '{}' (the underlying error was printed by load_df)".format(path)
        )

    df = df.reset_index(drop=True)
    return downcast_ints(df)

In [3]:
def _top_with_others(dfVars, dfVals, threshold):
    """Rank categories by value and fold everything past the top ``threshold`` into 'Others'.

    Returns a DataFrame with the columns 'var' and 'value', sorted by 'value'
    in descending order. When ``threshold`` is positive and there are more than
    ``threshold`` categories, only the largest ``threshold`` rows are kept and a
    single 'Others' row holding the sum of the remaining values is appended.
    """
    ranked = pd.DataFrame(
        data={'var': dfVars, 'value': dfVals},
    ).sort_values('value', ascending=False)

    if threshold <= 0 or len(ranked) <= threshold:
        # Nothing to fold away: grouping is disabled or every category fits,
        # so no empty 'Others - 0.00 %' entry is produced.
        return ranked

    # Sum the tail of the *sorted* values. Slicing the caller's unsorted dfVals
    # would add up whichever rows happen to sit after position `threshold`,
    # which is not the complement of the rows kept above.
    others = pd.DataFrame(data={
        'var': ['Others'],
        'value': [ranked['value'].iloc[threshold:].sum()],
    })
    return pd.concat([ranked.iloc[:threshold], others], ignore_index=True)


def _percent_labels(frame):
    """Build one '<var> - <share> %' label per row of ``frame`` (share of the 'value' total)."""
    percent = 100.*frame['value']/frame['value'].sum()
    return ['{0} - {1:1.2f} %'.format(i, j) for i, j in zip(frame['var'], percent)]


def pieOthers(dfVars,dfVals,threshold):
    """Draw a pie chart of the largest categories with a percentage legend.

    Parameters
    ----------
    dfVars : category names, one per value.
    dfVals : numeric values, one per category.
    threshold : number of largest categories to show individually; the rest
        is merged into one 'Others' slice. A value <= 0 shows every category.

    The chart is drawn on the current pyplot figure and shown with
    ``plt.show()``; nothing is returned.
    """
    df2 = _top_with_others(dfVars, dfVals, threshold)
    labels = _percent_labels(df2)

    colorsX = plt.cm.tab20.colors
    patches, texts = plt.pie(df2['value'], shadow=True, startangle=90, colors = colorsX)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()


def pieOthers2(dfVars,dfVals,threshold):
    """Draw a pie chart of the largest categories with an annotation per slice.

    Takes the same arguments as ``pieOthers``. Instead of a legend, every
    wedge gets a text label connected to it by a line in the wedge's colour.
    A new 7x5 figure is created and shown with ``plt.show()``; nothing is returned.
    """
    df2 = _top_with_others(dfVars, dfVals, threshold)
    labels = _percent_labels(df2)

    fig=plt.figure(figsize=(7,5))
    # Restrict the axes to part of the figure so the outside labels have room.
    gs1 = gridspec.GridSpec(1,1,
        left=0.1,right=0.7,
        bottom=0.1,top=0.7,
    )
    pie_ax=fig.add_subplot(gs1[0])

    colors = plt.cm.tab20.colors
    wedges, texts = pie_ax.pie(
        df2['value'],
        shadow=True,
        colors=colors,
        startangle=90,
    )
    kw = dict(xycoords='data', textcoords='data', arrowprops=dict(arrowstyle="-"), zorder=0, va="center")

    for i, p in enumerate(wedges):
        # Point on the unit circle at the middle of the wedge.
        ang = (p.theta2 - p.theta1)/2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        # Labels left of the pie are right-aligned so the text grows away from it.
        horizontalalignment = "right" if x < 0 else "left"
        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
        # pie() cycles through `colors` when there are more wedges than colours;
        # wrap the index the same way instead of running off the end of the palette.
        kw["arrowprops"].update({"connectionstyle": connectionstyle,"color":colors[i % len(colors)]})
        pie_ax.annotate(labels[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                     horizontalalignment=horizontalalignment, **kw)
    pie_ax.axis('equal')

    plt.show()

In [4]:
# device table: only the descriptive columns used for grouping, plus the 'id' join key
cols1 = ['id','model', 'manufacturer', 'brand', 'product', 'os_version', 'codename']
dfDeviceDetails = prepare('2-datasets/devices.parquet',cols1)

# samples table: the commented list is a wider column selection kept for reference;
# only the subset assigned below is actually loaded
#cols2 = ['id','device_id', 'timestamp', 'battery_level', 'network_status', 'screen_brightness', 'screen_on', 'timezone', 'country_code', 'period', 'size', 'direction', 'ppm']
cols2 = ['id','device_id', 'period', 'direction', 'ppm']
dfSamples = prepare('2-datasets/samplesPPM.parquet',cols2)

In [5]:
# Attach the device attributes to every sample row (samples.device_id -> devices.id).
# Both tables carry an 'id' column, so the result holds them as 'id_x' / 'id_y'.
df_inner = dfSamples.merge(
    dfDeviceDetails,
    how='inner',
    left_on='device_id',
    right_on='id',
)

In [6]:
# Remove the three key columns, then collapse rows that are identical in
# every remaining column.
df_inner_droppedDup = (
    df_inner
    .drop(columns=['id_x', 'device_id', 'id_y'])
    .drop_duplicates()
)

In [7]:
# preview the first 100 rows of the de-duplicated join
df_inner_droppedDup.head(100)

Unnamed: 0,period,direction,ppm,model,manufacturer,brand,product,os_version,codename
0,1,-1.0,0.2026,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
11,2,-1.0,0.1869,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
60,4,1.0,0.7848,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
107,5,-1.0,0.1449,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
118,6,1.0,0.5735,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
150,13,-1.0,0.1536,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
165,14,1.0,0.4161,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
177,15,-1.0,0.2050,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
239,16,1.0,0.6333,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow
298,19,-1.0,0.2457,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,Marshmallow


In [8]:
def devicesGroupBy(dataframe, var, direction, minCount):
    """Summarise 'ppm' per value of ``var`` for one direction.

    Parameters
    ----------
    dataframe : frame containing the columns ``var``, 'direction' and 'ppm'.
    var : name of the column to group by (together with 'direction').
    direction : value of the 'direction' column to keep in the result.
    minCount : minimum number of 'ppm' observations a group needs to be kept.

    Returns
    -------
    DataFrame with the columns ``var``, 'direction', 'average_ppm', 'std' and
    'count', ordered by 'average_ppm' from highest to lowest. The index holds
    the row positions of the full (all directions) summary before sorting and
    filtering.
    """
    ppm_by_group = dataframe.groupby([var, 'direction'])['ppm']
    summary = ppm_by_group.agg(average_ppm='mean', std='std', count='count').reset_index()
    ranked = summary.sort_values(['average_ppm'], ascending=False)

    wanted = (ranked['direction'] == direction) & (ranked['count'] >= minCount)
    return ranked.loc[wanted]

# Minimum number of samples a group needs before its PPM statistics are reported.
MIN_GROUP_SAMPLES = 30


def _stats_for_both_directions(var, minCount=MIN_GROUP_SAMPLES):
    """Return the devicesGroupBy results for ``var`` as a (direction 1, direction -1) pair."""
    return tuple(
        devicesGroupBy(df_inner_droppedDup, var, direction, minCount)
        for direction in (1, -1)
    )


# One pair of summaries per device attribute: *Pos holds direction == 1,
# *Neg holds direction == -1.
dfModelGBPos, dfModelGBNeg = _stats_for_both_directions('model')
dfManufacturerGBPos, dfManufacturerGBNeg = _stats_for_both_directions('manufacturer')
dfBrandGBPos, dfBrandGBNeg = _stats_for_both_directions('brand')
dfProductGBPos, dfProductGBNeg = _stats_for_both_directions('product')
dfOsVersionGBPos, dfOsVersionGBNeg = _stats_for_both_directions('os_version')
dfCodenameGBPos, dfCodenameGBNeg = _stats_for_both_directions('codename')


In [17]:
# PPM statistics per model, direction == 1 first, then direction == -1.
# NOTE(review): the saved output shows only a single direction == -1.0 frame and the
# execution count is out of sequence — re-run after Restart & Run All to confirm.
print(dfModelGBPos)
print(dfModelGBNeg)

               model  direction  average_ppm       std  count
2320  MICROMAX Q4202       -1.0     0.624427  0.599726    203


In [18]:
# inspect the raw rows behind a single model
# NOTE(review): the saved output lists 'MICROMAX Q4202' rows although the filter says
# 'LG-X210' — the cell appears to have been edited after its last run; re-run it or
# restore the intended model name.
df_inner_droppedDup[df_inner_droppedDup['model'] == 'LG-X210']

Unnamed: 0,period,direction,ppm,model,manufacturer,brand,product,os_version,codename
5633544,728586,1.0,0.5644,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
5633555,728635,-1.0,0.4181,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
5633565,728636,-1.0,0.7229,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
5633587,728638,-1.0,0.8324,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
7039992,917685,1.0,0.3826,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
7040014,917718,-1.0,1.1050,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
7040025,917740,-1.0,0.9712,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
7040038,917749,-1.0,0.9730,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
7040057,917757,-1.0,0.1743,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow
7040095,917762,-1.0,0.3045,MICROMAX Q4202,MICROMAX,MICROMAX,Q4202,6.0.1,Marshmallow


In [11]:
# PPM statistics per manufacturer, direction == 1 first, then direction == -1.
print(dfManufacturerGBPos)
print(dfManufacturerGBNeg)

     manufacturer  direction  average_ppm       std  count
199      EUROSTAR        1.0     1.630676  0.419291    134
112   CELKON_Q405        1.0     1.612990  0.358663     41
725        VEGA-1        1.0     1.589865  0.461121     75
550  POSITIVO BGH        1.0     1.586252  0.508924     87
283      I-MOBILE        1.0     1.488354  0.444645     65
..            ...        ...          ...       ...    ...
137          CMDC        1.0     0.324062  0.111897    113
607         SHARP        1.0     0.275359  0.122830     99
168       DIGICEL        1.0     0.266593  0.111740     42
299          INEW        1.0     0.243427  0.104957     30
571          R50A        1.0     0.202071  0.101621     55

[147 rows x 5 columns]
            manufacturer  direction  average_ppm       std  count
111          CELKON_Q405       -1.0     1.955858  0.376925     38
724               VEGA-1       -1.0     1.597564  0.523626     66
620           SOFTWINNER       -1.0     1.197871  0.644911     42
402 

In [12]:
# PPM statistics per brand, direction == 1 first, then direction == -1.
print(dfBrandGBPos)
print(dfBrandGBNeg)

           brand  direction  average_ppm       std  count
195       EPAD-7        1.0     1.630676  0.419291    134
115  CELKON_Q405        1.0     1.612990  0.358663     41
722       VEGA-1        1.0     1.589865  0.461121     75
282     I-MOBILE        1.0     1.488354  0.444645     65
751         VSUN        1.0     1.358033  0.616313     48
..           ...        ...          ...       ...    ...
37       ALLCALL        1.0     0.289649  0.180930    185
165      DIGICEL        1.0     0.266593  0.111740     42
296         INEW        1.0     0.243427  0.104957     30
618        SHARP        1.0     0.241747  0.049609     86
589         R50A        1.0     0.186433  0.078812     51

[148 rows x 5 columns]
           brand  direction  average_ppm       std  count
114  CELKON_Q405       -1.0     1.955858  0.376925     38
721       VEGA-1       -1.0     1.597564  0.523626     66
54           AOC       -1.0     1.080791  0.616590     35
203     EVERCOSS       -1.0     1.052295  0.6704

In [13]:
# PPM statistics per product, direction == 1 first, then direction == -1.
print(dfProductGBPos)
print(dfProductGBNeg)

              product  direction  average_ppm       std  count
3718         SI7067SB        1.0     1.811050  0.278333     36
223                A1        1.0     1.632209  0.416943     33
1191           EPAD-7        1.0     1.630676  0.419291    134
4134     V130_WKT_0F1        1.0     1.612990  0.358663     41
229            A1000M        1.0     1.598331  0.488614    145
...               ...        ...          ...       ...    ...
3514             R50A        1.0     0.202071  0.101621     55
1972  ITEL_IT1508PLUS        1.0     0.199174  0.064392     38
3120   P635A32_MX_TEL        1.0     0.180019  0.058720     32
1174   ELEPHONE P8000        1.0     0.168060  0.056947    105
3232      PASSION_ROW        1.0     0.150529  0.129557     58

[672 rows x 5 columns]
                               product  direction  average_ppm       std  \
4133                      V130_WKT_0F1       -1.0     1.955858  0.376925   
4191                            VEGA-1       -1.0     1.597564  0.52

In [14]:
# PPM statistics per OS version, direction == 1 first, then direction == -1.
print(dfOsVersionGBPos)
print(dfOsVersionGBNeg)

   os_version  direction  average_ppm       std  count
47      8.0.0        1.0     1.058752  0.214584     31
7       4.1.2        1.0     0.776249  0.522782    392
28        5.1        1.0     0.773540  0.543793   9849
38        7.0        1.0     0.754613  0.419146  44797
49      8.1.0        1.0     0.688099  0.395134     73
26      5.0.2        1.0     0.667400  0.413050   3744
42      7.1.1        1.0     0.651719  0.375057   9368
17      4.4.2        1.0     0.649118  0.414145   3966
22        5.0        1.0     0.639885  0.363754   3616
44      7.1.2        1.0     0.625783  0.332819   5003
32        6.0        1.0     0.586389  0.367685  15574
24      5.0.1        1.0     0.571178  0.321388   1392
34      6.0.1        1.0     0.563578  0.299976  32311
20      4.4.4        1.0     0.550259  0.265374   1308
13        4.3        1.0     0.547376  0.329217    159
11      4.2.2        1.0     0.536174  0.351204    725
5       4.1.1        1.0     0.531610  0.177109    157
30      5.

In [15]:
# PPM statistics per OS codename, direction == 1 first, then direction == -1.
print(dfCodenameGBPos)
print(dfCodenameGBNeg)

       codename  direction  average_ppm       std  count
15         Oreo        1.0     0.798582  0.389326    104
13       Nougat        1.0     0.726810  0.408657  59285
7      Lollipop        1.0     0.643693  0.432405  27087
5        KitKat        1.0     0.624600  0.385031   5274
3    Jelly Bean        1.0     0.599103  0.403718   1450
9   Marshmallow        1.0     0.570773  0.323689  47945
       codename  direction  average_ppm       std  count
6      Lollipop       -1.0     0.387164  0.450928  30593
4        KitKat       -1.0     0.307259  0.410645   6619
2    Jelly Bean       -1.0     0.290330  0.393962   1905
8   Marshmallow       -1.0     0.257452  0.292909  58873
12       Nougat       -1.0     0.227830  0.216158  34867
14         Oreo       -1.0     0.222369  0.212070     59
