In [18]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

%matplotlib inline

# configure visualizations
# apply seaborn's 'whitegrid' style to the figures drawn by the cells below
sns.set_style('whitegrid')
# NOTE(review): presumably a default (width, height) figure size; it is not
# referenced by any cell visible here -- confirm it is still needed
figsize=(8,6)

In [19]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file and return it as a pandas DataFrame.

    Parameters
    ----------
    path : location of the Parquet file, passed to ``pq.read_table``.
    columns : optional list of column names to read; ``None`` reads all.
    use_threads : forwarded to ``pq.read_table``.

    Returns
    -------
    The loaded DataFrame, or ``None`` when reading or converting fails.
    In the failure case the exception is printed rather than raised, so
    callers must check for ``None``.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas()
    except Exception as err:
        # best-effort load: report the problem and signal failure with None
        print(err)
        return None
    return frame

def downcast_ints(df):
    """Shrink integer columns of ``df`` to the smallest unsigned dtype.

    Columns selected by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')`` and written back into ``df``.
    The frame is modified in place and also returned.
    """
    int_columns = df.select_dtypes(include=['int'])
    shrunk = int_columns.apply(pd.to_numeric, downcast='unsigned')

    # write the converted columns back over the originals
    df[shrunk.columns] = shrunk
    return df
        
def prepare(path,cols):
    """Load selected columns of a Parquet file and normalise the result.

    Parameters
    ----------
    path : location of the Parquet file, forwarded to ``load_df``.
    cols : list of column names to read.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index and integer columns downcast by
    ``downcast_ints``.

    Raises
    ------
    RuntimeError
        If ``load_df`` could not read the file. ``load_df`` prints the
        underlying error and returns ``None``; failing here with the path in
        the message is clearer than the ``AttributeError`` that calling
        ``reset_index`` on ``None`` would otherwise produce.
    """
    df = load_df(path,cols)
    if df is None:
        raise RuntimeError(
            "could not load {!r} (columns={!r}); see the error printed above".format(path, cols)
        )

    df = df.reset_index(drop=True)
    df = downcast_ints(df)

    return df

In [20]:
def _pieSlices(dfVars, dfVals, threshold):
    """Build the slice table and legend labels shared by both pie helpers.

    Parameters
    ----------
    dfVars : sequence of slice names.
    dfVals : sequence of slice values, same length as ``dfVars``.
    threshold : number of largest slices to keep; the remainder is folded
        into a single 'Others' slice. Values <= 0 keep every slice.

    Returns
    -------
    (slices, labels) where ``slices`` is a DataFrame with columns 'var' and
    'value' sorted by descending value, and ``labels`` is a list of
    '<var> - <percent> %' strings in the same order.
    """
    ranked = pd.DataFrame(
        data={'var': dfVars, 'value': dfVals},
    ).sort_values('value', ascending=False)

    if threshold > 0:
        top = ranked.iloc[:threshold]
        # The remainder must come from the *sorted* frame; slicing the raw
        # input would sum the wrong rows whenever the input is not already
        # sorted in descending order.
        rest = ranked.iloc[threshold:]
        if len(rest) > 0:
            others = pd.DataFrame(data={
                'var': ['Others'],
                'value': [rest['value'].sum()],
            })
            slices = pd.concat([top, others], ignore_index=True)
        else:
            # nothing left over: do not add an empty 0 % 'Others' slice
            slices = top.reset_index(drop=True)
    else:
        slices = ranked.reset_index(drop=True)

    percent = 100. * slices['value'] / slices['value'].sum()
    labels = ['{0} - {1:1.2f} %'.format(i, j) for i, j in zip(slices['var'], percent)]
    return slices, labels


def pieOthers(dfVars,dfVals,threshold):
    """Draw a pie chart of the largest slices with a percentage legend.

    The ``threshold`` largest values are shown individually and the rest are
    grouped into an 'Others' slice (``threshold <= 0`` shows everything).
    The chart is rendered with ``plt.show()``; nothing is returned.
    """
    slices, labels = _pieSlices(dfVars, dfVals, threshold)

    colorsX = plt.cm.tab20.colors
    patches, texts = plt.pie(slices['value'], shadow=True, startangle=90, colors=colorsX)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()


def pieOthers2(dfVars,dfVals,threshold):
    """Draw a pie chart whose slices are labelled with connector lines.

    Same data preparation as ``pieOthers``, but instead of a legend each
    wedge gets an annotation placed outside the pie and joined to it by a
    line in the wedge's colour. The chart is rendered with ``plt.show()``;
    nothing is returned.
    """
    slices, labels = _pieSlices(dfVars, dfVals, threshold)

    fig = plt.figure(figsize=(7,5))
    # leave room on the right and top for the outside labels
    gs1 = gridspec.GridSpec(1,1,
        left=0.1,right=0.7,
        bottom=0.1,top=0.7,
    )
    pie_ax = fig.add_subplot(gs1[0])

    colors = plt.cm.tab20.colors
    wedges, texts = pie_ax.pie(
        slices['value'],
        shadow=True,
        colors=colors,
        startangle=90,
    )

    for i, (wedge, label) in enumerate(zip(wedges, labels)):
        # mid-angle of the wedge and the matching point on the unit circle
        ang = (wedge.theta2 - wedge.theta1)/2. + wedge.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        # treat x == 0 as the right-hand side instead of failing a lookup
        side = 1 if x >= 0 else -1
        arrowprops = {
            "arrowstyle": "-",
            "connectionstyle": "angle,angleA=0,angleB={}".format(ang),
            # the pie cycles through the palette, so wrap the index to
            # stay in step with it when there are more wedges than colours
            "color": colors[i % len(colors)],
        }
        pie_ax.annotate(
            label, xy=(x, y), xytext=(1.35*side, 1.4*y),
            horizontalalignment="left" if side > 0 else "right",
            xycoords='data', textcoords='data',
            arrowprops=arrowprops, zorder=0, va="center",
        )
    pie_ax.axis('equal')

    plt.show()

In [21]:
# Columns read from the network-details dataset.
cols1 = [
    'sample_id',
    'network_type',
    'mobile_network_type',
    'mobile_data_status',
    'mobile_data_activity',
    'roaming_enabled',
    'wifi_status',
    'wifi_signal_strength',
    'wifi_link_speed',
    'wifi_ap_status',
    'network_operator',
    'sim_operator',
    'mcc',
    'mnc',
]
dfNetworkDetails = prepare('2-datasets/network_details.parquet', cols1)

# Wider column selection kept for reference:
#cols2 = ['id','device_id', 'timestamp', 'battery_level', 'network_status', 'screen_brightness', 'screen_on', 'timezone', 'country_code', 'period', 'size', 'direction', 'ppm']
# Only the columns used by the analysis below are loaded from the samples dataset.
cols2 = ['id', 'period', 'direction', 'ppm']
dfSamples = prepare('2-datasets/samplesPPM.parquet', cols2)

In [22]:
# Attach the network details to each sample; samples without a matching
# 'sample_id' (and details without a matching 'id') are discarded.
df_inner = dfSamples.merge(
    dfNetworkDetails,
    how='inner',
    left_on='id',
    right_on='sample_id',
)

In [27]:
# Remove the join keys and the columns not used in the cells below,
# then collapse rows that have become identical.
df_inner_droppedDupGeneral = (
    df_inner
    .drop(columns=[
        'id',
        'sample_id',
        'wifi_signal_strength',
        'wifi_link_speed',
        'mcc',
        'mnc',
        'wifi_ap_status',
    ])
    .drop_duplicates()
)

In [28]:
# Preview the first 100 rows of the de-duplicated merge result.
df_inner_droppedDupGeneral.head(100)

Unnamed: 0,period,direction,ppm,network_type,mobile_network_type,mobile_data_status,mobile_data_activity,roaming_enabled,wifi_status,network_operator,sim_operator
0,1,-1.0,0.2026,MOBILE,LTE,CONNECTED,NONE,0,ENABLED,VERIZON,UNKNOWN
11,2,-1.0,0.1869,MOBILE,LTE,CONNECTED,NONE,0,ENABLED,VERIZON,UNKNOWN
15,2,-1.0,0.1869,WIFI,LTE,DISCONNECTED,NONE,0,ENABLED,VERIZON,UNKNOWN
17,2,-1.0,0.1869,MOBILE,LTE,CONNECTED,INOUT,0,ENABLED,VERIZON,UNKNOWN
20,2,-1.0,0.1869,MOBILE,EVDO_A,CONNECTED,NONE,0,ENABLED,VERIZON WIRELESS,UNKNOWN
21,2,-1.0,0.1869,WIFI,LTE,DISCONNECTED,INOUT,0,ENABLED,VERIZON WIRELESS,UNKNOWN
23,2,-1.0,0.1869,WIFI,EVDO_A,DISCONNECTED,NONE,0,ENABLED,VERIZON WIRELESS,UNKNOWN
27,2,-1.0,0.1869,WIFI,0,DISCONNECTED,NONE,0,ENABLED,VERIZON WIRELESS,UNKNOWN
49,2,-1.0,0.1869,WIFI,1XRTT,DISCONNECTED,NONE,0,ENABLED,VERIZON WIRELESS,UNKNOWN
53,2,-1.0,0.1869,WIFI,LTE,DISCONNECTED,NONE,1,ENABLED,ROAMING,UNKNOWN


In [31]:
# Because of functional dependencies between the attributes, duplicates
# have to be dropped separately for each attribute.
def _dropDupFor(var):
    """Keep only period, direction, ppm and ``var``, then drop duplicate rows."""
    wanted = ['period', 'direction', 'ppm', var]
    keep = df_inner_droppedDupGeneral.columns.isin(wanted)
    return df_inner_droppedDupGeneral.loc[:, keep].drop_duplicates()

df_inner_droppedDup_NetworkType = _dropDupFor('network_type')
df_inner_droppedDup_MobileNetworkType = _dropDupFor('mobile_network_type')
df_inner_droppedDup_MobileDataStatus = _dropDupFor('mobile_data_status')
df_inner_droppedDup_MobileDataActivity = _dropDupFor('mobile_data_activity')
df_inner_droppedDup_Roaming = _dropDupFor('roaming_enabled')
df_inner_droppedDup_WifiStatus = _dropDupFor('wifi_status')
df_inner_droppedDup_NetworkOperator = _dropDupFor('network_operator')
df_inner_droppedDup_SimOperator = _dropDupFor('sim_operator')

In [53]:
def networkGroupBy(dataframe, var, direction, minCount):
    """Summarise 'ppm' per value of ``var`` for one direction.

    Parameters
    ----------
    dataframe : frame with at least the columns ``var``, 'direction' and 'ppm'.
    var : name of the column to group by (together with 'direction').
    direction : value of 'direction' to keep in the result.
    minCount : groups with fewer than this many 'ppm' observations are dropped.

    Returns
    -------
    DataFrame with columns ``var``, 'direction', 'average_ppm', 'std' and
    'count', ordered by descending 'average_ppm'.
    """
    stats = (
        dataframe
        .groupby([var, 'direction'])['ppm']
        .agg(average_ppm='mean', std='std', count='count')
        .reset_index()
        .sort_values(['average_ppm'], ascending=False)
    )
    matches_direction = stats['direction'] == direction
    has_enough_rows = stats['count'] >= minCount
    return stats.loc[matches_direction & has_enough_rows]

# For every attribute build two summaries: direction 1 ("Pos") and
# direction -1 ("Neg"). The last argument is the minimum group size.
df_inner_droppedDup_NetworkTypePos, df_inner_droppedDup_NetworkTypeNeg = (
    networkGroupBy(df_inner_droppedDup_NetworkType, 'network_type', sign, 0)
    for sign in (1, -1)
)

df_inner_droppedDup_MobileNetworkTypeGBPos, df_inner_droppedDup_MobileNetworkTypeGBNeg = (
    networkGroupBy(df_inner_droppedDup_MobileNetworkType, 'mobile_network_type', sign, 0)
    for sign in (1, -1)
)

df_inner_droppedDup_MobileDataStatusGBPos, df_inner_droppedDup_MobileDataStatusGBNeg = (
    networkGroupBy(df_inner_droppedDup_MobileDataStatus, 'mobile_data_status', sign, 0)
    for sign in (1, -1)
)

df_inner_droppedDup_MobileDataActivityPos, df_inner_droppedDup_MobileDataActivityNeg = (
    networkGroupBy(df_inner_droppedDup_MobileDataActivity, 'mobile_data_activity', sign, 0)
    for sign in (1, -1)
)

df_inner_droppedDup_RoamingPos, df_inner_droppedDup_RoamingNeg = (
    networkGroupBy(df_inner_droppedDup_Roaming, 'roaming_enabled', sign, 0)
    for sign in (1, -1)
)

df_inner_droppedDup_WifiStatusPos, df_inner_droppedDup_WifiStatusNeg = (
    networkGroupBy(df_inner_droppedDup_WifiStatus, 'wifi_status', sign, 0)
    for sign in (1, -1)
)

# Operator groups with fewer than 30 observations are left out.
df_inner_droppedDup_NetworkOperatorPos, df_inner_droppedDup_NetworkOperatorNeg = (
    networkGroupBy(df_inner_droppedDup_NetworkOperator, 'network_operator', sign, 30)
    for sign in (1, -1)
)

df_inner_droppedDup_SimOperatorPos, df_inner_droppedDup_SimOperatorNeg = (
    networkGroupBy(df_inner_droppedDup_SimOperator, 'sim_operator', sign, 30)
    for sign in (1, -1)
)

In [54]:
# Network type: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_NetworkTypePos)
print(df_inner_droppedDup_NetworkTypeNeg)

          network_type  direction  average_ppm       std  count
1  BLUETOOTH TETHERING        1.0     1.240044  0.938938     32
7                 WIFI        1.0     0.677904  0.823091  85120
5              UNKNOWN        1.0     0.648864  0.636306  45951
3               MOBILE        1.0     0.646241  0.479413  32678
          network_type  direction  average_ppm       std  count
0  BLUETOOTH TETHERING       -1.0     0.574522  0.748161     59
6                 WIFI       -1.0     0.295967  0.878222  96794
2               MOBILE       -1.0     0.268557  0.434953  64155
4              UNKNOWN       -1.0     0.260419  0.804993  61861


In [55]:
# Mobile network type: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_MobileNetworkTypeGBPos)
print(df_inner_droppedDup_MobileNetworkTypeGBNeg)

   mobile_network_type  direction  average_ppm       std  count
29                HSPA        1.0     0.723018  1.457040  18007
37                UTMS        1.0     0.699891  0.560787  23976
23              EVDO_A        1.0     0.688177  0.327161    589
31               HSPAP        1.0     0.674995  0.478367  12246
35                 LTE        1.0     0.665830  0.485619  70858
7                   18        1.0     0.648186  0.437347   1294
19                EDGE        1.0     0.644883  0.443122   9609
3                   16        1.0     0.637119  0.433520     21
33               HSUPA        1.0     0.634723  0.356509   3566
1                    0        1.0     0.633798  0.753579  27598
25                GPRS        1.0     0.623252  0.495276   1330
27               HSDPA        1.0     0.595565  0.303057   1693
5                   17        1.0     0.583400       NaN      1
11               1XRTT        1.0     0.519321  0.279454    258
21               EHRPD        1.0     0.

In [56]:
# Mobile data status: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_MobileDataStatusGBPos)
print(df_inner_droppedDup_MobileDataStatusGBNeg)

  mobile_data_status  direction  average_ppm       std   count
5       DISCONNECTED        1.0     0.677518  0.783662  118791
3         CONNECTING        1.0     0.658646  0.358063    1031
1          CONNECTED        1.0     0.646708  0.477230   33778
7          SUSPENDED        1.0     0.588849  0.335253     372
  mobile_data_status  direction  average_ppm       std   count
4       DISCONNECTED       -1.0     0.309005  0.967163  117004
2         CONNECTING       -1.0     0.298090  0.616724    3405
0          CONNECTED       -1.0     0.269786  0.440945   64920
6          SUSPENDED       -1.0     0.194103  0.180462    1990


In [57]:
# Mobile data activity: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_MobileDataActivityPos)
print(df_inner_droppedDup_MobileDataActivityNeg)

  mobile_data_activity  direction  average_ppm       std   count
1                   IN        1.0     0.719080  0.505850    2539
3                INOUT        1.0     0.685273  0.511212   32982
7                  OUT        1.0     0.663954  0.517112   12099
5                 NONE        1.0     0.663089  0.766945  122999
  mobile_data_activity  direction  average_ppm       std   count
4                 NONE       -1.0     0.318366  0.943617  126255
6                  OUT       -1.0     0.313367  0.452473   31198
0                   IN       -1.0     0.296257  0.481651    8525
2                INOUT       -1.0     0.278199  0.476609   60675


In [58]:
# Roaming flag: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_RoamingPos)
print(df_inner_droppedDup_RoamingNeg)

   roaming_enabled  direction  average_ppm       std   count
1                0        1.0     0.676568  0.751588  139375
3                1        1.0     0.665274  0.412796    3044
   roaming_enabled  direction  average_ppm       std   count
0                0       -1.0     0.324615  0.941128  131870
2                1       -1.0     0.257286  0.440861    3526


In [59]:
# Wi-Fi status: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_WifiStatusPos)
print(df_inner_droppedDup_WifiStatusNeg)

  wifi_status  direction  average_ppm       std  count
9     UNKNOWN        1.0     1.077510  0.651271     48
7    ENABLING        1.0     1.067696  1.381098     81
3   DISABLING        1.0     1.027747  1.418947     60
5     ENABLED        1.0     0.676315  0.787501  99865
1    DISABLED        1.0     0.657686  0.626574  51292
  wifi_status  direction  average_ppm       std   count
8     UNKNOWN       -1.0     0.681308  0.694797      50
4     ENABLED       -1.0     0.311048  0.926691  108818
0    DISABLED       -1.0     0.279110  0.681507   58790
6    ENABLING       -1.0     0.277402  0.415649     243
2   DISABLING       -1.0     0.272333  0.434689     202


In [60]:
# Network operator: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_NetworkOperatorPos)
print(df_inner_droppedDup_NetworkOperatorNeg)

              network_operator  direction  average_ppm       std  count
1583                       MTN        1.0     2.009577  2.600614     35
1346                  KYIVSTAR        1.0     1.659248  1.453959     54
3451     VODA P;SEM REDE MÓVEL        1.0     1.505865  0.561107     96
295             ALTICE MEO;NOS        1.0     1.442007  0.432801     58
1429           MENCARI LAYANAN        1.0     1.421008  0.871016     80
...                        ...        ...          ...       ...    ...
557                    C SPIRE        1.0     0.271514  0.112528     36
2422  SEM SERVIÇO;TIM 91 | TIM        1.0     0.264467  0.102661     36
1262                JAZZ;UFONE        1.0     0.263318  0.119593     38
2909                   TH-DTAC        1.0     0.235913  0.062202     31
759         DTAC-T;ไม่มีบริการ        1.0     0.209657  0.131350     42

[374 rows x 5 columns]
                             network_operator  direction  average_ppm  \
1142            IDEA;NO SERVICE — VODAF

In [61]:
# SIM operator: direction 1 summary, then direction -1 summary.
print(df_inner_droppedDup_SimOperatorPos)
print(df_inner_droppedDup_SimOperatorNeg)

      sim_operator  direction  average_ppm       std  count
243  GLOBE TELECOM        1.0     1.236824  0.564728    128
320       KYIVSTAR        1.0     1.174005  1.155434    112
658     TIM BRASIL        1.0     1.120007  0.487220    445
386       MOLDCELL        1.0     1.103919  0.414031     54
430             MY        1.0     1.102544  1.900289     99
..             ...        ...          ...       ...    ...
580       T-MOBILE        1.0     0.332956  0.197624    257
536      SAFARICOM        1.0     0.323094  0.172040     72
699      UNITEL T+        1.0     0.312739  0.145084     67
117        C SPIRE        1.0     0.271514  0.112528     36
684       U MOBILE        1.0     0.253267  0.093137     85

[183 rows x 5 columns]
         sim_operator  direction  average_ppm       std  count
345           LUMITEL       -1.0     2.638927  2.280685     58
262              IDEA       -1.0     1.370618  5.011832    788
242     GLOBE TELECOM       -1.0     1.088558  0.747383    119
508 