In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
sns.set_style('whitegrid')
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    try:
        return pq.read_table(path, columns=columns, use_threads=use_threads).to_pandas(strings_to_categorical=True)
    except Exception as e:
        print(e)

In [3]:
df = load_df('1-parquet-files/samplesPeriods.parquet')
df = downcast(df)
#df = df.drop(['network_status', 'screen_on', 'boundary'], axis=1)

In [4]:
df.head(100)

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,change,boundary,period,change_acc,time_diff,time_acc
0,447027,1,2017-10-15 18:36:46,99,LTE,-1,1,AMERICA/CHICAGO,us,0.0,0,1,0.0,0.0,0.0
1,447015,1,2017-10-15 18:41:54,98,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-1.0,308.0,308.0
2,447012,1,2017-10-15 18:46:54,97,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-2.0,300.0,608.0
3,447011,1,2017-10-15 18:50:35,96,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-3.0,221.0,829.0
4,446225,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-4.0,219.0,1048.0
5,447009,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,0.0,1,1,-4.0,0.0,1048.0
6,446218,1,2017-10-15 18:57:54,94,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-5.0,220.0,1268.0
7,446217,1,2017-10-15 19:02:47,93,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-6.0,293.0,1561.0
8,443535,1,2017-10-15 19:11:41,91,LTE,-1,1,AMERICA/CHICAGO,us,-2.0,1,1,-8.0,534.0,2095.0
9,443533,1,2017-10-15 19:21:25,89,LTE,-1,1,AMERICA/CHICAGO,us,-2.0,1,1,-10.0,584.0,2679.0


In [5]:
df['size'] = df.groupby('period')['period'].transform('size')

In [6]:
df['max_change'] = df['change_acc'].abs().groupby(df['period']).transform('max')

In [7]:
df['max_time'] = df['time_acc'].groupby(df['period']).transform('max')

In [8]:
df['ppm'] = df['max_change'].div(df['max_time'].div(60)).round(4)

In [9]:
#dftest = df.loc[(df['period'] == 111)]

In [10]:
def getPeriodDirection(period):
    dfTemp = df.loc[(df['period'] == period) & (df['change'] != 0.0) , 'change']
    val = dfTemp.median()
    return val

#if change is 0.0 (repeated measurements with 0 battery change) look at the previous change to have a direction
df['change'] = df['change'].mask(cond = df['change'] == 0.0 , other = df.groupby('period').change.transform('mean'))



In [11]:
#reset change when device stays the same, but a new period is made
changeBoundary = df['boundary'].isin(range(0,1))
df.loc[changeBoundary, 'change'] = 0.0




In [12]:
df['direction'] = df['change'].apply(lambda x: 1 if x > 0.0 else -1 if x < 0.0 else 999)


In [13]:
#need to also fix two cases when change = 0.0
#means either it is the first in its period (Should assume the direction of the next sample)
#or battery just didn't change (assume direction of previous)

#first in the period has time_acc = 0.0, so direction looks forward
#if the battery just didn't change, then time_acc > 0.0, so look back

directionRangeAll = df['direction'].isin(range(999,1000))
directionRangeAll.head(100)
df.loc[directionRangeAll, 'direction'] = df['direction'].shift()

changeBoundary = df['boundary'].isin(range(0,1))
df.loc[changeBoundary, 'direction'] = df['direction'].shift(-1)

In [14]:
dfx = df.drop(["boundary", "change", "change_acc", "time_diff", "time_acc", "max_change", "max_time"], axis=1)
#dfx = df.drop(["time_diff", "max_change", "max_time"], axis=1)
#dfx = df


In [15]:
#obtain list of periods with more than 10 and less than 100
dfx = dfx[(dfx['size'] >= 10) & (dfx['size'] <= 100)]

In [17]:
dfz = dfx.groupby(['period'])['period'].count().reset_index(name='count')
print("Min: "+str(dfz['count'].min()))
print("Mean: "+str(dfz['count'].mean()))
print("Max: "+str(dfz['count'].max()))

Min: 10
Mean: 32.399579287322865
Max: 100


In [49]:
#Calculate outliers in our PPM values, while removing 0 ppm values

def detect_outlier(df):
    outliers=[]
    threshold=3
    mean_1 = np.mean(df)
    std_1 =np.std(df)
    
    
    for y in df:
        z_score= (y - mean_1)/std_1 
        if np.abs(z_score) > threshold:
            outliers.append(y)
    return outliers

dfx_droppedDup = dfx.drop(dfx.columns.difference(['period', 'direction', 'ppm']), axis=1).drop_duplicates()
dfx_droppedDup_Pos = dfx_droppedDup.loc[(dfx_droppedDup['direction'] == 1) & (dfx_droppedDup['ppm'] > 0.0)].drop(['period', 'direction'], axis=1)
dfx_droppedDup_Neg = dfx_droppedDup.loc[(dfx_droppedDup['direction'] == -1) & (dfx_droppedDup['ppm'] > 0.0)].drop(['period', 'direction'], axis=1)

dfx_pos_list = list(dfx_droppedDup_Pos['ppm'].dropna())
dfx_neg_list = list(dfx_droppedDup_Neg['ppm'].dropna())

#Detect the outlier's lowest ppm value
upperbound_outliers_pos = min(detect_outlier(dfx_pos_list))
upperbound_outliers_neg = min(detect_outlier(dfx_neg_list))

#dfx_pos = dfx.loc[(dfx['ppm'] < upperbound_outliers_pos) & (dfx['ppm'] > 0.0) & (dfx['direction'] == 1)]
#dfx_neg = dfx.loc[(dfx['ppm'] < upperbound_outliers_neg) & (dfx['ppm'] > 0.0) & (dfx['direction'] == -1)]
dfx_no_outliers = dfx.loc[(dfx['ppm'] > 0.0) & 
                          (((dfx['ppm'] < upperbound_outliers_pos) & (dfx['direction'] == 1)) |
                          ((dfx['ppm'] < upperbound_outliers_neg) & (dfx['direction'] == -1)))
                         ]


8948699

8935543
4971587
3977112

4965574
3969969

4950984
3954152

8905136


In [50]:
dfx_no_outliers.to_parquet('2-datasets/samplesPPM.parquet', compression='none') 

In [18]:
dfx.loc[dfx['period'] == 65]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,period,size,ppm,direction
1713,877231,1,2017-10-23 14:05:10,12,WIFI,-1,1,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1714,877232,1,2017-10-23 14:06:40,13,WIFI,-1,1,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1715,877233,1,2017-10-23 14:08:04,14,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1716,877234,1,2017-10-23 14:08:49,15,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1717,877235,1,2017-10-23 14:09:19,16,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1718,877236,1,2017-10-23 14:10:38,17,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1719,877237,1,2017-10-23 14:11:58,18,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1720,877241,1,2017-10-23 14:13:18,19,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1721,877245,1,2017-10-23 14:14:32,20,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0
1722,877265,1,2017-10-23 14:15:57,21,WIFI,-1,0,AMERICA/CHICAGO,us,65,48,0.8263,1.0


In [20]:
dfx.loc[dfx['period'] == 565256]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,period,size,ppm,direction
5798747,5472681,5572,2018-07-01 13:11:51,16,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798748,5472680,5572,2018-07-01 13:12:00,17,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798749,5472678,5572,2018-07-01 13:12:11,18,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798750,5472676,5572,2018-07-01 13:12:20,19,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798751,5472675,5572,2018-07-01 13:12:31,20,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798752,5472673,5572,2018-07-01 13:12:40,21,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798753,5472753,5572,2018-07-01 13:12:51,22,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798754,5472752,5572,2018-07-01 13:13:00,23,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798755,5472647,5572,2018-07-01 13:13:11,24,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798756,5472646,5572,2018-07-01 13:13:20,25,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0


In [22]:
#Using arshren's IQR outlier identification notebook (https://github.com/arshren/MachineLearning/blob/master/Identifying%20outliers.ipynb)

def outliers_modified_z_score(ys):
    outliers=[]
    threshold = 3.5

    median_y = np.median(ys)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ys])
    for y in ys:
        modified_z_scores = 0.6745 * (y - median_y) / median_absolute_deviation_y
        if np.abs(modified_z_scores) > threshold:
            outliers.append(y)
    return outliers

In [25]:
outlier_datapoints_pos = detect_outlier(dfx_pos_list)
outlier_datapoints_neg = detect_outlier(dfx_neg_list)
outlier_datapoints_pos_modified = outliers_modified_z_score(dfx_pos_list)
outlier_datapoints_neg_modified = outliers_modified_z_score(dfx_neg_list)
print(len(dfx_pos_list))
print(len(outlier_datapoints_pos))
print(len(outlier_datapoints_pos_modified))

print()
print(len(dfx_neg_list))
print(len(outlier_datapoints_neg))
print(len(outlier_datapoints_neg_modified))
print()
print("Min pos zscore: "+str(min(outlier_datapoints_pos)))
print("Min pos mod zscore: "+str(min(outlier_datapoints_pos_modified)))

print("Min neg zscore: "+str(min(outlier_datapoints_neg)))
print("Min neg mod zscore: "+str(min(outlier_datapoints_neg_modified)))

print(outlier_datapoints_neg_modified)

141700
533
4908

133665
729
9423

Min pos zscore: 2.921099901199341
Min pos mod zscore: 1.6492999792099
Min neg zscore: 3.1342999935150146
Min neg mod zscore: 0.7677000164985657
[0.9083999991416931, 0.8784999847412109, 0.8087999820709229, 0.8148000240325928, 0.8260999917984009, 0.8208000063896179, 1.2269999980926514, 0.8059999942779541, 1.0332000255584717, 0.8075000047683716, 0.8633000254631042, 0.8129000067710876, 0.9625999927520752, 0.8593999743461609, 0.9402999877929688, 1.079699993133545, 0.8772000074386597, 0.9297000169754028, 3.085700035095215, 1.0095000267028809, 0.9886999726295471, 1.222599983215332, 0.7882000207901001, 0.9836000204086304, 0.9002000093460083, 0.9014000296592712, 0.8118000030517578, 0.7955999970436096, 1.080399990081787, 0.9345999956130981, 0.8219000101089478, 0.8199999928474426, 0.7875000238418579, 0.9266999959945679, 0.9660000205039978, 1.0714000463485718, 0.9650999903678894, 0.9099000096321106, 0.784500002861023, 0.7944999933242798, 1.4907000064849854, 0.8583

In [170]:
dfxSortedPos = sorted(dfx_pos_list)
dfxSortedNeg = sorted(dfx_neg_list)


In [221]:
q1Pos, q3Pos = np.percentile(dfxSortedPos,[25,75],interpolation='midpoint')
iqrPos = q3Pos - q1Pos
lower_boundPos = q1Pos - (1.5 * iqrPos)
upper_boundPos = q3Pos + (1.5 * iqrPos)

In [222]:
q1Neg, q3Neg = np.percentile(dfxSortedNeg,[25,75],interpolation='midpoint')
iqrNeg = q3Neg - q1Neg
lower_boundNeg = q1Neg - (1.5 * iqrNeg)
upper_boundNeg = q3Neg + (1.5 * iqrNeg)

In [228]:
print("Upper Pos:"+str(upper_boundPos))
print("Upper Neg"+str(upper_boundNeg))

Upper Pos:1.4941000491380692
Upper Neg0.7443000003695488


In [223]:
beforePos = len(dfx_droppedDup_Pos['ppm'])
dfxSortedPosAfter = dfx_droppedDup_Pos.loc[dfx_droppedDup_Pos['ppm'] <= upper_boundPos]
afterPos = len(dfxSortedPosAfter)
print("iqr Before:" + str(beforePos))
print("iqr After:" + str(afterPos))
print("iqr Diff:" + str(beforePos-afterPos))

Before:141700
After:134431
Diff:7269


In [226]:
beforeNeg = len(dfx_droppedDup_Neg['ppm'])
dfxSortedNegAfter = dfx_droppedDup_Neg.loc[dfx_droppedDup_Neg['ppm'] <= upper_boundNeg]
afterNeg = len(dfxSortedNegAfter)
print("iqr Before:" + str(len(dfx_droppedDup_Neg['ppm'])))
print("iqr After:" + str(len(dfxSortedNegAfter)))
print("iqr Diff:" + str(beforeNeg-afterNeg))

Before:133665
After:123740
Diff:9925
