In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
sns.set_style('whitegrid')
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    try:
        return pq.read_table(path, columns=columns, use_threads=use_threads).to_pandas(strings_to_categorical=True)
    except Exception as e:
        print(e)
        
def typecast_objects(gl_obj):
    gl_obj = gl_obj.apply(lambda x: x.str.strip())
    gl_obj = gl_obj.apply(lambda x: x.str.lower())
    
    converted_obj = pd.DataFrame()
    
    for col in gl_obj.columns:
        num_unique_values = len(gl_obj[col].unique())
        num_total_values = len(gl_obj[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:, col] = gl_obj[col].astype('category')
        else:
            converted_obj.loc[:, col] = gl_obj[col]
    
    return converted_obj


def downcast(df):
    df_int = df.select_dtypes(include=['int'])
    converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')
    
    df_float = df.select_dtypes(include=['float'])
    converted_float = df_float.apply(pd.to_numeric, downcast='float')

    df_obj = df.select_dtypes(include=['object'])
    converted_obj = typecast_objects(df_obj)

    df[converted_int.columns] = converted_int
    df[converted_float.columns] = converted_float
    df[converted_obj.columns] = converted_obj
    
    return df

In [3]:
df = load_df('1-parquet-files/samplesPeriods.parquet')
df = downcast(df)
#df = df.drop(['network_status', 'screen_on', 'boundary'], axis=1)

In [4]:
df.head(100)

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,change,boundary,period,change_acc,time_diff,time_acc
0,447027,1,2017-10-15 18:36:46,99,LTE,-1,1,AMERICA/CHICAGO,us,0.0,0,1,0.0,0.0,0.0
1,447015,1,2017-10-15 18:41:54,98,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-1.0,308.0,308.0
2,447012,1,2017-10-15 18:46:54,97,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-2.0,300.0,608.0
3,447011,1,2017-10-15 18:50:35,96,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-3.0,221.0,829.0
4,446225,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-4.0,219.0,1048.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,460087,1,2017-10-16 01:00:44,63,WIFI,-1,0,AMERICA/CHICAGO,us,1.0,1,4,28.0,89.0,2119.0
96,460088,1,2017-10-16 01:01:59,64,WIFI,-1,0,AMERICA/CHICAGO,us,1.0,1,4,29.0,75.0,2194.0
97,460089,1,2017-10-16 01:03:19,65,WIFI,-1,0,AMERICA/CHICAGO,us,1.0,1,4,30.0,80.0,2274.0
98,460090,1,2017-10-16 01:04:48,66,WIFI,-1,0,AMERICA/CHICAGO,us,1.0,1,4,31.0,89.0,2363.0


In [5]:
df['size'] = df.groupby('period')['period'].transform('size')

In [6]:
df['max_change'] = df['change_acc'].abs().groupby(df['period']).transform('max')

In [7]:
df['max_time'] = df['time_acc'].groupby(df['period']).transform('max')

In [8]:
df['ppm'] = df['max_change'].div(df['max_time'].div(60)).round(4)

In [9]:
def getPeriodDirection(period):
    dfTemp = df.loc[(df['period'] == period) & (df['change'] != 0.0) , 'change']
    val = dfTemp.median()
    return val

#if change is 0.0 (repeated measurements with 0 battery change) look at the previous change to have a direction
df['change'] = df['change'].mask(cond = df['change'] == 0.0 , other = df.groupby('period').change.transform('mean'))



In [10]:
#reset change when device stays the same, but a new period is made
changeBoundary = df['boundary'].isin(range(0,1))
df.loc[changeBoundary, 'change'] = 0.0

In [11]:
df['direction'] = df['change'].apply(lambda x: 1 if x > 0.0 else -1 if x < 0.0 else 999)


In [12]:
#need to also fix two cases when change = 0.0
#means either it is the first in its period (Should assume the direction of the next sample)
#or battery just didn't change (assume direction of previous)

#first in the period has time_acc = 0.0, so direction looks forward
#if the battery just didn't change, then time_acc > 0.0, so look back

directionRangeAll = df['direction'].isin(range(999,1000))
directionRangeAll.head(100)
df.loc[directionRangeAll, 'direction'] = df['direction'].shift()

changeBoundary = df['boundary'].isin(range(0,1))
df.loc[changeBoundary, 'direction'] = df['direction'].shift(-1)

In [13]:
dfx = df.drop(["boundary", "change", "change_acc", "time_diff", "time_acc", "max_change", "max_time"], axis=1)
#dfx = df.drop(["time_diff", "max_change", "max_time"], axis=1)
#dfx = df


In [14]:
#obtain list of periods with more than 10 and less than 100
dfx = dfx[(dfx['size'] >= 10) & (dfx['size'] <= 100)]

In [15]:
dfz = dfx.groupby(['period'])['period'].count().reset_index(name='count')
print("Min: "+str(dfz['count'].min()))
print("Mean: "+str(dfz['count'].mean()))
print("Max: "+str(dfz['count'].max()))

Min: 10
Mean: 31.902244532090354
Max: 100


In [16]:
#Calculate outliers in our PPM values, while removing 0 ppm values

def detect_outlier(df):
    outliers=[]
    threshold=3
    mean_1 = np.mean(df)
    std_1 =np.std(df)
    
    
    for y in df:
        z_score= (y - mean_1)/std_1 
        if np.abs(z_score) > threshold:
            outliers.append(y)
    return outliers

dfx_droppedDup = dfx.drop(dfx.columns.difference(['period', 'direction', 'ppm']), axis=1).drop_duplicates()
dfx_droppedDup_Pos = dfx_droppedDup.loc[(dfx_droppedDup['direction'] == 1) & (dfx_droppedDup['ppm'] > 0.0)].drop(['period', 'direction'], axis=1)
dfx_droppedDup_Neg = dfx_droppedDup.loc[(dfx_droppedDup['direction'] == -1) & (dfx_droppedDup['ppm'] > 0.0)].drop(['period', 'direction'], axis=1)

dfx_pos_list = list(dfx_droppedDup_Pos['ppm'].dropna())
dfx_neg_list = list(dfx_droppedDup_Neg['ppm'].dropna())

#Detect the outlier's lowest ppm value
upperbound_outliers_pos = min(detect_outlier(dfx_pos_list))
upperbound_outliers_neg = min(detect_outlier(dfx_neg_list))

dfx_no_outliers = dfx.loc[(dfx['ppm'] > 0.0) & 
                          (((dfx['ppm'] < upperbound_outliers_pos) & (dfx['direction'] == 1)) |
                          ((dfx['ppm'] < upperbound_outliers_neg) & (dfx['direction'] == -1)))
                         ]


In [17]:
dfx_no_outliers.to_parquet('2-datasets/samplesPPM.parquet', compression='none') 

In [18]:
dfx.loc[dfx['period'] == 65]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,period,size,ppm,direction
1620,858687,1,2017-10-23 03:35:36,99,WIFI,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1621,858688,1,2017-10-23 03:42:03,98,WIFI,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1622,858689,1,2017-10-23 03:49:15,97,WIFI,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1623,858690,1,2017-10-23 03:56:42,96,WIFI,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1624,858691,1,2017-10-23 04:04:32,95,WIFI,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1708,877225,1,2017-10-23 13:42:36,15,WIFI,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1709,877227,1,2017-10-23 13:43:21,14,DISCONNECTED,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1710,877228,1,2017-10-23 13:47:18,13,DISCONNECTED,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0
1711,877229,1,2017-10-23 13:54:44,12,DISCONNECTED,-1,1,AMERICA/CHICAGO,us,65,93,0.1409,-1.0


In [19]:
dfx.loc[dfx['period'] == 565256]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,period,size,ppm,direction


In [20]:
#Using arshren's IQR outlier identification notebook (https://github.com/arshren/MachineLearning/blob/master/Identifying%20outliers.ipynb)

def outliers_modified_z_score(ys):
    outliers=[]
    threshold = 3.5

    median_y = np.median(ys)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ys])
    for y in ys:
        modified_z_scores = 0.6745 * (y - median_y) / median_absolute_deviation_y
        if np.abs(modified_z_scores) > threshold:
            outliers.append(y)
    return outliers

In [21]:
outlier_datapoints_pos = detect_outlier(dfx_pos_list)
outlier_datapoints_neg = detect_outlier(dfx_neg_list)
outlier_datapoints_pos_modified = outliers_modified_z_score(dfx_pos_list)
outlier_datapoints_neg_modified = outliers_modified_z_score(dfx_neg_list)
print(len(dfx_pos_list))
print(len(outlier_datapoints_pos))
print(len(outlier_datapoints_pos_modified))

print()
print(len(dfx_neg_list))
print(len(outlier_datapoints_neg))
print(len(outlier_datapoints_neg_modified))
print()
print("Min pos zscore: "+str(min(outlier_datapoints_pos)))
print("Min pos mod zscore: "+str(min(outlier_datapoints_pos_modified)))

print("Min neg zscore: "+str(min(outlier_datapoints_neg)))
print("Min neg mod zscore: "+str(min(outlier_datapoints_neg_modified)))

print(outlier_datapoints_neg_modified)

275544
1251
10954

280097
2004
18666

Min pos zscore: 2.872299909591675
Min pos mod zscore: 1.6008000373840332
Min neg zscore: 3.7795000076293945
Min neg mod zscore: 0.9671000242233276
[1.2269999980926514, 1.0332000255584717, 1.079699993133545, 3.085700035095215, 1.0095000267028809, 0.9886999726295471, 1.222599983215332, 0.9836000204086304, 1.080399990081787, 1.0714000463485718, 1.4907000064849854, 1.0, 1.8181999921798706, 1.2958999872207642, 1.1486999988555908, 0.9987000226974487, 5.625, 5.669300079345703, 5.454500198364258, 2.074899911880493, 5.454500198364258, 5.950399875640869, 1.2475999593734741, 5.862100124359131, 3.8592000007629395, 2.2221999168395996, 4.744200229644775, 1.9129999876022339, 2.806999921798706, 9.397600173950195, 0.984499990940094, 5.294099807739258, 3.808000087738037, 1.3914999961853027, 1.406999945640564, 4.800000190734863, 4.756100177764893, 3.546299934387207, 1.3666000366210938, 1.0526000261306763, 2.3076999187469482, 5.927700042724609, 1.2345999479293823, 1.8

In [22]:
dfxSortedPos = sorted(dfx_pos_list)
dfxSortedNeg = sorted(dfx_neg_list)


In [23]:
q1Pos, q3Pos = np.percentile(dfxSortedPos,[25,75],interpolation='midpoint')
iqrPos = q3Pos - q1Pos
lower_boundPos = q1Pos - (1.5 * iqrPos)
upper_boundPos = q3Pos + (1.5 * iqrPos)

In [24]:
q1Neg, q3Neg = np.percentile(dfxSortedNeg,[25,75],interpolation='midpoint')
iqrNeg = q3Neg - q1Neg
lower_boundNeg = q1Neg - (1.5 * iqrNeg)
upper_boundNeg = q3Neg + (1.5 * iqrNeg)

In [25]:
print("Upper Pos:"+str(upper_boundPos))
print("Upper Neg"+str(upper_boundNeg))

Upper Pos:1.4433499872684479
Upper Neg0.9266000352799892


In [26]:
beforePos = len(dfx_droppedDup_Pos['ppm'])
dfxSortedPosAfter = dfx_droppedDup_Pos.loc[dfx_droppedDup_Pos['ppm'] <= upper_boundPos]
afterPos = len(dfxSortedPosAfter)
print("iqr Before:" + str(beforePos))
print("iqr After:" + str(afterPos))
print("iqr Diff:" + str(beforePos-afterPos))

iqr Before:275544
iqr After:260476
iqr Diff:15068


In [27]:
beforeNeg = len(dfx_droppedDup_Neg['ppm'])
dfxSortedNegAfter = dfx_droppedDup_Neg.loc[dfx_droppedDup_Neg['ppm'] <= upper_boundNeg]
afterNeg = len(dfxSortedNegAfter)
print("iqr Before:" + str(len(dfx_droppedDup_Neg['ppm'])))
print("iqr After:" + str(len(dfxSortedNegAfter)))
print("iqr Diff:" + str(beforeNeg-afterNeg))

iqr Before:280097
iqr After:260095
iqr Diff:20002
