In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Plot defaults used by the rest of the notebook.
sns.set_style("whitegrid")
figsize = (8, 6)

In [3]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file into a pandas DataFrame.

    Parameters
    ----------
    path : location of the parquet file, passed straight to ``pq.read_table``.
    columns : optional list of column names to read; ``None`` reads all.
    use_threads : forwarded to ``pq.read_table``.

    Returns
    -------
    The table converted with ``to_pandas(strings_to_categorical=True)``, or
    ``None`` when reading or converting raised: the exception is printed,
    not propagated, so callers must be prepared for ``None``.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas(strings_to_categorical=True)
    except Exception as e:
        # Best-effort load: report the problem and hand back nothing.
        print(e)
        return None
    return frame

In [4]:
# Load the per-period samples and shrink the dtypes in one step.
df = downcast(load_df('parquet_files/samplesPeriods.parquet'))
#df = df.drop(['network_status', 'screen_on', 'boundary'], axis=1)

In [5]:
# Preview the first 100 rows.
df.iloc[:100]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,change,boundary,period,change_acc,time_diff,time_acc
0,447027,1,2017-10-15 18:36:46,99,LTE,-1,1,AMERICA/CHICAGO,us,0.0,0,1,0.0,0.0,0.0
1,447015,1,2017-10-15 18:41:54,98,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-1.0,308.0,308.0
2,447012,1,2017-10-15 18:46:54,97,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-2.0,300.0,608.0
3,447011,1,2017-10-15 18:50:35,96,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-3.0,221.0,829.0
4,446225,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-4.0,219.0,1048.0
5,447009,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,0.0,1,1,-4.0,0.0,1048.0
6,446218,1,2017-10-15 18:57:54,94,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-5.0,220.0,1268.0
7,446217,1,2017-10-15 19:02:47,93,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-6.0,293.0,1561.0
8,443535,1,2017-10-15 19:11:41,91,LTE,-1,1,AMERICA/CHICAGO,us,-2.0,1,1,-8.0,534.0,2095.0
9,443533,1,2017-10-15 19:21:25,89,LTE,-1,1,AMERICA/CHICAGO,us,-2.0,1,1,-10.0,584.0,2679.0


In [6]:
# Number of samples in each period, repeated on every row of that period.
df['size'] = df['period'].groupby(df['period']).transform('size')

In [7]:
# Largest absolute accumulated change seen in each period, repeated on every
# row of that period.
df['max_change'] = (
    df['change_acc']
    .abs()
    .groupby(df['period'])
    .transform('max')
)

In [8]:
# Largest accumulated time seen in each period, repeated on every row of that
# period.
df['max_time'] = df.groupby('period')['time_acc'].transform('max')

In [9]:
# Rate of change per period: max_change divided by max_time expressed in
# units of 60 (presumably seconds -> minutes; confirm against the upstream
# time_acc definition), rounded to 4 decimals.
# NOTE(review): a period whose max_time is 0 divides by zero here and ends up
# with inf/NaN in 'ppm'.
df['ppm'] = (df['max_change'] / (df['max_time'] / 60)).round(4)

In [10]:
#dftest = df.loc[(df['period'] == 111)]

In [11]:
#dftest.head(100)

In [12]:
def getPeriodDirection(period):
    """Return the median of the non-zero ``change`` values of one period.

    Reads the notebook-level ``df``. ``period`` is compared for equality
    against ``df['period']``; rows whose ``change`` is exactly 0.0 are left
    out before the median is taken.
    """
    in_period = df['period'] == period
    has_change = df['change'] != 0.0
    return df.loc[in_period & has_change, 'change'].median()

# Where change is exactly 0.0 (repeated measurements with no battery change),
# substitute the mean change of the row's period so the row still carries a
# sign that can be turned into a direction. Non-zero changes are kept as is.
df['change'] = df['change'].where(
    df['change'] != 0.0,
    df.groupby('period')['change'].transform('mean'),
)



In [13]:
# Force change back to 0.0 on every row flagged boundary == 0 (per the
# original note: same device, but a new period starts there).
changeBoundary = df['boundary'].eq(0)
df.loc[changeBoundary, 'change'] = 0.0




In [14]:
# Sign of the change as an integer direction: 1 for a positive change, -1 for
# a negative one. Anything else (exactly 0.0, or NaN) gets the sentinel 999,
# meaning "direction still unknown". Vectorised with np.select instead of a
# per-row Python lambda, which matters on a frame with millions of rows.
df['direction'] = np.select(
    [df['change'] > 0.0, df['change'] < 0.0],
    [1, -1],
    default=999,
)


In [15]:
# Rows that still carry the sentinel direction 999 need a real direction:
#   * the first sample of a period has nothing before it, so it takes the
#     direction of the next resolved sample of the same period;
#   * a sample whose battery simply did not change takes the direction of the
#     closest resolved sample before it in the same period.
#
# The fill is done per period, so a direction is never borrowed from a
# neighbouring period, and every row in a run of unresolved rows is filled
# (a plain shift() looks exactly one row away and would copy the sentinel
# itself when two unresolved rows are adjacent).
directionRangeAll = df['direction'].eq(999)

# Turn the sentinel into NaN so the fill methods treat it as missing.
resolved = df['direction'].mask(directionRangeAll)
resolved = resolved.groupby(df['period']).ffill()  # look back within the period
resolved = resolved.groupby(df['period']).bfill()  # leading rows look forward

# A period without a single resolved sample has nothing to copy from; keep
# the 999 sentinel there so such rows stay recognisable.
df['direction'] = resolved.fillna(999)

In [16]:
# Keep only the columns needed downstream; the intermediate helper columns
# used to derive size / ppm / direction are dropped.
dfx = df.drop(
    columns=[
        "boundary",
        "change",
        "change_acc",
        "time_diff",
        "time_acc",
        "max_change",
        "max_time",
    ]
)
#dfx = df.drop(["time_diff", "max_change", "max_time"], axis=1)
#dfx = df


In [17]:
# Keep only the rows of periods holding between 10 and 100 samples
# (both bounds inclusive).
dfx = dfx.loc[dfx['size'].between(10, 100)]




In [94]:
# Samples per period after filtering, with a quick min / mean / max summary.
dfz = dfx.groupby('period')['period'].count().reset_index(name='count')
print(f"Min: {dfz['count'].min()!s}")
print(f"Mean: {dfz['count'].mean()!s}")
print(f"Max: {dfz['count'].max()!s}")

Min: 10
Mean: 32.399579287322865
Max: 100


In [95]:
# Persist the filtered frame as an uncompressed parquet file.
# NOTE(review): the execution counts show this cell (and the summary cell
# before it) ran out of order relative to the surrounding cells; confirm the
# notebook still produces this file under Restart & Run All.
dfx.to_parquet('datasets/samplesPPM.parquet', compression='none') 

In [18]:
# Spot check: all rows of period 802384.
dfx[dfx['period'].eq(802384)]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,period,size,ppm,direction
7934159,7051617,14086,2018-08-18 02:10:32,21,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934160,7054136,14086,2018-08-18 02:10:34,20,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934161,7054139,14086,2018-08-18 02:10:35,19,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934162,7054143,14086,2018-08-18 02:10:36,18,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934163,7054145,14086,2018-08-18 02:10:37,17,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934164,7054146,14086,2018-08-18 02:10:38,16,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934165,7054149,14086,2018-08-18 02:10:39,15,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934166,7054151,14086,2018-08-18 02:10:40,13,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934167,7054153,14086,2018-08-18 02:10:41,12,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0
7934168,7054134,14086,2018-08-18 02:10:42,11,WIFI,109,1,AMERICA/MEXICO_CITY,mx,802384,20,63.157902,-1.0


In [20]:
# Spot check: all rows of period 565256.
dfx[dfx['period'].eq(565256)]

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,period,size,ppm,direction
5798747,5472681,5572,2018-07-01 13:11:51,16,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798748,5472680,5572,2018-07-01 13:12:00,17,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798749,5472678,5572,2018-07-01 13:12:11,18,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798750,5472676,5572,2018-07-01 13:12:20,19,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798751,5472675,5572,2018-07-01 13:12:31,20,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798752,5472673,5572,2018-07-01 13:12:40,21,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798753,5472753,5572,2018-07-01 13:12:51,22,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798754,5472752,5572,2018-07-01 13:13:00,23,HSPA,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798755,5472647,5572,2018-07-01 13:13:11,24,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
5798756,5472646,5572,2018-07-01 13:13:20,25,UTMS,10,1,AFRICA/PORTO-NOVO,bj,565256,17,6.0,1.0
