In [18]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Plot defaults: seaborn "whitegrid" theme and a figure size shared by later plots.
sns.set_style("whitegrid")
figsize = (8, 6)

In [19]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the Parquet file, forwarded to ``pq.read_table``.
    columns
        Optional column selection, forwarded to ``pq.read_table``.
    use_threads
        Forwarded to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame
        The table converted with ``strings_to_categorical=True``.

    Raises
    ------
    Exception
        Any error raised while reading or converting is reported on stdout
        and then re-raised unchanged.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        return table.to_pandas(strings_to_categorical=True)
    except Exception as e:
        # Previously the error was printed and None was returned, which made
        # the next cell fail with an unrelated error. Report and propagate.
        print(f"Failed to load {path!r}: {e}")
        raise

In [20]:
# Read the per-period samples and pass them through utils.downcast
# (presumably narrows column dtypes to save memory; confirm in utils).
df = downcast(load_df('parquet_files/samplesPeriods.parquet'))
# Disabled: optional removal of context columns.
#df = df.drop(['network_status', 'screen_on', 'boundary'], axis=1)

In [21]:
# Preview the first rows (up to 100) of the loaded samples.
df.head(100)

Unnamed: 0,id,device_id,timestamp,battery_level,network_status,screen_brightness,screen_on,timezone,country_code,change,boundary,period,change_acc,time_diff,time_acc
0,447027,1,2017-10-15 18:36:46,99,LTE,-1,1,AMERICA/CHICAGO,us,0.0,0,1,0.0,0.0,0.0
1,447015,1,2017-10-15 18:41:54,98,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-1.0,308.0,308.0
2,447012,1,2017-10-15 18:46:54,97,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-2.0,300.0,608.0
3,447011,1,2017-10-15 18:50:35,96,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-3.0,221.0,829.0
4,446225,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-4.0,219.0,1048.0
5,447009,1,2017-10-15 18:54:14,95,LTE,-1,1,AMERICA/CHICAGO,us,0.0,1,1,-4.0,0.0,1048.0
6,446218,1,2017-10-15 18:57:54,94,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-5.0,220.0,1268.0
7,446217,1,2017-10-15 19:02:47,93,LTE,-1,1,AMERICA/CHICAGO,us,-1.0,1,1,-6.0,293.0,1561.0
8,443535,1,2017-10-15 19:11:41,91,LTE,-1,1,AMERICA/CHICAGO,us,-2.0,1,1,-8.0,534.0,2095.0
9,443533,1,2017-10-15 19:21:25,89,LTE,-1,1,AMERICA/CHICAGO,us,-2.0,1,1,-10.0,584.0,2679.0


In [22]:
# Number of rows in each period, broadcast back onto every row of that period.
# The column must be read as df['size']; df.size is the DataFrame attribute.
df['size'] = df['period'].groupby(df['period']).transform('size')

In [23]:
# Sign of the per-row battery change: +1 when change >= 0, otherwise -1.
# Vectorised with np.where instead of a row-wise Python lambda; NaN changes
# still map to -1 because the comparison is False for NaN.
# NOTE(review): rows with change == 0 (e.g. the first row of a period in the
# preview above) are labelled +1 even inside a falling period; confirm that a
# per-row direction, rather than a per-period one, is what is wanted.
df['direction'] = np.where(df['change'] >= 0.0, 1, -1).astype('int64')

In [24]:
# Largest absolute accumulated change reached within each period,
# repeated on every row of that period.
df['max_change'] = (
    df['change_acc']
    .abs()
    .groupby(df['period'])
    .transform('max')
)

In [25]:
# Largest accumulated time reached within each period (the preview shows
# time_acc in seconds), repeated on every row of that period.
df['max_time'] = df.groupby('period')['time_acc'].transform('max')

In [26]:
# Rate of battery change per minute for each period: max_change divided by the
# period duration in minutes (max_time / 60), rounded to 4 decimals.
# "ppm" presumably stands for percent per minute; confirm the unit of
# battery_level.
# A period whose samples all share one timestamp has max_time == 0; those
# durations are mapped to NaN so the rate is missing instead of +inf.
df['ppm'] = df['max_change'].div(df['max_time'].div(60).replace(0, np.nan)).round(4)

In [27]:
# Build the export frame: drop the raw per-sample measurements and the
# intermediate helper columns, keeping context columns plus size/direction/ppm.
dfx = df.drop(
    columns=[
        "boundary",
        "battery_level",
        "change",
        "change_acc",
        "time_diff",
        "time_acc",
        "max_change",
        "max_time",
    ]
)
# Disabled alternative: drop only the helper "size" column.
#dfx = df.drop(["size"], axis=1)

In [28]:
# Keep only rows whose period holds between 10 and 100 samples (both inclusive).
dfx = dfx[dfx['size'].between(10, 100)]


In [39]:
# Sanity check after filtering: count the rows per period and report the
# minimum, mean and maximum period length.
dfz = (
    dfx.groupby(['period'])['period']
    .count()
    .reset_index(name='count')
)
print(f"Min: {dfz['count'].min()!s}")
print(f"Mean: {dfz['count'].mean()!s}")
print(f"Max: {dfz['count'].max()!s}")

Min: 10
Mean: 32.399579287322865
Max: 100


In [40]:
# Write the filtered per-period features to Parquet without compression.
# pandas documents None (not the string 'none') as the value that disables
# compression, so this does not depend on which Parquet engine is installed.
dfx.to_parquet('datasets/samplesPPM.parquet', compression=None)