In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# visualization settings: a figure-size tuple and the seaborn grid theme
figsize = (8, 6)
sns.set_style('whitegrid')

In [3]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the Parquet file; forwarded to ``pq.read_table``.
    columns : optional
        Forwarded to ``pq.read_table`` as ``columns``. Defaults to ``None``.
    use_threads : bool, default True
        Forwarded to ``pq.read_table`` as ``use_threads``.

    Returns
    -------
    The result of ``Table.to_pandas(strings_to_categorical=True)``.

    Raises
    ------
    Any exception raised while reading or converting is propagated unchanged.
    Failures are deliberately not swallowed: returning ``None`` on error would
    only make the next statement that uses the frame fail with an unrelated,
    harder-to-diagnose error.
    """
    table = pq.read_table(path, columns=columns, use_threads=use_threads)
    return table.to_pandas(strings_to_categorical=True)

In [4]:
# Read the periods dataset, run it through the project's `downcast` helper
# (imported from utils; presumably shrinks numeric dtypes -- confirm there),
# and discard three columns in the same expression.
df = downcast(load_df('datasets/periods.parquet')).drop(
    columns=['network_status', 'screen_on', 'boundary']
)

KeyboardInterrupt: 

In [None]:
# preview the first 100 rows of the loaded frame
df.head(n=100)

In [None]:
# number of rows that share each row's `period` value
df['size'] = df['period'].groupby(df['period']).transform('size')

In [None]:
# Sign of the change: 1 when `change` is >= 0, otherwise -1.
# Vectorised with np.where instead of a per-row Python lambda; NaN compares
# False against 0.0, so missing values still map to -1 as before.
df['direction'] = np.where(df['change'] >= 0.0, 1, -1)

In [None]:
# largest absolute accumulated change within each period, broadcast to every row
df['max_change'] = (
    df['change_acc']
    .abs()
    .groupby(df['period'])
    .transform('max')
)

In [None]:
# largest accumulated time within each period, broadcast to every row
df['max_time'] = df.groupby('period')['time_acc'].transform('max')

In [None]:
# max_change divided by (max_time / 60), rounded to 4 decimals.
# Presumably `time_acc` is in seconds, making this a per-minute rate -- TODO confirm.
# NOTE(review): a period whose max_time is 0 would produce inf/NaN here; confirm
# whether such periods can occur upstream.
df['ppm'] = (df['max_change'] / (df['max_time'] / 60)).round(4)

In [None]:
# Build the export frame: drop the raw and intermediate columns, then cast the
# three identifier columns to int32 in a single pass. This replaces three
# copy-pasted cast stanzas that also left temporary variables in the namespace.
dfx = df.drop(
    columns=[
        "timestamp", "battery_level", "change", "change_acc",
        "time_diff", "time_acc", "max_change", "max_time",
    ]
).astype({'device_id': np.int32, 'id': np.int32, 'period': np.int32})

In [None]:
# keep only rows whose period contains between 10 and 100 rows (both bounds inclusive)
dfx = dfx[dfx['size'].between(10, 100)]
print(dfx)

In [None]:
# inspect every row of one specific period in the unfiltered frame
df[df['period'].eq(981877)]

In [None]:
# Write the filtered feature frame to disk as Parquet.
# compression='none' is handed to the Parquet engine as-is -- presumably this
# disables compression; confirm against the engine in use.
dfx.to_parquet('datasets/ppm.parquet', compression='none') 