In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

from datetime import timedelta
from utils import downcast

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
# Apply seaborn's 'whitegrid' style to figures drawn after this call.
sns.set_style('whitegrid')
# (width, height) tuple, presumably meant to be passed as a matplotlib `figsize`.
# NOTE(review): not referenced in the cells visible here — confirm it is used later.
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the Parquet file; forwarded unchanged to ``pq.read_table``.
    columns : list of str, optional
        Forwarded to ``pq.read_table``. ``None`` (the default) passes no
        column selection.
    use_threads : bool, default True
        Forwarded to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame
        The table converted with ``strings_to_categorical=True``.

    Raises
    ------
    Exception
        Whatever ``pq.read_table`` or ``Table.to_pandas`` raises propagates to
        the caller. Errors used to be printed and swallowed, which made the
        function return ``None`` and moved the failure to the first cell that
        touched the result, far from the real cause.
    """
    table = pq.read_table(path, columns=columns, use_threads=use_threads)
    return table.to_pandas(strings_to_categorical=True)

In [3]:
# Load the periods table, run it through the project `downcast` helper
# (presumably shrinks dtypes — see utils), then discard the columns that the
# rest of this notebook does not use.
df = (
    downcast(load_df('datasets/periods.parquet.gzip'))
    .drop(columns=['network_status', 'screen_on', 'boundary'])
)

In [4]:
# Device lookup table: only the key (`id`) and the descriptive columns are read,
# then the frame is passed through the project `downcast` helper.
devices = downcast(
    pd.read_csv(
        'datasets/devices.csv',
        usecols=['id', 'manufacturer', 'brand', 'os_version'],
    )
)

In [5]:
# NOTE(review): disabled experiment — nothing in this cell runs.
# If re-enabled, it would store a running total of `time_diff` in `period_acc`
# and then blank that value on every row whose `period` differs from the
# previous row. The cumsum spans the whole frame rather than restarting per
# period — confirm that is intended before reviving it, or delete the cell.
# df['period_acc'] = df['time_diff'].cumsum()
# df.loc[df['period'] != df['period'].shift(), 'period_acc'] = None

In [6]:
# Row count of each period, broadcast back onto every row of that period.
# The column is called 'size', so it must be read as df['size']; attribute
# access (df.size) still returns the DataFrame's element count.
df['size'] = (
    df['period']
    .groupby(df['period'])
    .transform('size')
)

In [7]:
# Keep only rows belonging to periods with 10 to 100 rows (both bounds inclusive).
# NOTE(review): the 10/100 thresholds are not explained in the notebook — document why.
# .copy() makes the filtered frame an independent object, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[(df['size'] >= 10) & (df['size'] <= 100)].copy()

In [8]:
# Sign of each row's `change`: +1 when change >= 0, otherwise -1.
# Vectorized replacement for a per-row Python lambda; NaN compares False under
# `>=`, so missing values still map to -1 exactly as before.
df['direction'] = np.where(df['change'] >= 0, 1, -1)

In [9]:
# Largest absolute `change_acc` observed within each period, repeated on every
# row of that period.
df['max_change'] = (
    df['change_acc']
    .abs()
    .groupby(df['period'])
    .transform('max')
)

In [10]:
# Largest `time_acc` observed within each period, repeated on every row of
# that period.
df['max_time'] = df.groupby('period')['time_acc'].transform('max')

In [11]:
# Per-period rate: max_change divided by (max_time / 60), rounded to 4 decimals.
# NOTE(review): presumably "change per minute" with time_acc in seconds — confirm units.
# NOTE(review): max_change is an absolute value, yet the grouped means printed at
# the end of the notebook are negative for some brands; that can only happen when
# max_time is negative, and max_time == 0 would yield inf. Verify time_acc.
df['ppm'] = (df['max_change'] / (df['max_time'] / 60)).round(4)

In [12]:
# Attach device metadata to every row; a left join keeps all rows of df even
# when no matching device exists.
# NOTE(review): assumes devices['id'] is unique — duplicate ids would multiply
# rows here. Also, because df is rebound, re-running this cell merges again.
df = df.merge(
    devices,
    how='left',
    left_on='device_id',
    right_on='id',
)

In [13]:
# Independent copy holding only the columns needed for the per-period summary.
gf = df.loc[
    :,
    ['period', 'size', 'direction', 'ppm', 'brand', 'os_version'],
].copy()

In [14]:
# Collapse to one row per period and renumber the index from 0.
# drop_duplicates keeps the first occurrence, so `direction` is the value from
# each period's first row; `size` and `ppm` were built with per-period
# transforms and are therefore the same on every row of a period.
gf = gf.drop_duplicates(subset='period').reset_index(drop=True)

In [18]:
# Preview the first five per-period rows.
gf.head(n=5)

Unnamed: 0,period,size,direction,ppm,brand,os_version
0,1,11,-1,0.2026,lge,6.0.1
1,2,49,-1,0.1914,lge,6.0.1
2,4,47,1,0.1571,lge,6.0.1
3,5,11,-1,0.0601,lge,6.0.1
4,6,32,1,0.0522,lge,6.0.1


In [16]:
# The ten brands with the most periods; value_counts() sorts by descending
# count, so the first ten index labels are the most frequent brands.
brands = list(gf['brand'].value_counts().index[:10])

In [17]:
# Mean ppm per (brand, direction) for the top brands only.
# NOTE(review): a plain mean is sensitive to outliers, and the result contains
# negative values and one value above 5 — inspect the ppm distribution before
# drawing conclusions from these averages.
(
    gf.loc[gf['brand'].isin(brands)]
    .groupby(['brand', 'direction'])['ppm']
    .mean()
)

brand     direction
bq        -1           0.087999
           1          -0.384124
huawei    -1           0.227981
           1           0.008395
lenovo    -1           0.053790
           1          -0.072871
lge       -1           0.173712
           1           0.073622
motorola  -1           0.033732
           1           0.056201
oneplus   -1          -0.049000
           1           0.119840
samsung   -1           0.064320
           1           0.070546
tcl       -1           0.056982
           1           0.028410
vodafone  -1           0.008505
           1          -0.922912
zte       -1           0.042612
           1           5.338967
Name: ppm, dtype: float32