In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations: seaborn 'whitegrid' theme for every plot drawn below
sns.set_style('whitegrid')
# shared figure size tuple; NOTE(review): not referenced in the cells visible here — confirm it is still needed
figsize=(8,6)

In [2]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Args:
        path: Location of the Parquet file, forwarded to ``pq.read_table``.
        columns: Optional list of column names to read; ``None`` reads
            every column.
        use_threads: Forwarded to ``pq.read_table``.

    Returns:
        The table converted with ``to_pandas()``.

    Raises:
        Exception: Whatever ``pq.read_table`` or ``to_pandas`` raised. The
            error is reported and then re-raised unchanged, so a failed load
            stops here instead of handing ``None`` to the caller.
    """
    try:
        return pq.read_table(path, columns=columns, use_threads=use_threads).to_pandas()
    except Exception as e:
        # Previously the error was only printed and None was returned, which
        # surfaced later as an unrelated AttributeError in prepare().
        print('Failed to load {!r}: {}'.format(path, e))
        raise

def downcast_ints(df):
    """Shrink the 'int' columns of ``df`` to the smallest unsigned dtype that fits.

    Columns are picked with ``select_dtypes(include=['int'])`` and each one is
    passed through ``pd.to_numeric(..., downcast='unsigned')``. The converted
    columns are written back into ``df`` itself, and that same object is
    returned.

    Args:
        df: DataFrame to modify in place.

    Returns:
        The same ``df`` object, with its selected integer columns replaced.
    """
    selected = df.select_dtypes(include=['int'])
    # Convert column by column; non-integer columns are never touched.
    shrunk = selected.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
    df[shrunk.columns] = shrunk

    return df
        
def prepare(path,cols):
    """Load the requested columns of a Parquet file and compact its integers.

    Args:
        path: Parquet file to read via ``load_df``.
        cols: Column names to load.

    Returns:
        The loaded DataFrame with a fresh 0..n-1 index (old index dropped)
        and its integer columns processed by ``downcast_ints``.
    """
    loaded = load_df(path, cols)
    # Discard whatever index came with the file before downcasting.
    return downcast_ints(loaded.reset_index(drop=True))


In [3]:
cols = ['sample_id', 'bluetooth_enabled', 'location_enabled', 'power_saver_enabled', 'flashlight_enabled', 'nfc_enabled', 'unknown_sources', 'developer_mode']
df = prepare('1-parquet-files/settings.parquet',cols)

# Every requested column except the identifier is a settings flag.
flag_cols = [name for name in cols if name != 'sample_id']

# fix unsigned int: prepare() downcasts integers with downcast='unsigned',
# so store each flag as signed int8 instead (sample_id is left untouched).
# NOTE(review): astype(np.int8) does not range-check — assumes flag values
# fit in int8 (the preview shows only 0/1); confirm against the data.
for name in flag_cols:
    df[name] = df[name].astype(np.int8)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23600501 entries, 0 to 23600500
Data columns (total 8 columns):
sample_id              uint32
bluetooth_enabled      int8
location_enabled       int8
power_saver_enabled    int8
flashlight_enabled     int8
nfc_enabled            int8
unknown_sources        int8
developer_mode         int8
dtypes: int8(7), uint32(1)
memory usage: 247.6 MB


In [4]:
# Preview the first 100 rows as a visual sanity check of the converted columns.
df.head(100)

Unnamed: 0,sample_id,bluetooth_enabled,location_enabled,power_saver_enabled,flashlight_enabled,nfc_enabled,unknown_sources,developer_mode
0,1,0,1,0,0,0,0,0
1,2,0,1,0,0,0,0,0
2,3,0,1,0,0,0,0,0
3,4,0,1,0,0,0,0,0
4,5,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
95,96,0,0,0,0,1,1,0
96,97,0,0,0,0,1,1,0
97,98,0,0,0,0,1,1,0
98,99,0,0,0,0,1,1,0


In [5]:
# Persist the prepared frame without compression. pandas documents None as the
# way to disable compression; the string 'none' is only understood by some engines.
df.to_parquet('2-datasets/settings.parquet', compression=None)