In [7]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [8]:
def typecast_objects(gl_obj):
    gl_obj = gl_obj.apply(lambda x: x.str.strip())
    gl_obj = gl_obj.apply(lambda x: x.str.lower())
    # convert object to category columns
    # when unique values < 50% of total
    converted_obj = pd.DataFrame()
    for col in gl_obj.columns:
        num_unique_values = len(gl_obj[col].unique())
        num_total_values = len(gl_obj[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:, col] = gl_obj[col].astype('category')
        else:
            converted_obj.loc[:, col] = gl_obj[col]
    return converted_obj


def save_df(df, path, compression='snappy', use_dictionary=True):
    """
    Save a pandas DataFrame to a parquet file
    """
    try:
        df.to_parquet(path, compression=compression,
                      use_dictionary=use_dictionary)
        print(path, 'created!')
    except Exception as e:
        print(e)

In [9]:
cols = ['id', 'device_id', 'timestamp', 'battery_state', 'battery_level',
'network_status', 'screen_brightness', 'screen_on','timezone', 'country_code']

df = pd.read_csv('0-Original-CSV/samples.csv', usecols=cols, parse_dates=['timestamp'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11957118 entries, 0 to 11957117
Data columns (total 10 columns):
id                   int64
device_id            int64
timestamp            datetime64[ns]
battery_state        object
battery_level        float64
network_status       object
screen_brightness    int64
screen_on            int64
timezone             object
country_code         object
dtypes: datetime64[ns](1), float64(1), int64(4), object(4)
memory usage: 912.3+ MB


In [10]:
# sorting
df = df.sort_values(by=['device_id', 'timestamp'])

# date filtering
df = df[pd.Timestamp('2017-10-15') <= df.timestamp]

# reset indexes
df = df.reset_index(drop=True)

# explicitly cast battery level to integer
df_level = df.battery_level * 100
converted_level = df_level.astype(np.uint8)

# downcast integer columns
df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')

# downcast float columns
df_float = df.select_dtypes(include=['float']).drop('battery_level', axis=1)
converted_float = df_float.apply(pd.to_numeric, downcast='float')

# convert object to category columns
# when unique values < 50% of total
df_obj = df.select_dtypes(include=['object'])
converted_obj = typecast_objects(df_obj)

# transform optimized types
df[converted_int.columns] = converted_int
df[converted_float.columns] = converted_float
df[converted_obj.columns] = converted_obj
df['battery_level'] = converted_level

# filter out malformed records
df = df[df.battery_level <= 100]

In [11]:
save_df(df, 'parquet_files/samples.parquet')

parquet_files/samples.parquet created!


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11503813 entries, 0 to 11503816
Data columns (total 10 columns):
id                   uint32
device_id            uint16
timestamp            datetime64[ns]
battery_state        category
battery_level        uint8
network_status       category
screen_brightness    int64
screen_on            uint8
timezone             category
country_code         category
dtypes: category(4), datetime64[ns](1), int64(1), uint16(1), uint32(1), uint8(2)
memory usage: 416.9 MB
