In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
def typecast_objects(gl_obj):
    gl_obj = gl_obj.apply(lambda x: x.str.strip())
    gl_obj = gl_obj.apply(lambda x: x.str.lower())
    # convert object to category columns
    # when unique values < 50% of total
    converted_obj = pd.DataFrame()
    for col in gl_obj.columns:
        num_unique_values = len(gl_obj[col].unique())
        num_total_values = len(gl_obj[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:, col] = gl_obj[col].astype('category')
        else:
            converted_obj.loc[:, col] = gl_obj[col]
    return converted_obj


def save_df(df, path, compression='snappy', use_dictionary=True):
    """
    Save a pandas DataFrame to a parquet file
    """
    try:
        df.to_parquet(path, compression=compression,
                      use_dictionary=use_dictionary)
        print(path, 'created!')
    except Exception as e:
        print(e)

In [8]:
cols = ['id', 'device_id', 'timestamp', 'battery_state', 'battery_level',
'network_status', 'screen_brightness', 'screen_on','timezone', 'country_code']

df = pd.read_csv('0-original-csv/samplesNew.csv', usecols=cols, parse_dates=['timestamp'])

df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23273115 entries, 0 to 23273114
Data columns (total 10 columns):
id                   object
device_id            object
timestamp            object
battery_state        object
battery_level        object
network_status       object
screen_brightness    object
screen_on            object
timezone             object
country_code         object
dtypes: object(10)
memory usage: 1.7+ GB


In [10]:
df.tail(100)

Unnamed: 0,id,device_id,timestamp,battery_state,battery_level,network_status,screen_brightness,screen_on,timezone,country_code
23273015,23467535,79646,2019-07-29 12:32:13,Discharging,0.97,WIFI,255,1,America/New_York,us
23273016,23467536,79646,2019-07-29 12:37:14,Discharging,0.96,WIFI,255,1,America/New_York,us
23273017,23467537,79646,2019-07-29 12:41:45,Discharging,0.95,WIFI,255,1,America/New_York,us
23273018,23467538,79646,2019-07-29 12:46:45,Discharging,0.94,WIFI,255,1,America/New_York,us
23273019,23467539,79646,2019-07-29 12:56:51,Discharging,0.93,WIFI,255,0,America/New_York,us
23273020,23467540,79646,2019-07-29 13:11:50,Discharging,0.92,WIFI,255,0,America/New_York,us
23273021,23467541,79646,2019-07-29 13:20:07,Discharging,0.91,WIFI,255,0,America/New_York,us
23273022,23467542,79646,2019-07-29 13:24:47,Discharging,0.9,WIFI,255,0,America/New_York,us
23273023,23467543,79646,2019-07-29 13:37:37,Discharging,0.89,WIFI,255,0,America/New_York,us
23273024,23467544,79646,2019-07-29 13:51:52,Discharging,0.88,WIFI,255,1,America/New_York,us


In [5]:
# sorting
df = df.sort_values(by=['device_id', 'timestamp'])

# date filtering
df = df[pd.Timestamp('2017-10-15') <= df.timestamp]

# reset indexes
df = df.reset_index(drop=True)

# explicitly cast battery level to integer
df_level = df.battery_level * 100
converted_level = df_level.astype(np.uint8)

# downcast integer columns
df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')

# downcast float columns
df_float = df.select_dtypes(include=['float']).drop('battery_level', axis=1)
converted_float = df_float.apply(pd.to_numeric, downcast='float')

# convert object to category columns
# when unique values < 50% of total
df_obj = df.select_dtypes(include=['object'])
converted_obj = typecast_objects(df_obj)

# transform optimized types
df[converted_int.columns] = converted_int
df[converted_float.columns] = converted_float
df[converted_obj.columns] = converted_obj
df['battery_level'] = converted_level

# filter out malformed records
df = df[df.battery_level <= 100]

KeyboardInterrupt: 

In [11]:
save_df(df, '1-parquet-files/samples.parquet')

parquet_files/samples.parquet created!


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11503813 entries, 0 to 11503816
Data columns (total 10 columns):
id                   uint32
device_id            uint16
timestamp            datetime64[ns]
battery_state        category
battery_level        uint8
network_status       category
screen_brightness    int64
screen_on            uint8
timezone             category
country_code         category
dtypes: category(4), datetime64[ns](1), int64(1), uint16(1), uint32(1), uint8(2)
memory usage: 416.9 MB
