# Samples cleaning

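This task loads the raw CSV files previously fetched from the remote database, cleans them, and saves the result as compressed Parquet pages, with the page metadata stored in MongoDB.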

## Import libraries

In [1]:
import sys

sys.path.append('..')

In [2]:
import os
import gc
import yaml
import json
import requests
import numpy as np
import pandas as pd

from pymongo import MongoClient

from utils import get_files, memory_usage, \
correct_encoding, update_page_metadata

## Load the data

In [3]:
# Project directories, resolved relative to the current working directory.
BASE_PATH = '../../'

CONFIG_DIR = os.path.join(BASE_PATH, 'config')
STORAGE_DIR = os.path.join(BASE_PATH, 'storage')
DATA_DIR = os.path.join(BASE_PATH, 'data')

# Read the environment settings inside a context manager so the file handle
# is closed deterministically instead of being left to the garbage collector.
# NOTE(review): yaml.safe_load would be enough if env.yml only holds plain
# scalars/mappings -- confirm before switching loaders.
with open(os.path.join(CONFIG_DIR, 'env.yml')) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [4]:
model = 'samples'
model_storage = os.path.join(STORAGE_DIR, model)

# exist_ok avoids the check-then-create race of exists() followed by makedirs()
os.makedirs(model_storage, exist_ok=True)

# Pass the credentials as keyword arguments rather than formatting them into
# the URI: reserved characters in the username/password (':', '/', '@', '%')
# would otherwise have to be percent-encoded for the URI to parse correctly.
client = MongoClient(host=config['MONGO_HOST'],
                     port=int(config['MONGO_PORT']),
                     username=config['MONGO_USERNAME'],
                     password=config['MONGO_PASSWORD'])
metadata_db = client[config['MONGO_DATABASE']]

model_metadata = metadata_db[model]

# start from a single, empty metadata document for this model
model_metadata.delete_many({})
model_metadata.insert_one({'pages': []})

<pymongo.results.InsertOneResult at 0x7fe9a6fb8400>

In [5]:
# path pattern of the raw csv files for this model, handed to get_files
raw_csv_pattern = os.path.join(DATA_DIR, 'raw', model, '*.csv')
csv_files = get_files(raw_csv_pattern)

In [6]:
# fail fast when there is nothing to process
if not csv_files:
    raise FileNotFoundError(
        "Couldn't find any csv files! Please make sure the filepath exists")

In [7]:
# peek at the first 100 rows of the first csv file to inspect columns and dtypes
sample_path = csv_files[0]
df = pd.read_csv(sample_path, sep=';', nrows=100, parse_dates=['timestamp'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 38 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    100 non-null    int64         
 1   device_id             100 non-null    int64         
 2   timestamp             100 non-null    datetime64[ns]
 3   battery_state         100 non-null    object        
 4   battery_level         100 non-null    float64       
 5   timezone              100 non-null    object        
 6   country_code          100 non-null    object        
 7   memory_active         100 non-null    int64         
 8   memory_inactive       100 non-null    int64         
 9   memory_free           100 non-null    int64         
 10  memory_user           100 non-null    int64         
 11  charger               100 non-null    object        
 12  health                100 non-null    object        
 13  voltage              

In [8]:
# summary statistics for the numeric columns of the 100-row sample
df.describe(include=[np.number])

Unnamed: 0,id,device_id,battery_level,memory_active,memory_inactive,memory_free,memory_user,voltage,temperature,usage,...,roaming_enabled,bluetooth_enabled,location_enabled,power_saver_enabled,nfc_enabled,developer_mode,free,total,free_system,total_system
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,1.87,71.38,675347.4,483018.36,1574720.0,65265.84,4.1195,28.771,0.3209,...,0.0,0.0,0.71,0.0,0.29,0.0,4098.3,10567.51,460.1,2699.51
std,29.011492,1.368144,31.909208,288339.0,46542.844283,208618.3,11685.824609,0.181578,4.09235,2.246707,...,0.0,0.0,0.456048,0.0,0.456048,0.0,332.706001,1468.018581,278.189293,1085.850339
min,1.0,1.0,5.0,416132.0,401032.0,1442060.0,46156.0,3.79,21.4,-21.43,...,0.0,0.0,0.0,0.0,0.0,0.0,3789.0,9634.0,27.0,1009.0
25%,25.75,1.0,40.75,491316.0,417181.0,1442060.0,58796.0,3.93,27.3,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,3919.0,9634.0,27.0,1009.0
50%,50.5,1.0,88.0,503920.0,504364.0,1442060.0,62174.0,4.175,27.7,0.695,...,0.0,0.0,1.0,0.0,0.0,0.0,3921.0,9634.0,637.0,3390.0
75%,75.25,4.0,94.0,1111631.0,515056.0,1899508.0,67296.0,4.29,32.025,0.875,...,0.0,0.0,1.0,0.0,1.0,0.0,4611.0,12853.0,637.0,3390.0
max,100.0,4.0,100.0,1141176.0,545980.0,1899508.0,117900.0,4.32,37.2,2.21,...,0.0,0.0,1.0,0.0,1.0,0.0,4611.0,12853.0,637.0,3390.0


In [9]:
# summary statistics for the non-numeric columns of the 100-row sample
df.describe(exclude=[np.number])

Unnamed: 0,timestamp,battery_state,timezone,country_code,charger,health,network_status,network_type,mobile_network_type,mobile_data_status,mobile_data_activity,wifi_status
count,100,100,100,100,100,100,100,100,100,100,100,100
unique,98,3,2,2,2,1,3,3,1,2,3,1
top,2017-10-08 12:50:04,Charging,America/Chicago,us,ac,Good,lte,MOBILE,lte,disconnected,none,enabled
freq,2,74,71,71,76,100,36,36,100,64,57,100
first,2017-10-08 12:50:04,,,,,,,,,,,
last,2017-10-09 03:51:20,,,,,,,,,,,


In [10]:
# look at the raw network-related columns side by side
network_columns = ['network_status', 'network_type', 'mobile_network_type',
                   'mobile_data_status', 'mobile_data_activity', 'wifi_status']
df[network_columns].sample(20, random_state=0)

Unnamed: 0,network_status,network_type,mobile_network_type,mobile_data_status,mobile_data_activity,wifi_status
26,WIFI,WIFI,lte,disconnected,none,enabled
86,disconnected,unknown,lte,disconnected,inout,enabled
2,WIFI,WIFI,lte,disconnected,none,enabled
55,lte,MOBILE,lte,connected,none,enabled
75,disconnected,unknown,lte,disconnected,inout,enabled
93,disconnected,unknown,lte,disconnected,inout,enabled
16,WIFI,WIFI,lte,disconnected,none,enabled
73,disconnected,unknown,lte,disconnected,inout,enabled
54,lte,MOBILE,lte,connected,in,enabled
95,disconnected,unknown,lte,disconnected,inout,enabled


## Basic data pre-processing

In [11]:
def save_df_page(page, chunks, collection, exclude=('id',)):
    """Merge the buffered chunks into one page and persist it.

    The ``nrows`` counter of the metadata document is incremented, the page
    metadata is recorded through ``update_page_metadata`` and the page is
    written to ``<model_storage>/<model>.<page>.parquet``. ``model_storage``
    and ``model`` are read from the notebook's global scope.

    Args:
        page: index of the page being written; used in the file name.
        chunks: list of DataFrames to concatenate into the page.
        collection: Mongo collection holding the model metadata document.
        exclude: currently unused; kept so existing calls keep working. The
            default is an immutable tuple so it cannot be shared and mutated
            across calls.

    Returns:
        The index to use for the next page (``page + 1``).

    Raises:
        ValueError: raised by ``pd.concat`` when ``chunks`` is empty.
    """
    # concatenate data chunks -> careful benchmark this
    df = pd.concat(chunks, axis=0, ignore_index=True)

    collection.update_one({}, {'$inc': {'nrows': df.shape[0]}})

    # store page metadata in mongo
    update_page_metadata(collection, df)

    # save output to a parquet file with brotli compression
    page_path = os.path.join(model_storage,
                             '{}.{}.parquet'.format(model, page))
    df.to_parquet(page_path, compression='brotli', index=False)

    return page + 1

In [12]:
# extrapolate the sample's memory footprint to an assumed 5,000,000 rows
assumed_total_rows = 5000000
round(memory_usage(df) * assumed_total_rows / df.shape[0], 2)

4500.0

In [13]:
# target dtype per column; applied to every cleaned file with df.astype(mappings)
mappings = {
    'id': 'uint32',
    'device_id': 'uint32',
    'battery_level': 'uint8',
    'memory_active': 'uint32',
    'memory_inactive': 'uint32',
    'memory_free': 'uint32',
    'memory_user': 'uint32',
    'usage': 'uint8',
    'screen_on': 'bool',
    'wifi_signal_strength': 'int16',
    'wifi_link_speed': 'int16',
    'screen_brightness': 'int16',
    'roaming_enabled': 'bool',
    'bluetooth_enabled': 'bool',
    'location_enabled': 'bool',
    'power_saver_enabled': 'bool',
    'nfc_enabled': 'bool',
    'developer_mode': 'bool',
    'free': 'uint32',
    'total': 'uint32',
    'free_system': 'uint32',
    'total_system': 'uint32',
}

# keep the dtype mapping alongside the rest of the model metadata
model_metadata.update_one({}, {'$set': {'mappings': mappings}})

<pymongo.results.UpdateResult at 0x7fe9a8182b80>

In [14]:
# running totals accumulated across all csv files
total_rows = 0
total_memory_usage = 0.0
last_id = 0

# paging state: buffered chunks and the memory they hold since the last page
page = 0
chunks = []
memory_usage_split = 0.0

# every sampled column except country_code is read from the csv files
features = [name for name in df.columns.to_list() if name != 'country_code']

string_columns = df[features].select_dtypes(
    include='object').columns.to_list()

# raw network columns, dropped from each file after the derived flags are built
exclude_columns = [
    'network_status',
    'network_type',
    'mobile_network_type',
    'mobile_data_status',
    'mobile_data_activity',
    'wifi_status',
]

# the remaining string columns are stored as pandas categoricals
kept_string_columns = set(string_columns) - set(exclude_columns)
category_columns = list(kept_string_columns)

In [15]:
# report how many raw files are about to be processed
csv_file_count = len(csv_files)
print('Total csv files: {}'.format(csv_file_count))

Total csv files: 165


In [16]:
# Clean every raw csv file, buffer the cleaned frames in `chunks`, and write a
# parquet page whenever the buffered memory reaches config['MEMORY_USAGE_SPLIT'].
# Chunks still buffered when the loop ends are NOT written here.
for filepath in csv_files:
    # only the columns in `features` are read (country_code is left out)
    df = pd.read_csv(filepath, sep=';', usecols=features,
                     parse_dates=['timestamp'])

    # drop rows with a missing value in any column
    df = df.dropna()

    # process features which can drop values first:
    # keep samples timestamped between 2017-09-01 and 2020-08-01
    df = df[df['timestamp'].between(pd.Timestamp(
        2017, 9, 1), pd.Timestamp(2020, 8, 1))]

    # battery level: discard readings above 100
    df = df[df['battery_level'] <= 100]

    # temperature: keep values within the 1st-99th percentile range; the
    # percentiles are computed per csv file, so the cut-offs vary between files
    lower, upper = df['temperature'].quantile([0.01, 0.99])
    df = df[df['temperature'].between(lower, upper)]

    # voltage: readings above 1000 are divided by 1000
    # (presumably millivolts reported instead of volts -- TODO confirm)
    df.loc[df['voltage'] > 1000, 'voltage'] = df['voltage'] / 1000

    # cpu usage: take the absolute value and scale by 100; results above 100
    # are scaled back by 100 (presumably they were already percentages --
    # TODO confirm) and rows that are still above 100 are dropped
    df.loc[:, 'usage'] = abs(df['usage']) * 100
    df.loc[df['usage'] > 100, 'usage'] = df['usage'] / 100
    df = df.drop(df[df['usage'] > 100].index, axis=0)

    # basic string pre-processing: convert to lowercase and strip blank chars
    # (the comparisons below therefore use lowercase literals)
    for column in string_columns:
        df.loc[:, column] = [x.lower().strip() for x in df[column].tolist()]

    # wifi_enabled: wifi_status is 'enabled' or 'enabling'
    df['wifi_enabled'] = False
    df.loc[df['wifi_status'].isin(
        ['enabled', 'enabling']), 'wifi_enabled'] = True

    # mobile_enabled: network_type starts with 'mobile'
    df['mobile_enabled'] = False
    df.loc[df['network_type'].str.startswith(
        'mobile'), 'mobile_enabled'] = True

    # wifi_active: network_status is 'wifi' or network_type starts with 'wifi'
    df['wifi_active'] = False
    df.loc[(df['network_status'] == 'wifi') | (
        df['network_type'].str.startswith('wifi')), 'wifi_active'] = True

    # mobile_active: mobile_enabled, network_status is not 'disconnected' and
    # mobile_data_status is 'connecting' or 'connected'
    df['mobile_active'] = False
    df.loc[df['mobile_enabled'] & (df['network_status'] != 'disconnected') & (
        df['mobile_data_status'].isin(['connecting', 'connected'])), 'mobile_active'] = True

    # the raw network columns are no longer needed once the flags exist
    df = df.drop(exclude_columns, axis=1)

    # remaining string columns become categoricals; the other columns are cast
    # to the dtypes declared in `mappings`
    df[category_columns] = df[category_columns].astype('category')
    df = df.astype(mappings)

    # update the running totals; memory_usage_split only counts what has been
    # buffered since the last page was written
    total_rows += df.shape[0]
    page_memory_usage = memory_usage(df)
    total_memory_usage += page_memory_usage
    memory_usage_split += page_memory_usage

    # highest sample id seen so far
    last_id = max(last_id, df['id'].max())

    chunks.append(df)

    # write a page once the buffered chunks reach the configured threshold
    if memory_usage_split >= config['MEMORY_USAGE_SPLIT']:
        print('Page {} created!'.format(page))
        # save_df_page returns the index of the next page
        page = save_df_page(page, chunks, model_metadata)

        # release the buffered frames before starting a new buffer
        del chunks[:]
        del chunks
        gc.collect()

        chunks = []
        memory_usage_split = 0.0

    print('{} ✔'.format(filepath))

../../data/raw/samples/samples.query.1.csv ✔
../../data/raw/samples/samples.query.2.csv ✔
Page 0 created!
../../data/raw/samples/samples.query.3.csv ✔
../../data/raw/samples/samples.query.4.csv ✔
../../data/raw/samples/samples.query.5.csv ✔
Page 1 created!
../../data/raw/samples/samples.query.6.csv ✔
../../data/raw/samples/samples.query.7.csv ✔
../../data/raw/samples/samples.query.8.csv ✔
Page 2 created!
../../data/raw/samples/samples.query.9.csv ✔
../../data/raw/samples/samples.query.10.csv ✔
../../data/raw/samples/samples.query.11.csv ✔
Page 3 created!
../../data/raw/samples/samples.query.12.csv ✔
../../data/raw/samples/samples.query.13.csv ✔
../../data/raw/samples/samples.query.14.csv ✔
Page 4 created!
../../data/raw/samples/samples.query.15.csv ✔
../../data/raw/samples/samples.query.16.csv ✔
../../data/raw/samples/samples.query.17.csv ✔
Page 5 created!
../../data/raw/samples/samples.query.18.csv ✔
../../data/raw/samples/samples.query.19.csv ✔
../../data/raw/samples/samples.query.20

../../data/raw/samples/samples.query.160.csv ✔
../../data/raw/samples/samples.query.161.csv ✔
../../data/raw/samples/samples.query.162.csv ✔
../../data/raw/samples/samples.query.163.csv ✔
Page 52 created!
../../data/raw/samples/samples.query.164.csv ✔
../../data/raw/samples/samples.query.165.csv ✔


In [17]:
# Write the chunks still buffered after the last file; they never reached the
# split threshold inside the loop.
if chunks:
    # keep the page counter in sync with what has been written
    page = save_df_page(page, chunks, model_metadata)

    # Rebind to an empty list instead of deleting the name, so re-running this
    # cell is a no-op rather than a NameError.
    chunks = []
    gc.collect()

In [18]:
# record the run totals in the model metadata document;
# int() turns last_id into a plain Python int before it is stored
last_id_update = {'$set': {'last_id': int(last_id)}}
total_update = {'$set': {'total': total_rows}}

model_metadata.update_one({}, last_id_update)
model_metadata.update_one({}, total_update)

<pymongo.results.UpdateResult at 0x7fe9a6700b40>

In [19]:
# totals over all processed files: rows kept, summed memory_usage(), highest id
total_rows, total_memory_usage, last_id

(46558859, 5667.880000000002, 49490940)

In [20]:
# Round before formatting: the running float sum carries representation noise
# (e.g. 5667.880000000002) that would otherwise end up in the message.
content = '**{}** | task: data pre-processing, rows: {}, memory usage: {} MB' \
    .format(model, total_rows, round(total_memory_usage, 2))

# A timeout keeps a stalled webhook from hanging the notebook indefinitely.
requests.post(config['discord_webhook_url'], data={'content': content},
              timeout=10)

<Response [204]>