# 1. Samples pre-processing

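This task pre-processes the raw `samples` data. The raw data, fetched from the remote database given a SQL query file, is read here from the CSV files under `data/raw/samples`; page metadata is tracked in MongoDB.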

## 1.1 Import libraries

In [1]:
import sys

sys.path.append('..')

In [2]:
import os
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pymongo import MongoClient

from utils import get_csv_files, memory_usage, \
correct_encoding, update_metadata

In [3]:
def save_df_page(page, chunks, collection, exclude=('id',)):
    """Write one page of data chunks to parquet and record its metadata.

    The output path is ``<model_storage>/<model>.<page>.parquet``; both
    ``model`` and ``model_storage`` are notebook-level globals that must be
    defined before this function is called.

    Parameters
    ----------
    page
        Page number; used in the output file name.
    chunks
        Iterable of DataFrames concatenated row-wise (``axis=0``) into the page.
    collection
        Passed straight to ``update_metadata`` together with the page frame
        (presumably the Mongo collection holding the page metadata -- confirm
        against ``utils.update_metadata``).
    exclude
        Currently unused by this function; kept so existing callers keep
        working. The default is an immutable tuple rather than a shared
        mutable list.

    Returns
    -------
    ``page + 1``, i.e. the number of the next page to write.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when ``chunks`` is empty.
    """
    # TODO: benchmark this concatenation -- it copies every chunk
    page_df = pd.concat(chunks, axis=0)

    # store page metadata in mongo
    update_metadata(collection, page_df)

    # save output to a parquet file with brotli compression
    page_path = os.path.join(model_storage, '{}.{}.parquet'.format(model, page))
    page_df.to_parquet(page_path, compression='brotli')

    return page + 1

## 1.2 Load the data

In [4]:
DATA_DIR = '../../data'
STORAGE_DIR = '../../storage'

# Load the project configuration. The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open('../config.yml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

model = 'samples'
model_storage = os.path.join(STORAGE_DIR, model)

# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safe to re-run.
os.makedirs(model_storage, exist_ok=True)

# NOTE(review): the credentials are interpolated into the URI verbatim. If the
# username or password can contain reserved characters (':', '/', '@', '%'),
# they need to be percent-escaped (urllib.parse.quote_plus) first.
client = MongoClient('mongodb://{}:{}@{}:{}'.format(
    config['MONGO_USERNAME'], config['MONGO_PASSWORD'],
    config['MONGO_HOST'], config['MONGO_PORT']))
metadata_db = client[config['MONGO_DATABASE']]

model_metadata = metadata_db[model]

# Reset the metadata collection for this model: this DELETES every existing
# document and then seeds a single document with an empty list of pages.
model_metadata.delete_many({})
model_metadata.insert_one({'pages': []})

<pymongo.results.InsertOneResult at 0x7f13c0dc33c0>

In [5]:
# Path pattern for the raw CSV files of the current model
raw_csv_pattern = os.path.join(DATA_DIR, 'raw', model, '*.csv')
csv_files = get_csv_files(raw_csv_pattern)

In [6]:
# Fail fast: nothing below can run without at least one raw CSV file
no_csv_found = not csv_files
if no_csv_found:
    missing_files_message = "Couldn't find any csv files! Please make sure the filepath exists"
    raise FileNotFoundError(missing_files_message)

In [7]:
# Read the first raw CSV file (semicolon-separated) in full
first_csv_path = csv_files[0]
df = pd.read_csv(first_csv_path, sep=';')

# list the columns with their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298809 entries, 0 to 298808
Data columns (total 32 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    298809 non-null  int64  
 1   device_id             298809 non-null  int64  
 2   timestamp             298809 non-null  object 
 3   battery_state         298809 non-null  object 
 4   battery_level         298809 non-null  float64
 5   timezone              298809 non-null  object 
 6   country_code          298809 non-null  object 
 7   charger               298809 non-null  object 
 8   health                298809 non-null  object 
 9   voltage               298809 non-null  float64
 10  temperature           298809 non-null  float64
 11  usage                 298809 non-null  float64
 12  up_time               298809 non-null  int64  
 13  sleep_time            298809 non-null  int64  
 14  network_status        298809 non-null  object 
 15  

In [8]:
# Summary statistics restricted to the numeric columns
numeric_summary = df.describe(include=[np.number])
numeric_summary

Unnamed: 0,id,device_id,battery_level,voltage,temperature,usage,up_time,sleep_time,wifi_signal_strength,wifi_link_speed,...,roaming_enabled,bluetooth_enabled,location_enabled,power_saver_enabled,nfc_enabled,developer_mode,free,total,free_system,total_system
count,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,...,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0,298809.0
mean,150133.189358,982.856246,57.420627,10.06723,30.802061,0.480892,413392.5,223983.3,-103.654354,49.664783,...,0.025073,0.190961,0.487392,0.064081,0.155973,0.096567,9739.732598,22384.894243,295.728971,2696.538625
std,86627.615952,587.635906,25.781482,157.580041,5.004957,0.839212,687177.4,419629.5,375.505466,89.545422,...,0.156347,0.393059,0.499842,0.244898,0.36283,0.295367,13858.079778,18468.51531,308.322161,1057.496174
min,1.0,1.0,1.0,0.0,-5.8,-107.0,25.0,0.0,-9999.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,51.0,372.0,0.0,218.0
25%,75153.0,466.0,37.0,3.84,27.9,0.26,57317.0,22104.0,-127.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1410.0,10766.0,95.0,1891.0
50%,150212.0,970.0,59.0,4.02,31.0,0.45,187763.0,79962.0,-74.0,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4199.0,12414.0,182.0,2496.0
75%,225200.0,1467.0,79.0,4.18,34.0,0.69,453023.0,223134.0,-58.0,65.0,...,0.0,0.0,1.0,0.0,0.0,0.0,12554.0,25889.0,402.0,3610.0
max,300000.0,2224.0,100.0,4404.58,54.0,339.43,12107600.0,9440295.0,0.0,2552.0,...,1.0,1.0,1.0,1.0,1.0,1.0,215812.0,235216.0,2754.0,7132.0


In [9]:
# Summary (count / unique / top / freq) of every non-numeric column
non_numeric_summary = df.describe(exclude=[np.number])
non_numeric_summary

Unnamed: 0,timestamp,battery_state,timezone,country_code,charger,health,network_status,mobile_data_status,mobile_data_activity,wifi_status
count,298809,298809,298809,298809,298809,298809,298809,298809,298809,298809
unique,192899,4,49,35,3,4,18,4,4,5
top,2017-10-12 20:57:22,Charging,Europe/Lisbon,pt,ac,Good,WIFI,disconnected,none,enabled
freq,11,169760,136958,249562,156367,298615,174464,238171,233461,215603


## 1.3 Basic data pre-processing

In [10]:
# NOTE(review): 5000000 and 100 are unexplained magic numbers -- presumably this
# scales the measured footprint of `df` up to a larger expected data volume.
# Confirm the intent and the unit returned by `memory_usage` (from utils).
memory_usage(df) * 5000000 / 100

11803000.0

In [11]:
# Per device, count the rows whose `usage` lies outside the [0, 100] range
usage_out_of_range = df.query('usage < 0 | usage > 100')
usage_out_of_range['device_id'].value_counts()

1506    200
2169    129
562     115
808     101
1796     91
       ... 
1744      1
967       1
1050      1
1737      1
700       1
Name: device_id, Length: 135, dtype: int64

In [22]:
# Print arrays in full (no "..." summarisation) with floats in fixed-point form.
# This changes numpy's global print state for the rest of the session.
np.set_printoptions(
    formatter={'float_kind': '{:f}'.format},
    threshold=np.inf,
)

In [23]:
# Distinct out-of-range `usage` values (most frequent first), shown as a 1 x N array
invalid_usage_counts = df.query('usage < 0 | usage > 100')['usage'].value_counts()
np.array([invalid_usage_counts.index])

array([[-0.010000, -0.020000, -0.030000, -0.040000, -0.050000, -0.060000,
        -0.070000, -0.120000, -0.100000, -0.090000, -0.160000, -0.210000,
        -0.080000, -0.110000, -0.150000, -0.140000, -0.130000, -0.170000,
        -0.260000, -0.180000, -0.310000, -0.340000, -0.220000, -0.190000,
        -0.370000, -0.230000, -0.250000, -0.280000, -0.760000, -0.300000,
        -0.410000, -0.380000, -0.530000, -0.810000, -0.600000, -0.490000,
        -0.290000, -0.200000, -0.240000, -0.430000, -0.670000, -0.450000,
        -0.510000, -0.690000, -0.740000, -1.940000, -2.030000,
        -12.710000, -0.700000, -0.360000, -8.000000, -0.540000,
        -1.570000, -0.500000, -0.470000, -0.440000, -0.650000, -1.440000,
        -0.680000, -1.270000, -0.940000, -0.990000, -1.310000,
        -15.930000, -3.470000, -1.970000, -3.440000, -1.300000,
        -2.650000, -1.370000, -3.230000, -0.610000, -107.000000,
        -2.000000, -7.500000, -1.250000, -62.000000, -35.000000,
        -1.500000, -11.5