# 1. Devices pre-processing

This task fetches the raw devices data, cleans it, stores page metadata in MongoDB and writes the processed data to Parquet files.

## 1.1 Import libraries

In [None]:
import sys

sys.path.append('..')

In [1]:
import os
import yaml
import numpy as np
import pandas as pd

from pymongo import MongoClient

from utils import get_csv_files, memory_usage, \
    correct_encoding, update_metadata

In [2]:
def save_df_page(page, chunks, collection, exclude=('id',)):
    """Write the buffered chunks as one parquet page and register its metadata.

    Relies on the module-level names ``model`` and ``model_storage`` for the
    output location, so it must be called after they are defined.

    Parameters
    ----------
    page : int
        Index of the page being written; used in the output file name.
    chunks : list of pandas.DataFrame
        Frames to concatenate into the page.
    collection
        Handle passed straight to ``update_metadata`` together with the
        concatenated frame.
    exclude : sequence of str, optional
        Currently unused; kept for backward compatibility. The default is an
        immutable tuple so it can never be shared and mutated across calls.

    Returns
    -------
    int
        ``page + 1`` when a page was written, or ``page`` unchanged when
        ``chunks`` is empty (nothing is written in that case).
    """
    # pd.concat raises ValueError on an empty list, so skip empty pages
    if not chunks:
        return page

    # concatenate data chunks -> careful benchmark this
    df = pd.concat(chunks, axis=0)

    # store page metadata in mongo
    update_metadata(collection, df)

    # save output to a parquet file with brotli compression
    filename = '{}.{}.parquet'.format(model, page)
    df.to_parquet(os.path.join(model_storage, filename), compression='brotli')

    return page + 1

## 1.2 Load the data

In [3]:
from urllib.parse import quote_plus

DATA_DIR = '../../data'
STORAGE_DIR = '../../storage'

# read the project configuration; the context manager closes the file handle
# NOTE(review): FullLoader is fine for a trusted local config file, but
# yaml.safe_load would be the safer choice if the file could ever be external
with open('../config.yml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

model = 'devices'
model_storage = os.path.join(STORAGE_DIR, model)

# exist_ok avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(model_storage, exist_ok=True)

# credentials must be percent-escaped before being embedded in a MongoDB URI,
# otherwise characters such as '@', ':' or '/' break the connection string
client = MongoClient('mongodb://{}:{}@{}:{}'.format(
    quote_plus(str(config['MONGO_USERNAME'])),
    quote_plus(str(config['MONGO_PASSWORD'])),
    config['MONGO_HOST'], config['MONGO_PORT']))
metadata_db = client[config['MONGO_DATABASE']]

model_metadata = metadata_db[model]

# reset the collection for this model: remove every document and insert a
# single one holding an empty list of pages
model_metadata.delete_many({})
model_metadata.insert_one({'pages': []})

<pymongo.results.InsertOneResult at 0x7fa6faac7f40>

In [4]:
# glob pattern matching every raw csv export of the current model
raw_csv_pattern = os.path.join(DATA_DIR, 'raw', model, '*.csv')
csv_files = get_csv_files(raw_csv_pattern)

In [5]:
# fail fast: the rest of the notebook needs at least one raw csv file
if not csv_files:
    missing_files_msg = (
        "Couldn't find any csv files! Please make sure the filepath exists")
    raise FileNotFoundError(missing_files_msg)

In [6]:
# read only the first 100 rows of the first file: enough to list the
# columns and their inferred dtypes without loading the whole file
sample_path = csv_files[0]
df = pd.read_csv(sample_path, sep=';', nrows=100)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            100 non-null    int64 
 1   model         100 non-null    object
 2   manufacturer  100 non-null    object
 3   brand         100 non-null    object
 4   os_version    100 non-null    object
 5   is_root       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [7]:
# summary statistics restricted to the numeric columns of the sample
numeric_summary = df.describe(include=[np.number])
numeric_summary

Unnamed: 0,id,is_root
count,100.0,100.0
mean,50.5,0.06
std,29.011492,0.238683
min,1.0,0.0
25%,25.75,0.0
50%,50.5,0.0
75%,75.25,0.0
max,100.0,1.0


In [8]:
# summary statistics for every non-numeric column of the sample
non_numeric_summary = df.describe(exclude=[np.number])
non_numeric_summary

Unnamed: 0,model,manufacturer,brand,os_version
count,100,100,100,100.0
unique,72,22,27,11.0
top,ALE-L21,samsung,samsung,7.0
freq,8,28,28,34.0


## 1.3 Basic data pre-processing

In [9]:
# target dtype for each column, applied with DataFrame.astype once the
# string clean-up is done
mappings = dict(
    id='uint32',
    model='category',
    manufacturer='category',
    brand='category',
    os_version='category',
    is_root='bool',
)

In [10]:
# running totals over every processed csv file
total_rows, total_memory_usage = 0, 0.0

# accumulator compared against config['MEMORY_USAGE_SPLIT'] in the
# processing loop; it is reset each time a page is written
memory_usage_split = 0.0

# index of the next page to write and the frames buffered for it
page, chunks = 0, []

In [11]:
# Clean every raw csv file, buffer the cleaned frames and write a page each
# time the buffered frames reach config['MEMORY_USAGE_SPLIT'].
for filepath in csv_files:
    df = pd.read_csv(filepath, sep=';')

    # drop missing values
    df = df.dropna()

    string_columns = df.select_dtypes(include='object').columns.to_list()

    # basic string pre-processing: convert to lowercase and strip blank chars
    for column in string_columns:
        df.loc[:, column] = df[column].str.lower().str.strip()

    # slice the minor os_version number e.g. 8.0.1 -> 8.0
    df.loc[:, 'os_version'] = df['os_version'].str.slice(0, 3)

    df = df.astype(mappings)

    # memory footprint of this frame only, as reported by utils.memory_usage
    # NOTE(review): assumes the unit matches config['MEMORY_USAGE_SPLIT'] —
    # confirm against utils
    frame_memory = memory_usage(df)

    total_rows += df.shape[0]
    total_memory_usage += frame_memory
    # accumulate the current frame, not the running total, so the threshold
    # reflects what is actually buffered for the current page
    memory_usage_split += frame_memory

    # always buffer the frame first: the frame that crosses the threshold
    # belongs to the page being written and must not be dropped
    chunks.append(df)

    if memory_usage_split >= config['MEMORY_USAGE_SPLIT']:
        page = save_df_page(page, chunks, model_metadata)

        memory_usage_split = 0.0
        chunks = []

In [12]:
# write whatever is still buffered after the loop as the last page
if chunks:
    save_df_page(page=page, chunks=chunks, collection=model_metadata)

In [13]:
# final tally: processed row count and the summed memory_usage() values
(total_rows, total_memory_usage)

(306794, 8.17)