# 1. Devices pre-processing

This task will fetch the raw data from the remote database given a SQL query file.

## 1.1 Import libraries

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
DATA_DIR = '../../data'

## 1.2 Create database connection
Create a database connection using the driver `mysqlclient`

### 1.2.1 Read the SQL query file

## 1.3 Fetch the remote data

In [124]:
df = pd.DataFrame(np.random.randint(0,10,size=(10, 1)), columns=list('A'))
df2 = pd.DataFrame(np.random.randint(10,20,size=(10, 1)), columns=list('A'))

df = df.applymap(lambda x: str(x) + 'A')
df2 = df2.applymap(lambda x: str(x) + 'B')

df = df.astype('category')
df2 = df2.astype('category')

pd.concat([df, df2], axis=1)

Unnamed: 0,A,A.1
0,9A,14B
1,6A,10B
2,2A,19B
3,5A,15B
4,6A,12B
5,3A,17B
6,9A,14B
7,5A,16B
8,6A,16B
9,6A,13B


In [126]:
values = np.concatenate([df['A'].unique(), df2['A'].unique()])
codes, uniques = pd.factorize(values)

In [127]:
codes, uniques

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
 array(['9A', '6A', '2A', '5A', '3A', '14B', '10B', '19B', '15B', '12B',
        '17B', '16B', '13B'], dtype=object))

In [128]:
type(uniques)

numpy.ndarray

In [134]:
df['A'] = pd.Categorical(df['A'])
df2['A'] = pd.Categorical(df2['A'])

In [140]:
pd.concat([df, df2], axis=0).memory_usage(deep=True) / 1024 ** 2

Index    0.000153
A        0.001369
dtype: float64

## 1.4 Basic data pre-processing
Benchmarking string operations

In [9]:
string_columns = df.select_dtypes(include='object').columns.to_list()

In [10]:
for column in string_columns:
    df[column] = [x.lower().strip() for x in df[column].tolist()]

In [11]:
mappings = {'id':'uint32', 'model':'category', 'manufacturer':'category',
            'brand':'category', 'os_version':'category', 'is_root': 'uint8'}

In [12]:
df = df.astype(mappings)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306469 entries, 0 to 306468
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   id            306469 non-null  uint32  
 1   model         306469 non-null  category
 2   manufacturer  306469 non-null  category
 3   brand         306469 non-null  category
 4   os_version    306469 non-null  category
 5   is_root       306469 non-null  uint8   
dtypes: category(4), uint32(1), uint8(1)
memory usage: 4.4 MB


## 1.5 Save output
Storage data in parquet format with brotli compression.

In [13]:
df.to_parquet(os.path.join(STORAGE_DIR, '{}.parquet'.format(model)), compression='brotli')