# 1. Data fetching

This notebook fetches the raw data from the remote database by running the query stored in a SQL file.

## 1.1 Install dependencies

In [1]:
!pip install -r requirements.txt



## 1.2 Import libraries

In [2]:
# Load environment variables with python-dotenv (presumably from a local .env file
# holding the DB_* settings read later via os.getenv — verify its location).
# The call is the cell's last expression, so its return value is shown as the output.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os
from urllib.parse import quote

import MySQLdb
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

## 1.3 Create database connection
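Create a SQLAlchemy engine for the database using the `mysqlclient` driver. The connection settings are read from the `DB_USERNAME`, `DB_PASSWORD`, `DB_HOST` and `DB_DATABASE` environment variables.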

In [4]:
# Name of the dataset being fetched; it is also used to name the output file.
model = 'devices'

# Read the connection settings from the environment and fail fast when one is
# missing: os.getenv() returns None for an unset variable, and formatting None
# into the URL would silently insert the literal text "None".
db_settings = {
    name: os.getenv(name)
    for name in ('DB_USERNAME', 'DB_PASSWORD', 'DB_HOST', 'DB_DATABASE')
}
missing = [name for name, value in db_settings.items() if value is None]
if missing:
    raise RuntimeError(
        'Missing environment variables: {}'.format(', '.join(missing))
    )

# Percent-encode the username and password so that characters such as '@', ':'
# or '/' cannot be mistaken for URL delimiters. The host is left untouched
# because it may legitimately carry a ':port' suffix.
connection_string = 'mysql://{}:{}@{}/{}'.format(
    quote(db_settings['DB_USERNAME'], safe=''),
    quote(db_settings['DB_PASSWORD'], safe=''),
    db_settings['DB_HOST'],
    db_settings['DB_DATABASE'],
)

# echo=True makes SQLAlchemy log every statement it emits to the cell output.
engine = create_engine(connection_string, echo=True)

### 1.3.1 Read the SQL query file

In [5]:
# Read the whole query text. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open('../data/sql/devices.sql') as sql_file:
    sql_query = sql_file.read()

## 1.4 Fetch the remote data

In [6]:
# Run the query through the engine and load the full result set into a DataFrame.
df = pd.read_sql(sql=sql_query, con=engine)

2020-07-30 19:17:28,109 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'sql_mode'
2020-07-30 19:17:28,111 INFO sqlalchemy.engine.base.Engine ()
2020-07-30 19:17:28,135 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2020-07-30 19:17:28,135 INFO sqlalchemy.engine.base.Engine ()
2020-07-30 19:17:28,178 INFO sqlalchemy.engine.base.Engine SELECT DATABASE()
2020-07-30 19:17:28,179 INFO sqlalchemy.engine.base.Engine ()
2020-07-30 19:17:28,218 INFO sqlalchemy.engine.base.Engine show collation where `Charset` = 'utf8mb4' and `Collation` = 'utf8mb4_bin'
2020-07-30 19:17:28,219 INFO sqlalchemy.engine.base.Engine ()
2020-07-30 19:17:28,241 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS CHAR(60)) AS anon_1
2020-07-30 19:17:28,242 INFO sqlalchemy.engine.base.Engine ()
2020-07-30 19:17:28,262 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS CHAR(60)) AS anon_1
2020-07-30 19:17:28,263 INFO sqlalchemy.engine.base.E

In [7]:
# Summarise the fetched frame: row count, per-column dtype and non-null count, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306469 entries, 0 to 306468
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            306469 non-null  int64 
 1   model         306469 non-null  object
 2   manufacturer  306469 non-null  object
 3   brand         306469 non-null  object
 4   os_version    306469 non-null  object
 5   is_root       306469 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 14.0+ MB


## 1.5 Basic data pre-processing

In [8]:
# Target dtype for each column: narrow unsigned integers for the two integer
# columns and `category` for the four object-dtype columns.
mappings = dict(
    id='uint32',
    model='category',
    manufacturer='category',
    brand='category',
    os_version='category',
    is_root='uint8',
)

In [9]:
# Cast every column to its target dtype, then inspect the frame again to
# confirm the new dtypes and the memory usage after the cast.
df = df.astype(dtype=mappings)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306469 entries, 0 to 306468
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   id            306469 non-null  uint32  
 1   model         306469 non-null  category
 2   manufacturer  306469 non-null  category
 3   brand         306469 non-null  category
 4   os_version    306469 non-null  category
 5   is_root       306469 non-null  uint8   
dtypes: category(4), uint32(1), uint8(1)
memory usage: 4.5 MB


## 1.6 Save output

In [10]:
# Write the typed frame to ../data/<model>.parquet using brotli compression.
df.to_parquet(
    '../data/{}.parquet'.format(model),
    compression='brotli',
)

In [11]:
# Persist `df` with IPython's %store magic so that another notebook or kernel
# can restore it with `%store -r df`.
%store df

Stored 'df' (DataFrame)
