In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Task of data engineer
Set up scheduled ingestion of data from the application databases to an analytical database

Data Engineer:
    - Cloud technology
    - Develop Scalable Data Architecture
    - Streamline Data Acquisition
    - Set up processes to bring together data
    - Clear Corrupted Data

Data Scientist: 
    - Mining data for patterns
    - Apply statistical models on large datasets
    - Build predictive models using ML
    - Develop tools to monitor business process
    - Clean outliers in data

Data Engineer Problem:

###Data scientists are querying the online store databases directly and slowing down the functioning of the application since it's using the same database. ###
The data engineer should make sure there is a separate database for analytics.

###The online store is slow because the application's database server doesn't have enough memory.### infrastructure problem

## Tools of data engineer
#Processing example
##cluster of computers perform these operations using PySpark framework

# Read the users Parquet file through the `spark` object.
# NOTE(review): `spark` is not defined in this snippet; presumably an active
# SparkSession provided by the environment — confirm.
df = spark.read.parquet("users.parquet") 

# Keep only the rows whose age is greater than 100.
outliers = df.filter(df["age"] > 100)

# Print how many such rows there are.
print(outliers.count())

## Processing tasks
- Data engineers often have to join, clean, or organize data before loading it into a destination analytics database. This is done in the data processing, or data transformation step.
- Data processing is distributed over clusters of virtual machines (e.g. using Spark)


# Scheduling tools

Make sure jobs run in a specific order and all dependencies are resolved correctly.

Make sure the jobs run at midnight UTC each day.

###Scale up the number of nodes when there's lots of data to be processed.### Jobs of processing tools

Why Cloud computing ?

The cloud can provide you with the resources you need, when you need them.

## Chapter 2: Data engineering toolbox

Databases: A usually large collection of data organized especially for rapid search and retrieval

- Holds data
- Organise data
- Retrieve/ Search Data
Database management system (DBMS):
- much more organised than file systems
- functions: search, replication
Structured vs unstructured data

SQL database schema:
    
## Create Customer table
    -- Customer table: one row per customer, keyed by an auto-incrementing id.
    -- The surname column is named "last_name" so that it matches the
    -- SELECT first_name, last_name query run against this table later in the notes.
    CREATE TABLE "Customer"(
    "id" SERIAL NOT NULL,
    "first_name" varchar,
    "last_name" varchar,
    PRIMARY KEY ("id")
    );
    
## Create order table
    -- Order table: each order points at the customer who placed it.
    CREATE TABLE "Order"(
    "id" SERIAL NOT NULL,
    -- Foreign key to "Customer"."id". A quoted identifier must match the table
    -- name exactly: a stray space inside the quotes would name a different table.
    "customer_id" integer REFERENCES "Customer" ("id"),
    "product_name" varchar,
    "product_price" integer,
    PRIMARY KEY ("id")
    );

-- Pair every customer with each of their orders.
SELECT *
FROM "Customer"
INNER JOIN "Order"
    ON "Order"."customer_id" = "Customer"."id";

SQL:
    - Customer data in a store's database
    - Always has a database schema
    - MySQL, PostgreSQL
    
NoSQL:
    - can be schemaless
    - Key-value stores: Redis, e.g. as a caching layer in a distributed web server
    - MongoDB

## The database schema

# db_engine: database engine, which has been defined for you and is called db_engine
# db_engine = Engine(postgresql://repl@/postgres)

# Select the customers' names, sorted by last name and then by first name.

data = pd.read_sql("""
SELECT first_name, last_name FROM "Customer"
ORDER BY last_name, first_name
""", db_engine)

# Show the first 3 rows of the DataFrame

print(data.head(n=3))

# Show the info of the DataFrame.
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.

data.info()

# Join every customer row with its matching orders and load the result
# into a DataFrame through db_engine.

data = pd.read_sql("""

SELECT * FROM "Customer"

INNER JOIN "Order"

ON "Order"."customer_id"="Customer"."id"

""", db_engine)

# Show the id column of data.
# NOTE(review): both "Customer" and "Order" define an "id" column and the query
# selects *, so the result presumably carries two columns named id — confirm
# what data.id returns in that case.
print(data.id)

In [1]:
## Parallel computing
Parallel computing can optimize the use of multiple processing units.
Parallel computing can optimize the use of memory between several machines.


In [None]:
# multiprocessing.Pool: distributes a function over a pool of worker processes.
# (The class is named Pool; the module has no attribute called Tool.)
from multiprocessing import Pool
# Per-group worker: mean of the "Age" column for one (key, group) pair.
def take_mean_age(year_and_group):
    """Return a one-row DataFrame with the mean ``Age`` of a single group.

    Parameters
    ----------
    year_and_group : tuple
        A two-item ``(key, group)`` pair, as produced when iterating over a
        pandas ``groupby`` object. ``group`` must expose an ``"Age"`` column.

    Returns
    -------
    pandas.DataFrame
        A frame with one column, ``"Age"``, holding the group's mean, and a
        single index entry equal to ``key``.
    """
    label, frame = year_and_group
    mean_age = frame["Age"].mean()
    return pd.DataFrame({"Age": mean_age}, index=[label])

# Apply take_mean_age to each "Year" group of athlete_events, spreading the
# calls over a pool created with 4 worker processes. The pool is closed when
# the with-block exits.
with Pool(4) as p:  # 4 worker processes
    results = p.map(take_mean_age, athlete_events.groupby("Year"))
    
# Concatenate the per-group results to form the resulting DataFrame.
result_df = pd.concat(results)

In [None]:
# Dask framework: avoids having to write the low-level multiprocessing code by hand.
import dask.dataframe as dd

# Split the pandas DataFrame into 4 partitions.
athlete_events_dask = dd.from_pandas(athlete_events, npartitions = 4)

# Mean Age per Year, computed from the partitions. Dask uses lazy evaluation,
# so .compute() is what actually runs the computation and returns the result.
result_df = athlete_events_dask.groupby("Year").Age.mean().compute()

In [None]:
## From task to subtasks
# Generic helper: run one function over many groups on a pool of processes.
@print_timing
def parallel_apply(apply_func, groups, nb_cores):
    """Apply ``apply_func`` to every item of ``groups`` using a process pool.

    Parameters
    ----------
    apply_func : callable
        Function applied to each item of ``groups``; each call must return
        something ``pd.concat`` accepts (e.g. a DataFrame).
    groups : iterable
        The grouping to iterate over, e.g. a pandas ``groupby`` object.
    nb_cores : int
        Number of worker processes handed to ``Pool``.

    Returns
    -------
    The concatenation (``pd.concat``) of all per-item results.
    """
    with Pool(nb_cores) as worker_pool:
        partial_frames = worker_pool.map(apply_func, groups)
    return pd.concat(partial_frames)

# Run the same per-year mean-age computation with 1, 2 and 4 cores.
# NOTE(review): parallel_apply is wrapped in @print_timing, which presumably
# reports the elapsed time of each call so the three runs can be compared —
# confirm against the decorator's definition.

# Parallel apply using 1 core
parallel_apply(take_mean_age, athlete_events.groupby('Year'), nb_cores = 1)

# Parallel apply using 2 cores
parallel_apply(take_mean_age, athlete_events.groupby('Year'), nb_cores = 2)

# Parallel apply using 4 cores
parallel_apply(take_mean_age, athlete_events.groupby('Year'), nb_cores = 4)

In [4]:
import pandas as pd
# Load the athletes data into a DataFrame.
# NOTE(review): the file carries an .xls extension but is parsed with read_csv;
# the recorded output below suggests it is comma-separated text — confirm.
athlete_events = pd.read_csv("athletes.xls")
# Preview the first rows.
athlete_events.head()

Unnamed: 0,id,name,gender,age,height,weight
0,51,Nstor Abad Sanjun,M,23,167.0,64.0
1,55,Antonio Abadia Beci,M,26,170.0,65.0
2,110,Abubakar Abbas Abbas,M,20,175.0,66.0
3,126,Forough Abbasi,F,20,164.0,58.0
4,251,Bashir Abdi,M,27,176.0,56.0


In [None]:
## Using a dataframe 
import dask.dataframe as dd

# Set the number of partitions
athlete_events_dask = dd.from_pandas(athlete_events, npartitions = 4)

# Calculate the mean Age per Year.
# NOTE(review): the athlete_events preview recorded earlier in this notebook
# lists the columns id, name, gender, age, height, weight — no "Year" and only a
# lowercase "age" — so this groupby presumably expects a different dataset; confirm.
print(athlete_events_dask.groupby('Year').Age.mean().compute())

In [None]:
## Parallel computation framework
# Hive Example
-- Average athlete age for each year.
SELECT
    year,
    AVG(age)
FROM views.athlete_events
GROUP BY year