# Introduction to Data Engineering

## 2 - Data engineering toolbox

#### Database schema


In [2]:
# Complete the SELECT statement
data = pd.read_sql("""
SELECT first_name, last_name FROM "Customer"
ORDER BY last_name, first_name
""", db_engine)

# Show the first 3 rows of the DataFrame
print(data.head(3))

# Show the info of the DataFrame
print(data.info())

NameError: name 'pd' is not defined

#### Joining on relations

In [4]:
# Complete the SELECT statement
data = pd.read_sql("""
SELECT * FROM "Customer"
INNER JOIN "Order"
ON "Order"."customer_id"="Customer"."id"
""", db_engine)

# Show the id column of data
print(data.id)

NameError: name 'pd' is not defined

#### From task to subtasks

In [5]:
# Function to apply a function over multiple cores
@print_timing
def parallel_apply(apply_func, groups, nb_cores):
    with Pool(nb_cores) as p:
        results = p.map(apply_func, groups)
    return pd.concat(results)

# Parallel apply using 1 core
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 1)

# Parallel apply using 2 cores
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 2)

# Parallel apply using 4 cores
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 4)

NameError: name 'print_timing' is not defined

#### Using a DataFrame

In [6]:
import dask.dataframe as dd

# Set the number of pratitions
athlete_events_dask = dd.from_pandas(athlete_events, npartitions = 4)

# Calculate the mean Age per Year
print(athlete_events_dask.groupby('Year').Age.mean().compute())

NameError: name 'athlete_events' is not defined

#### A PySpark groupby

In [7]:
# Print the type of athlete_events_spark
print(type(athlete_events_spark))

# Print the schema of athlete_events_spark
print(athlete_events_spark.printSchema())

# Group by the Year, and find the mean Age
print(athlete_events_spark.groupBy('Year').mean('Age'))

# Group by the Year, and find the mean Age
print(athlete_events_spark.groupBy('Year').mean('Age').show())

NameError: name 'athlete_events_spark' is not defined

#### Running PySpark files

In [None]:
repl:~$ cat /home/repl/spark-script.py
from pyspark.sql import SparkSession


if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()
    athlete_events_spark = (spark
        .read
        .csv("/home/repl/datasets/athlete_events.csv",
             header=True,
             inferSchema=True,
             escape='"'))

    athlete_events_spark = (athlete_events_spark
        .withColumn("Height",
                    athlete_events_spark.Height.cast("integer")))

    print(athlete_events_spark
        .groupBy('Year')
        .mean('Height')
        .orderBy('Year')
        .show())

#### Parallel computing



In [8]:
from multiprocessing import Pool
import pandas as pd

def take_mean_age(year_and_group):
    year, group = year_and_group
    return pd.DataFrame({"Age": group["Age"].mean()}, index=[year])
    
# defining 4 as an argument to Pool, the mapping runs in 4 separate processes
# thus uses 4 cores
with Pool(4) as p:
    results = p.map(take_mean_age, athlete_events.groupby("Year"))
    
result_df = pd.concat(results)

NameError: name 'athlete_events' is not defined

#### Parallel computing using Dask framework

In [9]:
import dask.dataframe as dd

# partition dataframe into 4
athlete_events_dask = dd.from_pandas(athlete_events, npartitions = 4)

# run parallel computation on each partition
# dask uses lazy evaluation, you need to add compute() at the end
result_df = athlete_events_dask.groupby('Year').Age.mean().compute()

NameError: name 'athlete_events' is not defined

##### example 1

In [11]:
# Function to apply a funciton over multiple cores
def take_mean_age(year_and_group):
    year, group = year_and_group
    return pd.DataFrame({"Age": group["Age"].mean()}, index=[year])

@print_timing
def parallel_apply(apply_func, groups, nb_cores):
    with Pool(nb_cores) as p:
        results = p.map(apply_func, groups)
    return pd.concat(results)

#parallel apply using 1 core
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 1)

#parallel apply using 2 cores
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 2)

#parallel apply using 4 cores
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 4)

NameError: name 'print_timing' is not defined

##### example 2 (Dask)

In [13]:
import dask.dataframe as dd

#set the number of partitions
athlete_events_dask = dd.from_pandas(athlete_events, npartitions = 4)

#calculate the mean age per year
print(athlete_events_dask.groupby('Year').Age.mean().compute()

SyntaxError: unexpected EOF while parsing (<ipython-input-13-6b2faa29dd8a>, line 7)

#### Airflow DAGs

In [14]:
# Create the DAG object
dag = DAG(dag_id="car_factory_simulation",
          default_args={"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)},
          schedule_interval="0 * * * *")
          
# Task definitions
assemble_frame = BashOperator(task_id="assemble_frame", 
                              bash_command='echo "Assembling frame"', dag=dag)
place_tires = BashOperator(task_id="place_tires", 
                           bash_command='echo "Placing tires"', dag=dag)
assemble_body = BashOperator(task_id="assemble_body", 
                             bash_command='echo "Assembling body"', dag=dag)
apply_paint = BashOperator(task_id="apply_paint", 
                           bash_command='echo "Applying paint"', dag=dag)

# Complete the downstream flow
assemble_frame.set_downstream(place_tires)
assemble_frame.set_downstream(assemble_body)
assemble_body.set_downstream(apply_paint)

NameError: name 'DAG' is not defined

## 3 - Extract, Transform and Load (ETL)

#### Extract

#### - data on the web through APIs


In [17]:
import requests

reponse = requests.get("https://hacker-news.firebaseio.com/v0/item/16222426.json")
print(reponse.json())

{'by': 'neis', 'descendants': 0, 'id': 16222426, 'score': 17, 'time': 1516800333, 'title': 'Duolingo-Style Learning for Data Science: DataCamp for Mobile', 'type': 'story', 'url': 'https://medium.com/datacamp/duolingo-style-learning-for-data-science-datacamp-for-mobile-3861d1bc02df'}


#### - data in database

In [28]:
# connection string/URI
# postgresql://[user][:password]@[host][:port]

import sqlalchemy

# Connect to the database using the connection URI
connection_uri = "postgresql://repl:password@localhost:5432/pagila" 
db_engine = sqlalchemy.create_engine(connection_uri)

##### example 1 - Fetch from an API

In [25]:
import requests

#Fetch the Hackernews postgresql
resp = requests.get("https://hacker-news.firebaseio.com/v0/item/16222426.json")

# Print the response parsed as JSON
print(resp.json())

# Assign the score of the test to post_score
post_score = resp.json()["score"]
print(post_score)

{'by': 'neis', 'descendants': 0, 'id': 16222426, 'score': 17, 'time': 1516800333, 'title': 'Duolingo-Style Learning for Data Science: DataCamp for Mobile', 'type': 'story', 'url': 'https://medium.com/datacamp/duolingo-style-learning-for-data-science-datacamp-for-mobile-3861d1bc02df'}
17


##### example 2 - Read from a database

In [None]:
import pandas as pd

def extract_table_to_pandas(tablename, db_engine):
    query = "SELECT * FROM {}".format(tablename)
    return pd.read_sql(query, db_engine)
    
# Connect to the database using the connection URI
connection_uri = "postgresql://repl:password@localhost:5432/pagila" 
db_engine = sqlalchemy.create_engine(connection_uri)

# Extract the film table into a pandas DataFrame
extract_table_to_pandas("film", db_engine)