# 01 Data Extraction using Python

# 0. Prerequisites

In [None]:
!pip install pymysql

In [None]:
!pip install pandas

In [None]:
!pip install sqlalchemy

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# 1. Connecting to MySQL using SQLAlchemy

**SQLAlchemy**

> SQLAlchemy is a powerful SQL toolkit and Object-Relational Mapping (ORM) library for Python. An SQLAlchemy Engine is a central object that manages database connections. It's essentially the starting point for any SQLAlchemy application.

**PyMySQL**

> PyMySQL is a MySQL client library for Python. It essentially acts as the driver for MySQL.

It is entirely possible to use PyMySQL on its own, without SQLAlchemy. However, following common industry practice, we recommend SQLAlchemy because it provides an abstraction layer that handles the connectivity details.


In [None]:
# Define all the credentials
from urllib.parse import quote

# Fill in the connection details for your MySQL server.
USERNAME = ''
PASSWORD = ''
HOST = ''
PORT = 0
DATABASE = ''

# Percent-encode the username and password before placing them in the URL.
# Characters such as '@', ':', '/' and '%' are URL delimiters, so an
# unescaped password containing them would be split at the wrong place.
# `safe=''` makes sure '/' is escaped as well.
conn_string = (
    f'mysql+pymysql://{quote(USERNAME, safe="")}:{quote(PASSWORD, safe="")}'
    f'@{HOST}:{PORT}/{DATABASE}'
)

# NOTE: displaying the string shows the password in the cell output;
# clear the output before sharing this notebook.
conn_string

In [None]:
# Create the SQLAlchemy Engine from the connection string defined earlier.
engine = create_engine(conn_string)
# Display the engine object as the cell output.
engine

In [None]:
# Test connection to server, will raise error if not successful.
# The connection is opened inside a `with` block so that it is closed and
# handed back to the engine as soon as the check finishes, instead of being
# left open for the rest of the session.
with engine.connect():
    print('Connection successful')

# 2. Running SQL queries

## 2.1 Using SQLAlchemy directly

In [None]:
from sqlalchemy import text

In [None]:
# Count the rows in the `listings` table.
query = """
SELECT COUNT(*)
FROM listings
"""

# Execute the statement and fetch every row while the connection is open;
# the `with` block closes the connection afterwards.
with engine.connect() as conn:
    results = conn.execute(text(query)).fetchall()

results

In [None]:
# Select three columns from `listings`, limited to five rows.
query = """
SELECT 
    listing_id,
    name,
    host_id
FROM listings
LIMIT 5
"""

# Execute the statement and fetch every row while the connection is open;
# the `with` block closes the connection afterwards.
with engine.connect() as conn:
    results = conn.execute(text(query)).fetchall()

results

In [None]:
# Select all columns for five rows of `listings`, ordered by RAND().
query = """
SELECT *
FROM listings
ORDER BY RAND()
LIMIT 5
"""

# Execute the statement and fetch every row while the connection is open;
# the `with` block closes the connection afterwards.
with engine.connect() as conn:
    results = conn.execute(text(query)).fetchall()

results

## 2.2 Using Pandas

Reference: https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html

In [None]:
# Get number of rows in the `listings` table.
query = """
SELECT COUNT(*)
FROM listings
"""

# Let pandas run the query on the engine and return the result as a DataFrame.
results = pd.read_sql(sql=query, con=engine)
results

In [None]:
# Select three columns from `listings`, limited to five rows.
query = """
SELECT 
    listing_id,
    name,
    host_id
FROM listings
LIMIT 5
"""

# Let pandas run the query on the engine and return the result as a DataFrame.
results = pd.read_sql(sql=query, con=engine)
results

In [None]:
# Select all columns for five rows of `listings`, ordered by RAND().
query = """
SELECT *
FROM listings
ORDER BY RAND()
LIMIT 5
"""

# Let pandas run the query on the engine and return the result as a DataFrame.
results = pd.read_sql(sql=query, con=engine)
results

# Activity

- Task 1: Find listings (IDs and names) that have an acceptance rate of 0.5 or above
- Task 2: Find listings that have over 500 reviews
- Task 3: Find listings that have an acceptance rate of 0.5 or above AND have over 500 reviews

You can use Python or SQL to get the results, but show the final answer in a DataFrame.

In [None]:
# Write your code below



# Close the database connection

In [None]:
# Dispose of the engine now that all queries are finished.
engine.dispose()