# Data Retrieval using Pandas
### Data Science Pipeline Workshop, 11 June 2022
- Author : Randy Galawana
- Email  : randy_galawana1@telkomsel.co.id
&copy; Telkomsel 2022

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import requests
import yaml

### Retrieve data using a URL or a path to a file

In [None]:
# The ratings file is tab-separated and its first row holds the column names.
url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
df = pd.read_csv(filepath_or_buffer=url, delimiter="\t", header=0)
df.head(n=5)

### Retrieve data from an Excel file

In [None]:
# Load the movies workbook into a DataFrame and preview the first rows.
df = pd.read_excel(io="data/movies.xls")
df.head(n=5)

### Retrieve data using JSON
From the movie dataset in JSON format

In [None]:
# orient="records" tells pandas the file is a list of {column: value} objects.
df = pd.read_json(path_or_buf="data/movies.json", orient="records")
# Preview four randomly chosen rows.
df.sample(n=4)

### Retrieve data nested JSON
{
    "school_name": "ABC primary school",
    "class": "Year 1",
    "students": [
    {
        "id": "A001",
        "name": "Tom",
        "math": 60,
        "physics": 66,
        "chemistry": 61
    },
    ...
  ]
}

In [None]:
# Read the nested file directly with read_json and preview the result.
df = pd.read_json(path_or_buf='data/simple.json')
df.head(n=5)

In [None]:
# Flatten the nested JSON with pd.json_normalize: parse the file into plain
# Python objects first, then expand the "students" records into rows.
import json

with open('data/simple.json', 'r') as fh:
    data = json.load(fh)

# record_path selects the nested records to expand; meta lists the top-level
# fields to carry along as extra columns.
df = pd.json_normalize(data, record_path='students', meta=['school_name', 'class'])

df.head(n=5)

### Retrieve data using Parquet File

In [None]:
# Read the Parquet file with the fastparquet engine and show the last 5 rows.
df = pd.read_parquet(path="data/movies.parquet", engine="fastparquet")
df.tail(n=5)

### Retrieve data using SQL
Use case: Airbnb open data from http://insideairbnb.com/get-the-data/ for Singapore, captured on 28 March 2022, with the following tables:
- airbnb_sg_listings : detailed listings of rooms/properties
- airbnb_sg_listings_summary : summary of listings
- airbnb_sg_calendar : listing calendar details and availability
- airbnb_sg_reviews : details of listing reviews

In [None]:
# Parse credentials.yml (shared by the mentor) and keep only its 'mysql' section.
with open('credentials.yml', 'r') as cred_file:
    config = yaml.safe_load(cred_file)
credentials = config['mysql']

In [None]:
# create sqlalchemy engine
from urllib.parse import quote_plus

# Percent-encode user and password before placing them in the URL: characters
# such as '@', ':' or '/' inside a credential would otherwise be parsed as URL
# delimiters and produce a wrong host or a failed login.
url_parts = {
    **credentials,
    'user': quote_plus(str(credentials['user'])),
    'password': quote_plus(str(credentials['password'])),
}
db_engine = create_engine('mysql://{user}:{password}@{host}:{port}/{database}'.format(**url_parts))

In [None]:
## use case 1 : select all listings with ratio of beds per bedrooms less than 2

# The column list was missing ("select  from ..."), which is invalid SQL.
# "*" returns every column, including the ones inspected in the next cell.
query = """select * from airbnb_sg_listings where beds/bedrooms < 2"""
df = pd.read_sql(query, db_engine)
# Non-null value count per column of the result.
df.count()

In [None]:
# Display only the identifier, bed/bedroom counts and review rate columns.
df.loc[:, ['id', 'beds', 'bedrooms', 'reviews_per_month']]

In [None]:
## Use Case 2 : get all listing which already booked on April 2022

# The column list was missing ("select  from ..."), which is invalid SQL.
# NOTE(review): this is still the beds/bedrooms query copied from use case 1;
# it does not filter on April 2022 bookings. Presumably the real query needs
# airbnb_sg_calendar -- TODO confirm that table's columns and rewrite.
query = """select * from airbnb_sg_listings where beds/bedrooms < 2"""
df = pd.read_sql(query, db_engine)
# Non-null value count per column of the result.
df.count()

In [None]:
## Use Case 3 : get count of listing that have review > 4 and has at least 3 bookings in Mar 2022

# The column list was missing ("select  from ..."), which is invalid SQL.
# NOTE(review): this is still the beds/bedrooms query copied from use case 1;
# it applies neither the review filter nor the March 2022 booking count.
# Presumably the real query needs the reviews/calendar tables -- TODO confirm
# their columns and rewrite.
query = """select * from airbnb_sg_listings where beds/bedrooms < 2"""
df = pd.read_sql(query, db_engine)
# Non-null value count per column of the result.
df.count()

### Retrieve data using API Endpoint

We use the COVID-19 open data API.


In [None]:
## get data from API covid19
url = 'https://api.covid19api.com/summary'
# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, timeout=30)
# Raise on an HTTP error status here. Skipping the assignment silently would
# leave `data` undefined (or holding an earlier cell's value), so the
# following cells would fail with a misleading NameError/KeyError.
resp.raise_for_status()
data = resp.json()


In [None]:
# Print the Python type stored under each top-level key of the response.
for key, value in data.items():
    print(f'key {key} in json data type is {type(value)}')

In [None]:
# The "Countries" key holds a list, so it can be passed straight to DataFrame.
df = pd.DataFrame(data=data["Countries"])
df.head(n=5)

In [None]:
# Show the value stored under the "Global" key of the response.
data["Global"]