# Data Retrieval using Pandas
### Data Science Pipeline Workshop 11 Juni 2022
- Author : Randy Galawana
- Email  : randy_galawana1@telkomsel.co.id
&copy; Telkomsel 2022

In [8]:
import json

import pandas as pd
import requests
import yaml
from sqlalchemy import create_engine

### Retrieve data using URL to File / path to file

In [None]:
# Read the tab-separated ratings file straight from its URL; the first line
# of the file supplies the column names (header=0).
url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
df = pd.read_csv(
    url,
    sep="\t",
    header=0,
)
df.head(n=5)

### Retrieve data using excel file

In [3]:
# Load the movies workbook into a DataFrame and preview its first five rows.
df = pd.read_excel(io="data/movies.xls")
df.head(n=5)

Unnamed: 0,Title,Year,Genres,Language,Country,Content Rating,Duration,Aspect Ratio,Budget,Gross Earnings,...,Facebook Likes - Actor 1,Facebook Likes - Actor 2,Facebook Likes - Actor 3,Facebook Likes - cast Total,Facebook likes - Movie,Facenumber in posters,User Votes,Reviews by Users,Reviews by Crtiics,IMDB Score
0,Intolerance: Love's Struggle Throughout the Ages,1916,Drama|History|War,,USA,Not Rated,123,1.33,385907.0,,...,436,22,9.0,481,691,1,10718,88,69.0,8.0
1,Over the Hill to the Poorhouse,1920,Crime|Drama,,USA,,110,1.33,100000.0,3000000.0,...,2,2,0.0,4,0,1,5,1,1.0,4.8
2,The Big Parade,1925,Drama|Romance|War,,USA,Not Rated,151,1.33,245000.0,,...,81,12,6.0,108,226,0,4849,45,48.0,8.3
3,Metropolis,1927,Drama|Sci-Fi,German,Germany,Not Rated,145,1.33,6000000.0,26435.0,...,136,23,18.0,203,12000,1,111841,413,260.0,8.3
4,Pandora's Box,1929,Crime|Drama|Romance,German,Germany,Not Rated,110,1.33,,9950.0,...,426,20,3.0,455,926,1,7431,84,71.0,8.0


### Retrieve data using JSON
From the movie dataset in JSON format

In [4]:
# Load the movies JSON file (read with orient="records") and show four
# randomly sampled rows.
df = pd.read_json(
    path_or_buf="data/movies.json",
    orient="records",
)
df.sample(n=4)

Unnamed: 0,Title,Year,Genres,Language,Country,Content Rating,Duration,Aspect Ratio,Budget,Gross Earnings,...,Facebook Likes - Actor 1,Facebook Likes - Actor 2,Facebook Likes - Actor 3,Facebook Likes - cast Total,Facebook likes - Movie,Facenumber in posters,User Votes,Reviews by Users,Reviews by Crtiics,IMDB Score
412,Krush Groove,1985,Comedy|Drama|Music,English,USA,R,97,,3000000.0,,...,685,178,77.0,1163,889,0,1361,16,7.0,6.6
430,9½ Weeks,1986,Drama|Romance,English,USA,R,112,1.85,17000000.0,6734844.0,...,567,67,45.0,711,0,0,29591,118,38.0,5.9
776,Jade,1995,Crime|Drama|Thriller,English,USA,R,107,1.85,50000000.0,9795017.0,...,2000,979,602.0,5461,422,0,9227,70,50.0,5.1
899,The Island of Dr. Moreau,1996,Horror|Sci-Fi|Thriller,English,USA,PG-13,99,2.35,40000000.0,27663982.0,...,10000,368,44.0,10469,0,2,26051,164,70.0,4.4


### Retrieve data from nested JSON
{
    "school_name": "ABC primary school",
    "class": "Year 1",
    "students": [
    {
        "id": "A001",
        "name": "Tom",
        "math": 60,
        "physics": 66,
        "chemistry": 61
    },
    ...
  ]
}

In [18]:
# Reading the nested file directly keeps every student record as a dict
# inside a single `students` column.
df = pd.read_json(path_or_buf='data/simple.json')
df.head(n=5)

Unnamed: 0,school_name,class,students
0,ABC primary school,Year 1,"{'id': 'A001', 'name': 'Tom', 'math': 60, 'phy..."
1,ABC primary school,Year 1,"{'id': 'A002', 'name': 'James', 'math': 89, 'p..."
2,ABC primary school,Year 1,"{'id': 'A003', 'name': 'Jenny', 'math': 79, 'p..."


In [20]:
# Flatten the nested JSON with pd.json_normalize: each entry of the
# "students" list becomes one row, and the top-level "school_name" and
# "class" values are repeated on every row as extra columns.
# (`json` is imported in the notebook's import cell at the top.)

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('data/simple.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.json_normalize(
    data,
    record_path=['students'],
    meta=['school_name', 'class'],
)

df.head()

Unnamed: 0,id,name,math,physics,chemistry,school_name,class
0,A001,Tom,60,66,61,ABC primary school,Year 1
1,A002,James,89,76,51,ABC primary school,Year 1
2,A003,Jenny,79,90,78,ABC primary school,Year 1


### Retrieve data using Parquet File

In [7]:
# Load the movies Parquet file and look at its last five rows.
df = pd.read_parquet(path="data/movies.parquet")
df.tail(n=5)

Unnamed: 0,Title,Year,Genres,Language,Content Rating,Duration,Aspect Ratio,Budget,Gross Earnings,Director,...,Facebook Likes - Actor 2,Facebook Likes - Actor 3,Facebook Likes - cast Total,Facebook likes - Movie,Facenumber in posters,User Votes,Reviews by Users,Reviews by Crtiics,IMDB Score,Country
1333,Wild Wild West,1999,Action|Comedy|Sci-Fi|Western,English,PG-13,106,1.85,170000000.0,113745408.0,Barry Sonnenfeld,...,4000,582.0,15870,0,2,129601,648,85.0,4.8,USA
1334,Wing Commander,1999,Action|Adventure|Sci-Fi,English,PG-13,100,2.35,30000000.0,11576087.0,Chris Roberts,...,586,362.0,2497,858,3,14747,338,85.0,4.1,USA
1335,The Torture Chamber of Dr. Sadism,1967,Horror|Mystery,German,M,80,1.66,,,Harald Reinl,...,57,51.0,16110,200,0,952,36,38.0,6.0,West Germany
1336,Das Boot,1981,Adventure|Drama|Thriller|War,German,R,293,1.85,14000000.0,11433134.0,Wolfgang Petersen,...,21,18.0,469,11000,0,168203,426,96.0,8.4,West Germany
1337,The NeverEnding Story,1984,Adventure|Drama|Family|Fantasy,English,PG,94,2.35,27000000.0,,Wolfgang Petersen,...,312,271.0,1560,21000,1,99557,284,99.0,7.4,West Germany


### Retrieve data using SQL
Use case: Airbnb open data for Singapore from http://insideairbnb.com/get-the-data/, captured on 28 March 2022, with the following tables:
- airbnb_sg_listings : detail listings of rooms/property
- airbnb_sg_listings_summary : summary of listings
- airbnb_sg_calendar : detail of listings calendar and availability
- airbnb_sg_reviews : detail of review of listings

In [21]:
# Read the "mysql" section of the credentials.yml file shared by the mentor.
# safe_load parses the YAML without constructing arbitrary Python objects.
with open('credentials.yml') as f:
    credentials = yaml.safe_load(stream=f)['mysql']

In [22]:
# Build the SQLAlchemy engine from a MySQL URL filled in with the loaded
# credentials (keys used: user, password, host, port, database).
# NOTE(review): user and password are interpolated verbatim; values containing
# URL-special characters such as '@' or '/' would need URL-escaping first —
# confirm against credentials.yml.
db_engine = create_engine(
    'mysql://{user}:{password}@{host}:{port}/{database}'.format(
        **credentials
    )
)

In [None]:
## Use case 1: select all listings whose ratio of beds per bedroom is less than 2

# `select *` returns every column of the table. The original statement had an
# empty column list (`select  from ...`), which the database rejects as a
# syntax error before any row is read.
query = """select * from airbnb_sg_listings where beds/bedrooms < 2"""
df = pd.read_sql(query, db_engine)

# Number of non-null values per column in the result.
df.count()

In [None]:
# Show only the columns relevant to the beds-per-bedroom use case.
df.loc[:, ['id', 'beds', 'bedrooms', 'reviews_per_month']]

In [None]:
## Use Case 2 : get all listings that are already booked in April 2022



In [24]:
## Use Case 3 : get the count of listings that have review > 4 and at least 3 bookings in Mar 2022



### Retrieve data using API Endpoint

We use COVID-19 open data.


In [None]:
## Get the summary data from the COVID-19 API
url = 'https://api.covid19api.com/summary'

# A timeout keeps the cell from hanging indefinitely if the server never answers.
resp = requests.get(url, timeout=30)

# Fail right here with a clear HTTP error. The original `if resp.ok:` silently
# skipped the assignment on failure, leaving `data` undefined (or still holding
# a value from an earlier cell) so that later cells broke in confusing ways.
resp.raise_for_status()
data = resp.json()


In [None]:
# Report the Python type stored under each top-level key of the response.
for key, value in data.items():
    print(f'key {key} in json data type is {type(value)}')

In [None]:
# The "Countries" key holds a list, which is passed to the DataFrame constructor.
df = pd.DataFrame(data=data["Countries"])
df.head(n=5)

In [None]:
# Show the value stored under the top-level 'Global' key of the API response.
data['Global']

### Retrieve geojson data using geopandas

In [27]:
import geopandas as gpd

In [28]:
# Load the neighbourhoods GeoJSON file with geopandas.
df = gpd.read_file(filename="data/neighbourhoods.geojson")

In [30]:
# Preview the first five neighbourhood rows, including the geometry column.
df.head(n=5)

Unnamed: 0,neighbourhood,neighbourhood_group,geometry
0,Pasir Ris,East Region,"MULTIPOLYGON (((103.95322 1.38201, 103.95350 1..."
1,Seletar,North-East Region,"MULTIPOLYGON (((103.88691 1.42649, 103.88812 1..."
2,Sungei Kadut,North Region,"MULTIPOLYGON (((103.76440 1.44345, 103.76443 1..."
3,Orchard,Central Region,"MULTIPOLYGON (((103.84298 1.30001, 103.84294 1..."
4,Simpang,North Region,"MULTIPOLYGON (((103.86350 1.43433, 103.86361 1..."
