### Recursos


- https://public.tableau.com/app/learn/sample-data
- https://github.com/plotly/datasets/tree/master
- https://github.com/vega/vega-datasets/tree/main/data


### Bibliotecas


In [None]:
import os
import pathlib
import duckdb
import pandas as pd
from urllib.request import urlretrieve


# Pandas display settings: passing None removes the corresponding limit.
# Display all columns
pd.set_option("display.max_columns", None)
# Display all characters inside a column cell
pd.set_option("display.max_colwidth", None)
# Display all rows
pd.set_option("display.max_rows", None)
# Display all items in a nested list inside a column cell.
# The full option name is spelled out: the shortened "display.max_seq_item"
# only resolved through pandas' partial-name matching, which can break if a
# similarly named option is ever added.
pd.set_option("display.max_seq_items", None)

# Load the "sql" IPython extension so that SQL magics can be used in cells
%load_ext sql

In [None]:
# Automatically convert SQL query results to pandas DataFrames
%config SqlMagic.autopandas=True
# Disable the feedback message printed after executing a SQL command
%config SqlMagic.feedback=False
# Disable connection string display, since the connection string may contain
# sensitive information
%config SqlMagic.displaycon=False
# Enable named parameters
%config SqlMagic.named_parameters=True

### Conexão com o banco de dados


In [None]:
# Open an in-memory DuckDB database in read-write mode
conn = duckdb.connect(database=":memory:", read_only=False)

In [None]:
# Pass the DuckDB connection to the %sql magic
# NOTE(review): presumably this makes later SQL cells run against `conn` — confirm
%sql conn

### Pastas para salvar os arquivos


In [None]:
# Create the folders used by this notebook if they do not exist yet
# ("data" is where the downloaded files are saved).
directories = ["data", "output"]
for directory in directories:
    # exist_ok=True keeps the call idempotent and avoids the race between
    # checking for the folder and creating it.
    os.makedirs(directory, exist_ok=True)

### Download dos arquivos


In [None]:
# Each entry pairs a remote sample file ("url") with the local path it is
# saved to ("file").
datasets = [
    {"url": source_url, "file": local_path}
    for source_url, local_path in (
        (
            "https://public.tableau.com/app/sample-data/sample_-_superstore.xls",
            "data/superstore.xls",
        ),
        (
            "https://public.tableau.com/app/sample-data/netflix_titles.xlsx",
            "data/netflix_titles.xlsx",
        ),
        (
            "https://public.tableau.com/app/sample-data/titanic%20passenger%20list.csv",
            "data/titanic_passenger_list.csv",
        ),
        (
            "https://github.com/plotly/datasets/raw/master/2015_flights.parquet",
            "data/2015_flights.parquet",
        ),
    )
]

# Fetch every remote file and write it to its local path
for dataset in datasets:
    urlretrieve(url=dataset["url"], filename=dataset["file"])

### Carregando Bases de dados


#### Pandas


In [None]:
# df = DataFrame
# Each Excel workbook is opened once and every sheet is read from that single
# handle, instead of re-opening the file for each sheet.

# Superstore xls
with pd.ExcelFile("data/superstore.xls") as superstore_workbook:
    df_superstore_orders = pd.read_excel(superstore_workbook, sheet_name="Orders")
    df_superstore_returns = pd.read_excel(superstore_workbook, sheet_name="Returns")
    df_superstore_people = pd.read_excel(superstore_workbook, sheet_name="People")

# Netflix xlsx
with pd.ExcelFile("data/netflix_titles.xlsx") as netflix_workbook:
    df_netflix_titles = pd.read_excel(netflix_workbook, sheet_name="netflix_titles")
    df_netflix_titles_directors = pd.read_excel(
        netflix_workbook, sheet_name="netflix_titles_directors"
    )
    df_netflix_titles_countries = pd.read_excel(
        netflix_workbook, sheet_name="netflix_titles_countries"
    )
    df_netflix_titles_cast = pd.read_excel(
        netflix_workbook, sheet_name="netflix_titles_cast"
    )
    # The category sheet gets its own variable; it was previously assigned to
    # df_netflix_titles_cast, which silently discarded the cast data.
    df_netflix_titles_category = pd.read_excel(
        netflix_workbook, sheet_name="netflix_titles_category"
    )

# Titanic csv
df_titanic = pd.read_csv("data/titanic_passenger_list.csv")
# Flights parquet
df_flights = pd.read_parquet("data/2015_flights.parquet")

In [None]:
# Preview the first row of the Superstore "Orders" sheet
df_superstore_orders.head(1)

#### DuckDB


In [None]:
# Install and load DuckDB's spatial extension, which provides the st_read
# function used below.
install_spatial_sql = """
    INSTALL spatial;
    LOAD spatial;
    """
conn.execute(install_spatial_sql)

# Create the netflix_titles table from the "netflix_titles" layer of the
# Excel workbook (the layer presumably corresponds to the sheet name).
create_netflix_table_sql = """
    CREATE TABLE netflix_titles AS
    SELECT *
    FROM st_read('data/netflix_titles.xlsx', layer='netflix_titles')
    """
# NOTE(review): despite the df_ prefix, this holds whatever conn.execute()
# returns for the CREATE TABLE statement, not a pandas DataFrame built here —
# confirm before using it as one.
df_netflix = conn.execute(create_netflix_table_sql)