### Recursos


- https://public.tableau.com/app/learn/sample-data
- https://github.com/plotly/datasets/tree/master
- https://github.com/vega/vega-datasets/tree/main/data


### Bibliotecas


In [1]:
import os
import pathlib
import traceback
import sys
import duckdb
import pandas as pd
from urllib.request import urlretrieve


# Lift pandas' display limits so frames are rendered without truncation.
# Display all columns
pd.set_option("display.max_columns", None)
# Display all characters inside a column cell
pd.set_option("display.max_colwidth", None)
# Display all rows
pd.set_option("display.max_rows", None)
# Display all items in a nested list inside a column cell.
# The option is spelled out in full ("max_seq_items") instead of relying on
# pandas resolving a shortened pattern to the real option name.
pd.set_option("display.max_seq_items", None)

# Load the "sql" IPython extension, which provides the %sql / %%sql magics
# used by the SQL cells in this notebook
%load_ext sql

In [2]:
# Automatically convert SQL query results to a pandas DataFrame
%config SqlMagic.autopandas=True
# Disable the feedback message printed after executing a SQL command
%config SqlMagic.feedback=False
# Disable connection string display, since the connection string may contain
# sensitive information
%config SqlMagic.displaycon=False
# Enable named parameters
%config SqlMagic.named_parameters=True

### Conexão com o banco de dados


In [3]:
# Open an in-memory DuckDB database in read-write mode
conn = duckdb.connect(database=":memory:", read_only=False)

In [4]:
# Pass the existing DuckDB connection object to the %sql magic
%sql conn

### Pastas para salvar os arquivos


In [5]:
# Create the working folders. exist_ok=True makes the call a no-op when a
# folder is already present, which avoids the check-then-create race of a
# separate os.path.exists() test and keeps the cell safe to re-run.
directories = ["data", "output"]
for directory in directories:
    os.makedirs(directory, exist_ok=True)

### Download dos arquivos


In [6]:
# Remote sample datasets and the local path each one is saved to.
datasets = [
    {
        "url": "https://public.tableau.com/app/sample-data/sample_-_superstore.xls",
        "file": "data/superstore.xls"
    },
    {
        "url": "https://public.tableau.com/app/sample-data/netflix_titles.xlsx",
        "file": "data/netflix_titles.xlsx"
    },
    {
        "url": "https://public.tableau.com/app/sample-data/titanic%20passenger%20list.csv",
        "file": "data/titanic_passenger_list.csv"
    },
    {
        "url": "https://github.com/plotly/datasets/raw/master/2015_flights.parquet",
        "file": "data/2015_flights.parquet"
    }
]

# Download datasets. Files already on disk are reused so that re-running the
# notebook does not fetch everything from the network again.
for dataset in datasets:
    target = dataset["file"]
    if os.path.exists(target):
        continue
    # Fetch into a temporary name first and rename on success, so an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a complete download.
    partial = f"{target}.part"
    urlretrieve(dataset["url"], partial)
    os.replace(partial, target)

### Carregando Bases de dados


#### Pandas


In [7]:
# df = DataFrame
# Superstore xls
# Passing a list to sheet_name makes pandas open and parse the workbook once
# and return a dict of DataFrames keyed by sheet name, instead of re-reading
# the same file for every sheet.
superstore_sheets = pd.read_excel(
    "data/superstore.xls",
    sheet_name=["Orders", "Returns", "People"],
)
df_superstore_orders = superstore_sheets["Orders"]
df_superstore_returns = superstore_sheets["Returns"]
df_superstore_people = superstore_sheets["People"]
# Netflix xlsx (same single-pass approach)
netflix_sheets = pd.read_excel(
    "data/netflix_titles.xlsx",
    sheet_name=[
        "netflix_titles",
        "netflix_titles_directors",
        "netflix_titles_countries",
        "netflix_titles_cast",
        "netflix_titles_category",
    ],
)
df_netflix_titles = netflix_sheets["netflix_titles"]
df_netflix_titles_directors = netflix_sheets["netflix_titles_directors"]
df_netflix_titles_countries = netflix_sheets["netflix_titles_countries"]
df_netflix_titles_cast = netflix_sheets["netflix_titles_cast"]
df_netflix_titles_category = netflix_sheets["netflix_titles_category"]
# Titanic csv
df_titanic = pd.read_csv("data/titanic_passenger_list.csv")
# Flights parquet
df_flights = pd.read_parquet("data/2015_flights.parquet")

In [8]:
# Preview the first row of the Superstore "Orders" sheet
df_superstore_orders.head(1)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136


In [9]:
# Summarise the Orders frame: column names, non-null counts and dtypes
df_superstore_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9994 non-null   int64         
 1   Order ID       9994 non-null   object        
 2   Order Date     9994 non-null   datetime64[ns]
 3   Ship Date      9994 non-null   datetime64[ns]
 4   Ship Mode      9994 non-null   object        
 5   Customer ID    9994 non-null   object        
 6   Customer Name  9994 non-null   object        
 7   Segment        9994 non-null   object        
 8   Country        9994 non-null   object        
 9   City           9994 non-null   object        
 10  State          9994 non-null   object        
 11  Postal Code    9994 non-null   int64         
 12  Region         9994 non-null   object        
 13  Product ID     9994 non-null   object        
 14  Category       9994 non-null   object        
 15  Sub-Category   9994 n

#### DuckDB


In [10]:
# Install and load DuckDB's "spatial" extension; the statements below call
# its st_read table function to read the spreadsheet files.
conn.execute(
    """
    INSTALL spatial;
    LOAD spatial;
    """
)

# Try to read the "Orders" sheet of the .xls workbook through st_read.
# Any error is caught and printed instead of raised, so the remaining
# statements in this cell still run (the recorded output shows this read
# failing with "not recognized as a supported file format").
try:
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS superstore_orders AS
        SELECT *
        FROM st_read('data/superstore.xls', layer='Orders')
        """
    )
except Exception as error:
    print(error)


# Read the "netflix_titles" sheet of the .xlsx workbook through st_read.
conn.execute(
    """
    CREATE TABLE IF NOT EXISTS netflix_titles AS
    SELECT *
    FROM st_read('data/netflix_titles.xlsx', layer='netflix_titles')
    """
)

# Remove the table that was just created.
# NOTE(review): presumably so that the later
# "CREATE TABLE IF NOT EXISTS netflix_titles" built from the pandas
# DataFrame is not skipped - confirm.
conn.execute("DROP TABLE IF EXISTS netflix_titles")

ERROR 4: `data/superstore.xls' not recognized as a supported file format.


IO Error: Could not open file: data/superstore.xls (`data/superstore.xls' not recognized as a supported file format.)


<duckdb.DuckDBPyConnection at 0x7e6a8b4125b0>

O DuckDB ainda é muito recente, então não possui determinadas funcionalidades (em 11/2023), como:

- Especificar o tipo de codificação do arquivo (encoding). Ex: encoding='latin1' (português do Brasil)
- Carregar arquivos .xls

Podemos carregar os dados que se enquadram nestas limitações com o Pandas e depois converter para DuckDB.


In [11]:
%%sql
-- Create one DuckDB table per Superstore sheet, selecting from the pandas
-- DataFrames loaded earlier (referenced here by their Python variable names)
CREATE TABLE IF NOT EXISTS superstore_orders AS SELECT * FROM df_superstore_orders;
CREATE TABLE IF NOT EXISTS superstore_returns AS SELECT * FROM df_superstore_returns;
CREATE TABLE IF NOT EXISTS superstore_people AS SELECT * FROM df_superstore_people;

Unnamed: 0,Count
0,4


In [12]:
def full_text_search(df: pd.DataFrame, search_value: str) -> str:
    """Locate the first string cell of ``df`` that contains ``search_value``.

    Columns are scanned in DataFrame order and, within each column, rows are
    scanned from top to bottom; the first match wins. Only cells holding
    ``str`` values are tested, using a plain case-sensitive substring check;
    every other cell is skipped.

    Args:
        df: DataFrame to search.
        search_value: Substring to look for.

    Returns:
        A message naming the column and the index label of the first match,
        or a "not found" message when no string cell contains
        ``search_value``.
    """
    for column in df.columns:
        # Walk the column itself instead of calling df.iterrows() once per
        # column, which rebuilt a Series for every row on every pass.
        for index, cell in df[column].items():
            if isinstance(cell, str) and search_value in cell:
                return f'Found "{search_value}" in column "{column}" at index {index}'

    return f'"{search_value}" not found in any column'

In [13]:
# Look for the text "Flying Fortress" in the string cells of the Netflix titles frame
full_text_search(df_netflix_titles, "Flying Fortress")

'Found "Flying Fortress" in column "duration_minutes" at index 2018'

In [14]:
# Drop the row with index label 2018, the one reported by the text search
# (its "duration_minutes" cell holds text).
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" is passed, so re-running this cell is harmless: a second
# run no longer raises KeyError for the already-removed label.
# A bare `.iloc[2018]` lookup is not kept here: it was not the last expression
# of the cell, so it displayed nothing.
df_netflix_titles = df_netflix_titles.drop(index=2018, errors="ignore")

In [15]:
%%sql
-- Create one DuckDB table per Netflix sheet, selecting from the pandas
-- DataFrames loaded earlier (referenced here by their Python variable names)
CREATE TABLE IF NOT EXISTS netflix_titles AS SELECT * FROM df_netflix_titles;
CREATE TABLE IF NOT EXISTS netflix_titles_directors AS SELECT * FROM df_netflix_titles_directors;
CREATE TABLE IF NOT EXISTS netflix_titles_countries AS SELECT * FROM df_netflix_titles_countries;
CREATE TABLE IF NOT EXISTS netflix_titles_cast AS SELECT * FROM df_netflix_titles_cast;
CREATE TABLE IF NOT EXISTS netflix_titles_category AS SELECT * FROM df_netflix_titles_category;


Unnamed: 0,Count
0,13670
