In [1]:
import duckdb
import pandas as pd
import polars as pl

In [2]:
# Relative path of the names CSV that the pandas, polars and DuckDB loads below all read.
csv_names_dir = "data/input/names.csv"

# Leitura de arquivo .csv

In [3]:
# Load the names CSV into a pandas DataFrame using the parser's default options.
df_pandas_names = pd.read_csv(filepath_or_buffer=csv_names_dir)

In [4]:
# Load the same CSV into a polars DataFrame using the reader's default options.
df_polars_names = pl.read_csv(source=csv_names_dir)

In [5]:
# (Re)build the DuckDB copy of the names CSV.
# CREATE OR REPLACE does the drop-and-recreate in a single statement, so
# re-running this cell always starts again from the raw file contents.
# NOTE: the path is interpolated into the SQL text; it is a notebook constant,
# not user-supplied input.
duckdb.sql(f"""
    CREATE OR REPLACE TABLE df_duckdb_names AS
    SELECT * FROM '{csv_names_dir}'
""")

# Substituição dos valores \N por nulos

Pandas

In [6]:
# Count, per column, the cells that are exactly the two-character marker
# backslash + N. A direct comparison is used instead of a string-built `query`
# expression, so column names do not have to be valid Python identifiers and
# no nested backslash escaping is needed. The printed format is unchanged.
for column in df_pandas_names.columns:
    marker_hits = (df_pandas_names[column] == '\\N').sum()
    print(f'{column}: ', marker_hits)

nconst:  0
primaryName:  48
birthYear:  12770823
deathYear:  13166298
primaryProfession:  2565021
knownForTitles:  1470866


In [7]:
# Turn cells that are exactly the marker backslash + N into missing values.
# With regex=True and a non-string replacement, pandas replaces the whole cell
# whenever the pattern is found anywhere in it, so the pattern is anchored
# with ^...$ to leave alone cells that merely contain the marker as a substring.
# This matches the exact-equality check used when counting the markers.
df_pandas_names = df_pandas_names.replace(r'^\\N$', None, regex=True)

Polars

In [8]:
# For each column, keep only the cells equal to the backslash + N marker and
# print how many there are (polars returns the count as a 1x1 DataFrame).
for column in df_polars_names.columns:
    print(
        f'{column}: ',
        df_polars_names
        .select(column)
        .filter(pl.col(column) == '\\N')
        .count(),
    )

nconst:  shape: (1, 1)
┌────────┐
│ nconst │
│ ---    │
│ u32    │
╞════════╡
│ 0      │
└────────┘
primaryName:  shape: (1, 1)
┌─────────────┐
│ primaryName │
│ ---         │
│ u32         │
╞═════════════╡
│ 48          │
└─────────────┘
birthYear:  shape: (1, 1)
┌───────────┐
│ birthYear │
│ ---       │
│ u32       │
╞═══════════╡
│ 12770823  │
└───────────┘
deathYear:  shape: (1, 1)
┌───────────┐
│ deathYear │
│ ---       │
│ u32       │
╞═══════════╡
│ 13166298  │
└───────────┘
primaryProfession:  shape: (1, 1)
┌───────────────────┐
│ primaryProfession │
│ ---               │
│ u32               │
╞═══════════════════╡
│ 2565021           │
└───────────────────┘
knownForTitles:  shape: (1, 1)
┌────────────────┐
│ knownForTitles │
│ ---            │
│ u32            │
╞════════════════╡
│ 1470866        │
└────────────────┘


In [9]:
# Null out every cell equal to the backslash + N marker, across all columns.
df_polars_names = df_polars_names.with_columns(
    pl.all().replace(old='\\N', new=None)
)

DuckDB

In [10]:
# Per-column tally of rows in df_duckdb_names whose value equals the
# backslash + N marker; each CASE contributes 1 for a match and 0 otherwise.
# The SQL sits in a regular (non-raw) Python string, so '\\N' below reaches
# DuckDB as the two characters backslash and N.
duckdb.sql("""
    SELECT 
            SUM(CASE WHEN nconst             == '\\N' THEN 1 ELSE 0 END) AS nconst
           ,SUM(CASE WHEN primaryName        == '\\N' THEN 1 ELSE 0 END) AS primaryName
           ,SUM(CASE WHEN birthYear          == '\\N' THEN 1 ELSE 0 END) AS birthYear
           ,SUM(CASE WHEN deathYear          == '\\N' THEN 1 ELSE 0 END) AS deathYear
           ,SUM(CASE WHEN primaryProfession  == '\\N' THEN 1 ELSE 0 END) AS primaryProfession
           ,SUM(CASE WHEN knownForTitles     == '\\N' THEN 1 ELSE 0 END) AS knownForTitles
    FROM df_duckdb_names 
""").show()

┌────────┬─────────────┬───────────┬───────────┬───────────────────┬────────────────┐
│ nconst │ primaryName │ birthYear │ deathYear │ primaryProfession │ knownForTitles │
│ int128 │   int128    │  int128   │  int128   │      int128       │     int128     │
├────────┼─────────────┼───────────┼───────────┼───────────────────┼────────────────┤
│      0 │          48 │  12770823 │  13166298 │           2565021 │        1470866 │
└────────┴─────────────┴───────────┴───────────┴───────────────────┴────────────────┘



In [11]:
# Rewrite df_duckdb_names in place: in every column, a cell equal to the
# backslash + N marker becomes NULL and any other value is kept as is.
# There is no WHERE clause, so the UPDATE touches every row of the table.
duckdb.sql("""
    UPDATE df_duckdb_names
    SET 
           nconst               = CASE WHEN nconst              == '\\N' THEN NULL ELSE nconst END,
           primaryName          = CASE WHEN primaryName         == '\\N' THEN NULL ELSE primaryName END,
           birthYear            = CASE WHEN birthYear           == '\\N' THEN NULL ELSE birthYear END,
           deathYear            = CASE WHEN deathYear           == '\\N' THEN NULL ELSE deathYear END,
           primaryProfession    = CASE WHEN primaryProfession   == '\\N' THEN NULL ELSE primaryProfession END,
           knownForTitles       = CASE WHEN knownForTitles      == '\\N' THEN NULL ELSE knownForTitles END;
""")

# Criando tabela profession

DuckDB

In [12]:
# Derive a per-person profession table from the comma-separated
# primaryProfession column, keeping its first three entries as separate columns.
# CREATE OR REPLACE (instead of CREATE ... IF NOT EXISTS) rebuilds the table on
# every run, so it cannot go stale when df_duckdb_names is reloaded and this
# cell is executed again in the same session.
# NOTE(review): per the DuckDB docs, split_part yields an empty string when the
# requested index is past the last part, so people with fewer than three
# professions get '' (not NULL) in profession2/profession3 - confirm intended.
duckdb.sql("""
    CREATE OR REPLACE TABLE df_duckdb_profession AS
    SELECT
           nconst,
           SPLIT_PART(primaryProfession, ',', 1) AS profession1,
           SPLIT_PART(primaryProfession, ',', 2) AS profession2,
           SPLIT_PART(primaryProfession, ',', 3) AS profession3
    FROM df_duckdb_names
""")

In [13]:
# Preview ten rows of the profession table. The relation is the cell's last
# expression, so the notebook renders it. No ORDER BY is given, so which ten
# rows come back is not guaranteed.
duckdb.sql("""
    SELECT * FROM df_duckdb_profession LIMIT 10;
""")

┌───────────┬─────────────┬──────────────────┬──────────────────┐
│  nconst   │ profession1 │   profession2    │   profession3    │
│  varchar  │   varchar   │     varchar      │     varchar      │
├───────────┼─────────────┼──────────────────┼──────────────────┤
│ nm0000001 │ actor       │ miscellaneous    │ producer         │
│ nm0000002 │ actress     │ soundtrack       │ archive_footage  │
│ nm0000003 │ actress     │ music_department │ producer         │
│ nm0000004 │ actor       │ writer           │ music_department │
│ nm0000005 │ writer      │ director         │ actor            │
│ nm0000006 │ actress     │ producer         │ soundtrack       │
│ nm0000007 │ actor       │ producer         │ miscellaneous    │
│ nm0000008 │ actor       │ director         │ writer           │
│ nm0000009 │ actor       │ producer         │ director         │
│ nm0000010 │ actor       │ director         │ producer         │
├───────────┴─────────────┴──────────────────┴──────────────────┤
│ 10 rows 

# Amostra

In [14]:
# Show the first five rows of the cleaned pandas frame.
df_pandas_names.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"


In [15]:
# Show the first five rows of the cleaned polars frame.
df_polars_names.head(n=5)

nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
str,str,str,str,str,str
"""nm0000001""","""Fred Astaire""","""1899""","""1987""","""actor,miscellaneous,producer""","""tt0072308,tt0050419,tt0053137,…"
"""nm0000002""","""Lauren Bacall""","""1924""","""2014""","""actress,soundtrack,archive_foo…","""tt0037382,tt0075213,tt0117057,…"
"""nm0000003""","""Brigitte Bardot""","""1934""",,"""actress,music_department,produ…","""tt0057345,tt0049189,tt0056404,…"
"""nm0000004""","""John Belushi""","""1949""","""1982""","""actor,writer,music_department""","""tt0072562,tt0077975,tt0080455,…"
"""nm0000005""","""Ingmar Bergman""","""1918""","""2007""","""writer,director,actor""","""tt0050986,tt0083922,tt0050976,…"


In [16]:
# Print five rows of the cleaned DuckDB names table for comparison with the
# pandas and polars samples above. No ORDER BY is given, so the rows shown are
# whichever five DuckDB returns first.
duckdb.sql("""
    SELECT *
    FROM df_duckdb_names
    LIMIT 5
""").show() 

┌───────────┬─────────────────┬───────────┬───────────┬──────────────────────┬─────────────────────────────────────────┐
│  nconst   │   primaryName   │ birthYear │ deathYear │  primaryProfession   │             knownForTitles              │
│  varchar  │     varchar     │  varchar  │  varchar  │       varchar        │                 varchar                 │
├───────────┼─────────────────┼───────────┼───────────┼──────────────────────┼─────────────────────────────────────────┤
│ nm0000001 │ Fred Astaire    │ 1899      │ 1987      │ actor,miscellaneou…  │ tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002 │ Lauren Bacall   │ 1924      │ 2014      │ actress,soundtrack…  │ tt0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003 │ Brigitte Bardot │ 1934      │ NULL      │ actress,music_depa…  │ tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004 │ John Belushi    │ 1949      │ 1982      │ actor,writer,music…  │ tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005 │ Ingmar Bergman  │ 