# Connect to DuckDB

Set up connection to the DuckDB database using duckdb-python driver and establish a database cursor.


In [2]:
import duckdb

# Connect to the DuckDB database
con = duckdb.connect(
    database="/Users/jaouadsalahy/Documents/IT/13_pollution_eau/database/data.duckdb"
)


# Establish a database cursor
cursor = con.cursor()

# List Tables and Schema

Query system tables to list all available tables and their schema information.


In [3]:
# List Tables and Schema

# Query to list all tables in the database
tables_query = "SHOW TABLES"
tables = cursor.execute(tables_query).fetchall()

# Display the list of tables
print("Tables in the database:")
for table in tables:
    print(table[0])

# Query to get schema information for each table
for table in tables:
    schema_query = f"DESCRIBE {table[0]}"
    schema = cursor.execute(schema_query).fetchall()

    # Display schema information
    print(f"\nSchema for table {table[0]}:")
    for column in schema:
        print(f"Column: {column[0]}, Type: {column[1]}")

Tables in the database:
cog_communes
edc_communes
edc_prelevements
edc_resultats
laposte_communes

Schema for table cog_communes:
Column: TYPECOM, Type: VARCHAR
Column: COM, Type: VARCHAR
Column: REG, Type: BIGINT
Column: DEP, Type: VARCHAR
Column: CTCD, Type: VARCHAR
Column: ARR, Type: VARCHAR
Column: TNCC, Type: BIGINT
Column: NCC, Type: VARCHAR
Column: NCCENR, Type: VARCHAR
Column: LIBELLE, Type: VARCHAR
Column: CAN, Type: VARCHAR
Column: COMPARENT, Type: VARCHAR
Column: de_partition, Type: INTEGER
Column: de_ingestion_date, Type: DATE
Column: de_dataset_datetime, Type: VARCHAR

Schema for table edc_communes:
Column: inseecommune, Type: VARCHAR
Column: nomcommune, Type: VARCHAR
Column: quartier, Type: VARCHAR
Column: cdreseau, Type: VARCHAR
Column: nomreseau, Type: VARCHAR
Column: debutalim, Type: DATE
Column: de_partition, Type: INTEGER
Column: de_ingestion_date, Type: DATE
Column: de_dataset_datetime, Type: VARCHAR

Schema for table edc_prelevements:
Column: cddept, Type: VARCHAR


# Basic Data Overview

Execute queries to get row counts, column statistics, and sample data from tables.


In [None]:
# Basic Data Overview

import pandas as pd

# For each table, load it into a pandas DataFrame and display info and description
for table in tables:
    print(f"\n{'=' * 50}")
    print(f"Table: {table[0]}")
    print(f"{'=' * 50}")

    # Convert DuckDB table to pandas DataFrame
    df = pd.read_sql(f"SELECT * FROM {table[0]}", cursor)

    # Display basic information about the DataFrame
    print("\nDataFrame Info:")
    print(df.info())

    # Display statistical description
    print("\nStatistical Description:")
    print(df.describe(include="all"))

    # Display first few rows
    print("\nFirst 5 rows:")
    print(df.head())


Table: cog_communes


  df = pd.read_sql(f"SELECT * FROM {table[0]}", cursor)



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37544 entries, 0 to 37543
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TYPECOM              37544 non-null  object 
 1   COM                  37544 non-null  object 
 2   REG                  34980 non-null  float64
 3   DEP                  34980 non-null  object 
 4   CTCD                 34980 non-null  object 
 5   ARR                  34963 non-null  object 
 6   TNCC                 37544 non-null  int64  
 7   NCC                  37544 non-null  object 
 8   NCCENR               37544 non-null  object 
 9   LIBELLE              37544 non-null  object 
 10  CAN                  34924 non-null  object 
 11  COMPARENT            2609 non-null   object 
 12  de_partition         37544 non-null  int64  
 13  de_ingestion_date    37544 non-null  object 
 14  de_dataset_datetime  37544 non-null  object 
dtypes: float64(1), int6

  df = pd.read_sql(f"SELECT * FROM {table[0]}", cursor)



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247622 entries, 0 to 247621
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   inseecommune         247622 non-null  object
 1   nomcommune           247622 non-null  object
 2   quartier             247622 non-null  object
 3   cdreseau             247622 non-null  object
 4   nomreseau            247622 non-null  object
 5   debutalim            247622 non-null  object
 6   de_partition         247622 non-null  int64 
 7   de_ingestion_date    247622 non-null  object
 8   de_dataset_datetime  247622 non-null  object
dtypes: int64(1), object(8)
memory usage: 17.0+ MB
None

Statistical Description:
       inseecommune       nomcommune quartier   cdreseau  \
count        247622           247622   247622     247622   
unique        34914            32657    16594      23767   
top           48009  PEYRE EN AUBRAC        -  048000295   
f

  df = pd.read_sql(f"SELECT * FROM {table[0]}", cursor)



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2083345 entries, 0 to 2083344
Data columns (total 21 columns):
 #   Column                      Dtype 
---  ------                      ----- 
 0   cddept                      object
 1   cdreseau                    object
 2   inseecommuneprinc           object
 3   nomcommuneprinc             object
 4   cdreseauamont               object
 5   nomreseauamont              object
 6   pourcentdebit               object
 7   referenceprel               object
 8   dateprel                    object
 9   heureprel                   object
 10  conclusionprel              object
 11  ugelib                      object
 12  distrlib                    object
 13  moalib                      object
 14  plvconformitebacterio       object
 15  plvconformitechimique       object
 16  plvconformitereferencebact  object
 17  plvconformitereferencechim  object
 18  de_partition                int64 
 19  de_ingestion_date        

  df = pd.read_sql(f"SELECT * FROM {table[0]}", cursor)


# Data Profiling

Analyze data types, null values, unique values, and basic statistics for each column.


In [5]:
# Data Profiling

# Analyze data types, null values, unique values, and basic statistics for each column
for table in tables:
    print(f"\nData profiling for table {table[0]}:")

    # Query to get column names and data types
    columns_query = f"PRAGMA table_info({table[0]})"
    columns = cursor.execute(columns_query).fetchall()

    for column in columns:
        column_name = column[1]
        column_type = column[2]

        # Query to get count of null values
        null_count_query = (
            f"SELECT COUNT(*) FROM {table[0]} WHERE {column_name} IS NULL"
        )
        null_count = cursor.execute(null_count_query).fetchone()[0]

        # Query to get count of unique values
        unique_count_query = f"SELECT COUNT(DISTINCT {column_name}) FROM {table[0]}"
        unique_count = cursor.execute(unique_count_query).fetchone()[0]

        # Query to get basic statistics
        stats_query = f"SELECT MIN({column_name}), MAX({column_name}), AVG({column_name}), STDDEV({column_name}) FROM {table[0]}"
        stats = cursor.execute(stats_query).fetchone()

        print(
            f"Column: {column_name}, Type: {column_type}, Nulls: {null_count}, Unique: {unique_count}, Min: {stats[0]}, Max: {stats[1]}, Avg: {stats[2]}, Stddev: {stats[3]}"
        )


Data profiling for table cog_communes:


BinderException: Binder Error: No function matches the given name and argument types 'avg(VARCHAR)'. You might need to add explicit type casts.
	Candidate functions:
	avg(DECIMAL) -> DECIMAL
	avg(SMALLINT) -> DOUBLE
	avg(INTEGER) -> DOUBLE
	avg(BIGINT) -> DOUBLE
	avg(HUGEINT) -> DOUBLE
	avg(DOUBLE) -> DOUBLE


LINE 1: SELECT MIN(TYPECOM), MAX(TYPECOM), AVG(TYPECOM), STDDEV(TYPECOM) FROM cog_communes
                                           ^

# Sample Queries

Perform example queries to demonstrate table relationships and data patterns.


In [None]:
# Sample Queries

# Query to demonstrate table relationships and data patterns
# Example 1: Join two tables and display the result
join_query = """
SELECT t1.*, t2.*
FROM table1 t1
JOIN table2 t2 ON t1.common_column = t2.common_column
LIMIT 5
"""
join_result = cursor.execute(join_query).fetchall()
print("\nJoin result between table1 and table2:")
for row in join_result:
    print(row)

# Example 2: Aggregate data from a table
aggregate_query = """
SELECT column1, COUNT(*), AVG(column2)
FROM table1
GROUP BY column1
LIMIT 5
"""
aggregate_result = cursor.execute(aggregate_query).fetchall()
print("\nAggregate result from table1:")
for row in aggregate_result:
    print(row)

# Example 3: Filter data based on a condition
filter_query = """
SELECT *
FROM table1
WHERE column3 > 100
LIMIT 5
"""
filter_result = cursor.execute(filter_query).fetchall()
print("\nFiltered result from table1 where column3 > 100:")
for row in filter_result:
    print(row)

# Data Visualization

Create visualizations using pandas and plotly to better understand the data distribution and relationships.


In [None]:
import pandas as pd
import plotly.express as px


# Load data from DuckDB into pandas DataFrame
def load_table_to_df(table_name):
    query = f"SELECT * FROM {table_name}"
    df = pd.read_sql(query, con)
    return df


# Example: Visualize data distribution for a specific table
table_name = "your_table_name"  # Replace with your actual table name
df = load_table_to_df(table_name)

# Histogram for a specific column
fig = px.histogram(df, x="your_column_name")  # Replace with your actual column name
fig.show()

# Scatter plot to visualize relationships between two columns
fig = px.scatter(
    df, x="your_column_x", y="your_column_y"
)  # Replace with your actual column names
fig.show()

# Box plot to visualize the distribution of a column
fig = px.box(df, y="your_column_name")  # Replace with your actual column name
fig.show()

# Correlation heatmap
correlation_matrix = df.corr()
fig = px.imshow(correlation_matrix, text_auto=True)
fig.show()