From 30a9dc0faf7870b7146abc34c071b9698f1c7ff9 Mon Sep 17 00:00:00 2001 From: BohuTANG Date: Thu, 31 Jul 2025 17:17:38 +0800 Subject: [PATCH] refine the window functions --- .../08-window-functions/cume-dist.md | 133 +++++----- .../08-window-functions/dense-rank.md | 107 +++++--- .../08-window-functions/first-value.md | 168 ++++++------ .../08-window-functions/index.md | 240 ++++-------------- .../08-window-functions/lag.md | 115 +++++---- .../08-window-functions/last-value.md | 175 ++++++------- .../08-window-functions/lead.md | 115 +++++---- .../08-window-functions/nth-value.md | 189 +++++++------- .../08-window-functions/ntile.md | 164 +++++++----- .../08-window-functions/percent_rank.md | 63 +++-- .../08-window-functions/rank.md | 111 +++++--- .../08-window-functions/row-number.md | 156 +++++++----- 12 files changed, 887 insertions(+), 849 deletions(-) diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/cume-dist.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/cume-dist.md index 40c564a568..7ff8164ec1 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/cume-dist.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/cume-dist.md @@ -1,65 +1,68 @@ ---- -title: CUME_DIST ---- -import FunctionDescription from '@site/src/components/FunctionDescription'; - - - -Returns the cumulative distribution of a given value in a set of values. It calculates the proportion of rows that have values less than or equal to the specified value, divided by the total number of rows. Please note that the resulting value falls between 0 and 1, inclusive. - -See also: [PERCENT_RANK](percent_rank.md) - -## Syntax - -```sql -CUME_DIST() OVER ( - PARTITION BY expr, ... - ORDER BY expr [ASC | DESC], ... -) -``` - -## Examples - -This example retrieves the students' names, scores, grades, and the cumulative distribution values (cume_dist_val) within each grade using the CUME_DIST() window function. 
- -```sql -CREATE TABLE students ( - name VARCHAR(20), - score INT NOT NULL, - grade CHAR(1) NOT NULL -); - -INSERT INTO students (name, score, grade) -VALUES - ('Smith', 81, 'A'), - ('Jones', 55, 'C'), - ('Williams', 55, 'C'), - ('Taylor', 62, 'B'), - ('Brown', 62, 'B'), - ('Davies', 84, 'A'), - ('Evans', 87, 'A'), - ('Wilson', 72, 'B'), - ('Thomas', 72, 'B'), - ('Johnson', 100, 'A'); - -SELECT - name, - score, - grade, - CUME_DIST() OVER (PARTITION BY grade ORDER BY score) AS cume_dist_val -FROM - students; - -name |score|grade|cume_dist_val| ---------+-----+-----+-------------+ -Smith | 81|A | 0.25| -Davies | 84|A | 0.5| -Evans | 87|A | 0.75| -Johnson | 100|A | 1.0| -Taylor | 62|B | 0.5| -Brown | 62|B | 0.5| -Wilson | 72|B | 1.0| -Thomas | 72|B | 1.0| -Jones | 55|C | 1.0| -Williams| 55|C | 1.0| -``` \ No newline at end of file +--- +title: CUME_DIST +--- +import FunctionDescription from '@site/src/components/FunctionDescription'; + + + +Calculates the cumulative distribution of each row's value. Returns the fraction of rows with values less than or equal to the current row's value. + +See also: [PERCENT_RANK](percent_rank.md) + +## Syntax + +```sql +CUME_DIST() +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] +) +``` + +**Arguments:** +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the distribution order +- `ASC | DESC`: Optional. 
Sort direction (default: ASC) + +**Notes:** +- Returns values between 0 and 1 (exclusive of 0, inclusive of 1) +- Formula: (number of rows ≤ current value) / (total rows) +- Always returns 1.0 for the highest value(s) +- Useful for calculating percentiles and cumulative percentages + +## Examples + +```sql +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + score INT +); + +INSERT INTO scores VALUES + ('Alice', 95), + ('Bob', 87), + ('Charlie', 87), + ('David', 82), + ('Eve', 78); +``` + +**Calculate cumulative distribution (showing what percentage of students scored at or below each score):** + +```sql +SELECT student, score, + CUME_DIST() OVER (ORDER BY score) AS cume_dist, + ROUND(CUME_DIST() OVER (ORDER BY score) * 100) AS cumulative_percent +FROM scores +ORDER BY score; +``` + +Result: +``` +student | score | cume_dist | cumulative_percent +--------+-------+-----------+------------------- +Eve | 78 | 0.2 | 20 +David | 82 | 0.4 | 40 +Bob | 87 | 0.8 | 80 +Charlie | 87 | 0.8 | 80 +Alice | 95 | 1.0 | 100 \ No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/dense-rank.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/dense-rank.md index 00d8230d68..ac8c906772 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/dense-rank.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/dense-rank.md @@ -2,59 +2,96 @@ title: DENSE_RANK --- -Returns the rank of a value within a group of values, without gaps in the ranks. - -The rank value starts at 1 and continues up sequentially. - -If two values are the same, they have the same rank. +Assigns a rank to each row within a partition. Rows with equal values receive the same rank, with no gaps in subsequent rankings. 
## Syntax ```sql -DENSE_RANK() OVER ( [ PARTITION BY ] ORDER BY [ ASC | DESC ] [ ] ) +DENSE_RANK() +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] +) ``` +**Arguments:** +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the ranking order +- `ASC | DESC`: Optional. Sort direction (default: ASC) + +**Notes:** +- Ranks start from 1 +- Equal values get the same rank +- No gaps in ranking sequence after ties +- Example: 1, 2, 2, 3, 4 (not 1, 2, 2, 4, 5 like RANK) + ## Examples -**Create the table** ```sql -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR, - last_name VARCHAR, - department VARCHAR, - salary INT +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + subject VARCHAR(20), + score INT ); + +INSERT INTO scores VALUES + ('Alice', 'Math', 95), + ('Alice', 'English', 87), + ('Alice', 'Science', 92), + ('Bob', 'Math', 85), + ('Bob', 'English', 85), + ('Bob', 'Science', 80), + ('Charlie', 'Math', 88), + ('Charlie', 'English', 85), + ('Charlie', 'Science', 85); ``` -**Insert data** +**Dense rank all scores (showing no gaps after ties):** + ```sql -INSERT INTO employees (employee_id, first_name, last_name, department, salary) VALUES - (1, 'John', 'Doe', 'IT', 90000), - (2, 'Jane', 'Smith', 'HR', 85000), - (3, 'Mike', 'Johnson', 'IT', 82000), - (4, 'Sara', 'Williams', 'Sales', 77000), - (5, 'Tom', 'Brown', 'HR', 75000); +SELECT student, subject, score, + DENSE_RANK() OVER (ORDER BY score DESC) AS dense_rank +FROM scores +ORDER BY score DESC, student, subject; ``` -**Calculating the total salary per department using DENSE_RANK** +Result: +``` +student | subject | score | dense_rank +--------+---------+-------+----------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 2 +Charlie | Math | 88 | 3 +Alice | English | 87 | 4 +Bob | English | 85 | 5 +Bob | Math | 85 | 5 +Charlie | English | 85 | 5 +Charlie | Science | 85 | 5 +Bob | Science | 80 | 6 +``` + 
+**Dense rank scores within each student:** ```sql -SELECT - department, - SUM(salary) AS total_salary, - DENSE_RANK() OVER (ORDER BY SUM(salary) DESC) AS dense_rank -FROM - employees -GROUP BY - department; +SELECT student, subject, score, + DENSE_RANK() OVER (PARTITION BY student ORDER BY score DESC) AS subject_dense_rank +FROM scores +ORDER BY student, score DESC, subject; ``` Result: - -| department | total_salary | dense_rank | -|------------|--------------|------------| -| IT | 172000 | 1 | -| HR | 160000 | 2 | -| Sales | 77000 | 3 | +``` +student | subject | score | subject_dense_rank +--------+---------+-------+------------------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 2 +Alice | English | 87 | 3 +Bob | English | 85 | 1 +Bob | Math | 85 | 1 +Bob | Science | 80 | 2 +Charlie | Math | 88 | 1 +Charlie | English | 85 | 2 +Charlie | Science | 85 | 2 +``` diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/first-value.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/first-value.md index 2160670690..543c43099f 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/first-value.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/first-value.md @@ -1,96 +1,74 @@ ---- -title: FIRST_VALUE ---- - -import FunctionDescription from '@site/src/components/FunctionDescription'; - - - -Returns the first value in the window frame. - -See also: - -- [LAST_VALUE](last-value.md) -- [NTH_VALUE](nth-value.md) - -## Syntax - -```sql -FIRST_VALUE (expression) [ { IGNORE | RESPECT } NULLS ] OVER ([PARTITION BY partition_expression] ORDER BY order_expression [window_frame]) -``` - -- `[ { IGNORE | RESPECT } NULLS ]`: Controls how NULL values are handled within the window function. - - By default, `RESPECT NULLS` is used, meaning NULL values are included in the calculation and affect the result. 
- - When set to `IGNORE NULLS`, NULL values are excluded from consideration, and the function operates only on non-NULL values. - - If all values in the window frame are NULL, the function returns NULL even when `IGNORE NULLS` is specified. - -- For the syntax of window frame, see [Window Frame Syntax](index.md#window-frame-syntax). - -## Examples - -```sql -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR(50), - last_name VARCHAR(50), - salary DECIMAL(10,2) -); - -INSERT INTO employees (employee_id, first_name, last_name, salary) -VALUES - (1, 'John', 'Doe', 5000.00), - (2, 'Jane', 'Smith', 6000.00), - (3, 'David', 'Johnson', 5500.00), - (4, 'Mary', 'Williams', 7000.00), - (5, 'Michael', 'Brown', 4500.00); - --- Use FIRST_VALUE to retrieve the first name of the employee with the highest salary -SELECT employee_id, first_name, last_name, salary, - FIRST_VALUE(first_name) OVER (ORDER BY salary DESC) AS highest_salary_first_name -FROM employees; - - -employee_id | first_name | last_name | salary | highest_salary_first_name -------------+------------+-----------+---------+-------------------------- -4 | Mary | Williams | 7000.00 | Mary -2 | Jane | Smith | 6000.00 | Mary -3 | David | Johnson | 5500.00 | Mary -1 | John | Doe | 5000.00 | Mary -5 | Michael | Brown | 4500.00 | Mary - -``` - -This example excludes the NULL values from the window frame with the `IGNORE NULLS` option: - -```sql -CREATE or replace TABLE example AS SELECT * FROM (VALUES - (0, 1, 614), - (1, 1, null), - (2, 1, null), - (3, 1, 639), - (4, 1, 2027) -) tbl(id, user_id, order_id); - - -SELECT - id, - user_id, - order_id, - FIRST_VALUE (order_id) IGNORE nulls over ( - PARTITION BY user_id - ORDER BY - id ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING - ) AS last_order_id -FROM - example - -┌───────────────────────────────────────────────────────┐ -│ id │ user_id │ order_id │ last_order_id │ -├───────┼─────────┼──────────────────┼──────────────────┤ -│ 0 │ 1 │ 614 │ 614 │ -│ 1 │ 1 │ 
NULL │ 614 │ -│ 2 │ 1 │ NULL │ 639 │ -│ 3 │ 1 │ 639 │ 639 │ -│ 4 │ 1 │ 2027 │ 639 │ -└───────────────────────────────────────────────────────┘ +--- +title: FIRST_VALUE +--- + +import FunctionDescription from '@site/src/components/FunctionDescription'; + + + +Returns the first value in the window frame. + +See also: + +- [LAST_VALUE](last-value.md) +- [NTH_VALUE](nth-value.md) + +## Syntax + +```sql +FIRST_VALUE(expression) [ { IGNORE | RESPECT } NULLS ] +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] + [ window_frame ] +) +``` + +**Arguments:** +- `expression`: Required. The column or expression to return the first value from +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the ordering within the window +- `window_frame`: Optional. Defines the window frame (default: RANGE UNBOUNDED PRECEDING) + +**Notes:** +- Returns the first value in the ordered window frame +- Supports `IGNORE NULLS` and `RESPECT NULLS` options (default: `RESPECT NULLS`) +- Useful for finding the earliest value, or the highest/lowest value depending on the sort direction + +## Examples + +```sql +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + score INT +); + +INSERT INTO scores VALUES + ('Alice', 95), + ('Bob', 87), + ('Charlie', 82), + ('David', 78), + ('Eve', 92); +``` + +**Get the highest score (first value when ordered by score DESC):** + +```sql +SELECT student, score, + FIRST_VALUE(score) OVER (ORDER BY score DESC) AS highest_score, + FIRST_VALUE(student) OVER (ORDER BY score DESC) AS top_student +FROM scores +ORDER BY score DESC; +``` + +Result: +``` +student | score | highest_score | top_student +--------+-------+---------------+------------ +Alice | 95 | 95 | Alice +Eve | 92 | 95 | Alice +Bob | 87 | 95 | Alice +Charlie | 82 | 95 | Alice +David | 78 | 95 | Alice ``` \ No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/index.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/index.md index b40e499da4..6c161f1035 
100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/index.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/index.md @@ -4,13 +4,13 @@ title: 'Window Functions' ## Overview -A window function operates on a group ("window") of related rows. For each input row, a window function returns one output row that depends on the specific row passed to the function and the values of the other rows in the window. +Window functions perform calculations across a set of related rows while returning one result per input row. Unlike aggregate functions, window functions don't collapse rows into a single output. -There are two main types of order-sensitive window functions: - -* **Rank-related functions**: List information based on the "rank" of a row. For example, ranking stores in descending order by profit per year, the store with the most profit will be ranked 1, and the second-most profitable store will be ranked 2, and so on. - -* **Window frame functions**: Enable you to perform rolling operations, such as calculating a running total or a moving average, on a subset of the rows in the window. +**Key characteristics:** +- Operate on a "window" of rows related to the current row +- Return one value per input row (no grouping/collapsing) +- Can access values from other rows in the window +- Support partitioning and ordering for flexible calculations ## Window Function Categories @@ -18,23 +18,40 @@ Databend supports two main categories of window functions: ### 1. Dedicated Window Functions -These functions are specifically designed for window operations and provide ranking, navigation, and value analysis capabilities. 
- -| Function | Description | Example | -|----------|-------------|---------| -| [RANK](rank.md) | Returns rank with gaps | `RANK() OVER (ORDER BY salary DESC)` → `1, 2, 2, 4, ...` | -| [DENSE_RANK](dense-rank.md) | Returns rank without gaps | `DENSE_RANK() OVER (ORDER BY salary DESC)` → `1, 2, 2, 3, ...` | -| [ROW_NUMBER](row-number.md) | Returns sequential row number | `ROW_NUMBER() OVER (ORDER BY hire_date)` → `1, 2, 3, 4, ...` | -| [CUME_DIST](cume-dist.md) | Returns cumulative distribution | `CUME_DIST() OVER (ORDER BY score)` → `0.2, 0.4, 0.8, 1.0, ...` | -| [PERCENT_RANK](percent_rank.md) | Returns relative rank (0-1) | `PERCENT_RANK() OVER (ORDER BY score)` → `0.0, 0.25, 0.75, ...` | -| [NTILE](ntile.md) | Divides rows into N groups | `NTILE(4) OVER (ORDER BY score)` → `1, 1, 2, 2, 3, 3, 4, 4, ...` | -| [FIRST_VALUE](first-value.md) | Returns first value in window | `FIRST_VALUE(product) OVER (PARTITION BY category ORDER BY sales)` | -| [LAST_VALUE](last-value.md) | Returns last value in window | `LAST_VALUE(product) OVER (PARTITION BY category ORDER BY sales)` | -| [NTH_VALUE](nth-value.md) | Returns Nth value in window | `NTH_VALUE(product, 2) OVER (PARTITION BY category ORDER BY sales)` | -| [LEAD](lead.md) | Access value from subsequent row | `LEAD(price, 1) OVER (ORDER BY date)` → next day's price | -| [LAG](lag.md) | Access value from previous row | `LAG(price, 1) OVER (ORDER BY date)` → previous day's price | -| [FIRST](first.md) | Returns first value (alias) | `FIRST(product) OVER (PARTITION BY category ORDER BY sales)` | -| [LAST](last.md) | Returns last value (alias) | `LAST(product) OVER (PARTITION BY category ORDER BY sales)` | +These functions are specifically designed for window operations. 
+ +**Ranking Functions:** + +| Function | Description | Ties Handling | Example Output | +|----------|-------------|---------------|----------------| +| [ROW_NUMBER](row-number.md) | Sequential numbering | Always unique | `1, 2, 3, 4, 5` | +| [RANK](rank.md) | Ranking with gaps | Same rank, gaps after | `1, 2, 2, 4, 5` | +| [DENSE_RANK](dense-rank.md) | Ranking without gaps | Same rank, no gaps | `1, 2, 2, 3, 4` | + +**Distribution Functions:** + +| Function | Description | Range | Example Output | +|----------|-------------|-------|----------------| +| [PERCENT_RANK](percent_rank.md) | Relative rank as a fraction | 0.0 to 1.0 | `0.0, 0.25, 0.5, 0.75, 1.0` | +| [CUME_DIST](cume-dist.md) | Cumulative distribution | Above 0.0, up to 1.0 | `0.2, 0.4, 0.6, 0.8, 1.0` | +| [NTILE](ntile.md) | Divide into N buckets | 1 to N | `1, 1, 2, 2, 3, 3` | + +**Value Access Functions:** + +| Function | Description | Use Case | +|----------|-------------|----------| +| [FIRST_VALUE](first-value.md) | First value in window | Get highest/earliest value | +| [LAST_VALUE](last-value.md) | Last value in window | Get lowest/latest value | +| [NTH_VALUE](nth-value.md) | Nth value in window | Get the value at a specific position | +| [LAG](lag.md) | Previous row value | Compare with previous | +| [LEAD](lead.md) | Next row value | Compare with next | + +**Aliases:** + +| Function | Alias For | +|----------|----------| +| [FIRST](first.md) | FIRST_VALUE | +| [LAST](last.md) | LAST_VALUE | ### 2. 
Aggregate Functions Used as Window Functions @@ -62,175 +79,28 @@ These are standard aggregate functions that can be used with the OVER clause to | [MIN_IF](../07-aggregate-functions/aggregate-min-if.md) | Conditional minimum | ✓ | `MIN_IF(temp, location = 'outside') OVER (PARTITION BY day)` | | [MAX_IF](../07-aggregate-functions/aggregate-max-if.md) | Conditional maximum | ✓ | `MAX_IF(speed, vehicle = 'car') OVER (PARTITION BY test)` | -## Window Function Syntax +## Basic Syntax -```sql - ( [ ] ) OVER ( { named_window | inline_window } ) -``` - -Where: +All window functions follow this pattern: ```sql -named_window ::= window_name - -inline_window ::= - [ PARTITION BY ] - [ ORDER BY ] +FUNCTION() OVER ( + [ PARTITION BY column ] + [ ORDER BY column ] [ window_frame ] +) ``` -### Key Components - -| Component | Description | Example | -|-----------|-------------|--------| -| `` | The window function to apply | `SUM()`, `RANK()`, etc. | -| `OVER` | Indicates window function usage | Required for all window functions | -| `PARTITION BY` | Groups rows into partitions | `PARTITION BY department` | -| `ORDER BY` | Orders rows within each partition | `ORDER BY salary DESC` | -| `window_frame` | Defines subset of rows to consider | `ROWS BETWEEN 1 PRECEDING AND CURRENT ROW` | -| `named_window` | References a window defined in WINDOW clause | `SELECT sum(x) OVER w FROM t WINDOW w AS (PARTITION BY y)` | - - -## Window Frame Syntax - -A window frame defines which rows are included in the function calculation for each row. There are two types of window frames: - -### 1. Frame Types - -| Frame Type | Description | Example | -|------------|-------------|--------| -| `ROWS` | Physical row-based frame | `ROWS BETWEEN 3 PRECEDING AND CURRENT ROW` | -| `RANGE` | Logical value-based frame | `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW` | - -### 2. 
Frame Extent - -| Frame Extent Pattern | Description | Example | -|----------------------|-------------|--------| -| **Cumulative Frames** | | | -| `BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW` | All rows from start to current | Running total | -| `BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING` | Current row to end | Running total from current position | -| **Sliding Frames** | | | -| `BETWEEN N PRECEDING AND CURRENT ROW` | N rows before current + current | 3-day moving average | -| `BETWEEN CURRENT ROW AND N FOLLOWING` | Current + N rows after | Forward-looking calculation | -| `BETWEEN N PRECEDING AND N FOLLOWING` | N rows before + current + N rows after | Centered moving average | -| `BETWEEN UNBOUNDED PRECEDING AND N FOLLOWING` | All rows from start to N after current | Extended cumulative calculation | -| `BETWEEN N PRECEDING AND UNBOUNDED FOLLOWING` | N rows before current to end | Extended backward calculation | - - -## Window Function Examples - -The following examples demonstrate common window function use cases using an employee dataset. 
- -### Sample Data Setup - -```sql --- Create employees table -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR, - last_name VARCHAR, - department VARCHAR, - salary INT -); - --- Insert sample data -INSERT INTO employees VALUES - (1, 'John', 'Doe', 'IT', 75000), - (2, 'Jane', 'Smith', 'HR', 85000), - (3, 'Mike', 'Johnson', 'IT', 90000), - (4, 'Sara', 'Williams', 'Sales', 60000), - (5, 'Tom', 'Brown', 'HR', 82000), - (6, 'Ava', 'Davis', 'Sales', 62000), - (7, 'Olivia', 'Taylor', 'IT', 72000), - (8, 'Emily', 'Anderson', 'HR', 77000), - (9, 'Sophia', 'Lee', 'Sales', 58000), - (10, 'Ella', 'Thomas', 'IT', 67000); -``` - -### Example 1: Ranking Functions - -Ranking employees by salary in descending order: - -```sql -SELECT - employee_id, - first_name, - last_name, - department, - salary, - RANK() OVER (ORDER BY salary DESC) AS rank, - DENSE_RANK() OVER (ORDER BY salary DESC) AS dense_rank, - ROW_NUMBER() OVER (ORDER BY salary DESC) AS row_num -FROM employees -ORDER BY salary DESC; -``` - -**Result:** +- **PARTITION BY**: Divides data into groups +- **ORDER BY**: Sorts rows within each partition +- **window_frame**: Defines which rows to include (optional) -| employee_id | first_name | last_name | department | salary | rank | dense_rank | row_num | -|-------------|------------|-----------|------------|--------|------|------------|--------| -| 3 | Mike | Johnson | IT | 90000 | 1 | 1 | 1 | -| 2 | Jane | Smith | HR | 85000 | 2 | 2 | 2 | -| 5 | Tom | Brown | HR | 82000 | 3 | 3 | 3 | -| 8 | Emily | Anderson | HR | 77000 | 4 | 4 | 4 | -| 1 | John | Doe | IT | 75000 | 5 | 5 | 5 | -### Example 2: Partitioning - -Calculating statistics per department: - -```sql -SELECT DISTINCT - department, - COUNT(*) OVER (PARTITION BY department) AS employee_count, - SUM(salary) OVER (PARTITION BY department) AS total_salary, - AVG(salary) OVER (PARTITION BY department) AS avg_salary, - MIN(salary) OVER (PARTITION BY department) AS min_salary, - MAX(salary) OVER (PARTITION BY 
department) AS max_salary -FROM employees -ORDER BY department; -``` - -**Result:** - -| department | employee_count | total_salary | avg_salary | min_salary | max_salary | -|------------|----------------|-------------|------------|------------|------------| -| HR | 3 | 244000 | 81333.33 | 77000 | 85000 | -| IT | 4 | 304000 | 76000.00 | 67000 | 90000 | -| Sales | 3 | 180000 | 60000.00 | 58000 | 62000 | - -### Example 3: Running Totals and Moving Averages - -Calculating running totals and moving averages within departments: - -```sql -SELECT - employee_id, - first_name, - department, - salary, - -- Running total (cumulative sum) - SUM(salary) OVER ( - PARTITION BY department - ORDER BY employee_id - ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW - ) AS running_total, - -- Moving average of current and previous row - AVG(salary) OVER ( - PARTITION BY department - ORDER BY employee_id - ROWS BETWEEN 1 PRECEDING AND CURRENT ROW - ) AS moving_avg -FROM employees -ORDER BY department, employee_id; -``` +## Common Use Cases -**Result:** +- **Ranking**: Create leaderboards and top-N lists +- **Analytics**: Calculate running totals, moving averages, percentiles +- **Comparison**: Compare current vs previous/next values +- **Grouping**: Divide data into buckets without losing detail -| employee_id | first_name | department | salary | running_total | moving_avg | -|-------------|------------|------------|--------|---------------|------------| -| 2 | Jane | HR | 85000 | 85000 | 85000.00 | -| 5 | Tom | HR | 82000 | 167000 | 83500.00 | -| 8 | Emily | HR | 77000 | 244000 | 79500.00 | -| 1 | John | IT | 75000 | 75000 | 75000.00 | -| 3 | Mike | IT | 90000 | 165000 | 82500.00 | +For detailed syntax and examples, see individual function documentation above. 
diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/lag.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/lag.md index 081fd60677..fcbda9b553 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/lag.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/lag.md @@ -6,63 +6,90 @@ import FunctionDescription from '@site/src/components/FunctionDescription'; -LAG allows you to access the value of a column from a preceding row within the same result set. It is typically used to retrieve the value of a column in the previous row, based on a specified ordering. +Returns the value from a previous row in the result set. See also: [LEAD](lead.md) ## Syntax ```sql -LAG(expression [, offset [, default]]) OVER (PARTITION BY partition_expression ORDER BY sort_expression) +LAG( + expression + [, offset ] + [, default ] +) +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression +) ``` -- *offset*: Specifies the number of rows ahead (LEAD) or behind (LAG) the current row within the partition to retrieve the value from. Defaults to 1. -> Note that setting a negative offset has the same effect as using the [LEAD](lead.md) function. +**Arguments:** +- `expression`: The column or expression to evaluate +- `offset`: Number of rows before the current row (default: 1) +- `default`: Value to return when no previous row exists (default: NULL) -- *default*: Specifies a value to be returned if the LEAD or LAG function encounters a situation where there is no value available due to the offset exceeding the partition's boundaries. Defaults to NULL. 
+**Notes:** +- Negative offset values work like the LEAD function +- Returns `default` (NULL if not specified) when the offset goes beyond the partition boundaries ## Examples ```sql -CREATE TABLE sales ( - sale_id INT, - product_name VARCHAR(50), - sale_amount DECIMAL(10, 2) +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + test_date DATE, + score INT ); -INSERT INTO sales (sale_id, product_name, sale_amount) -VALUES (1, 'Product A', 1000.00), - (2, 'Product A', 1500.00), - (3, 'Product A', 2000.00), - (4, 'Product B', 500.00), - (5, 'Product B', 800.00), - (6, 'Product B', 1200.00); - -SELECT product_name, sale_amount, LAG(sale_amount) OVER (PARTITION BY product_name ORDER BY sale_id) AS previous_sale_amount -FROM sales; - -product_name | sale_amount | previous_sale_amount ------------------------------------------------ -Product A | 1000.00 | NULL -Product A | 1500.00 | 1000.00 -Product A | 2000.00 | 1500.00 -Product B | 500.00 | NULL -Product B | 800.00 | 500.00 -Product B | 1200.00 | 800.00 - --- The following statements return the same result. 
-SELECT product_name, sale_amount, LAG(sale_amount, -1) OVER (PARTITION BY product_name ORDER BY sale_id) AS next_sale_amount -FROM sales; - -SELECT product_name, sale_amount, LEAD(sale_amount) OVER (PARTITION BY product_name ORDER BY sale_id) AS next_sale_amount -FROM sales; - -product_name|sale_amount|next_sale_amount| -------------+-----------+----------------+ -Product A | 1000.00| 1500.00| -Product A | 1500.00| 2000.00| -Product A | 2000.00| | -Product B | 500.00| 800.00| -Product B | 800.00| 1200.00| -Product B | 1200.00| | +INSERT INTO scores VALUES + ('Alice', '2024-01-01', 85), + ('Alice', '2024-02-01', 90), + ('Alice', '2024-03-01', 88), + ('Bob', '2024-01-01', 78), + ('Bob', '2024-02-01', 82), + ('Bob', '2024-03-01', 85); +``` + +**Get previous test score for each student:** + +```sql +SELECT student, test_date, score, + LAG(score) OVER (PARTITION BY student ORDER BY test_date) AS previous_score +FROM scores +ORDER BY student, test_date; +``` + +Result: +``` +student | test_date | score | previous_score +--------+------------+-------+--------------- +Alice | 2024-01-01 | 85 | NULL +Alice | 2024-02-01 | 90 | 85 +Alice | 2024-03-01 | 88 | 90 +Bob | 2024-01-01 | 78 | NULL +Bob | 2024-02-01 | 82 | 78 +Bob | 2024-03-01 | 85 | 82 +``` + +**Get score from 2 tests ago:** + +```sql +SELECT student, test_date, score, + LAG(score, 2, 0) OVER (PARTITION BY student ORDER BY test_date) AS score_2_tests_ago +FROM scores +ORDER BY student, test_date; +``` + +Result: +``` +student | test_date | score | score_2_tests_ago +--------+------------+-------+------------------ +Alice | 2024-01-01 | 85 | 0 +Alice | 2024-02-01 | 90 | 0 +Alice | 2024-03-01 | 88 | 85 +Bob | 2024-01-01 | 78 | 0 +Bob | 2024-02-01 | 82 | 0 +Bob | 2024-03-01 | 85 | 78 ``` \ No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/last-value.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/last-value.md index 250442ef59..00045e39a5 100644 --- 
a/docs/en/sql-reference/20-sql-functions/08-window-functions/last-value.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/last-value.md @@ -1,96 +1,81 @@ ---- -title: LAST_VALUE ---- - -import FunctionDescription from '@site/src/components/FunctionDescription'; - - - -Returns the last value in the window frame. - -See also: - -- [FIRST_VALUE](first-value.md) -- [NTH_VALUE](nth-value.md) - -## Syntax - -```sql -LAST_VALUE (expression) [ { IGNORE | RESPECT } NULLS ] OVER ([PARTITION BY partition_expression] ORDER BY order_expression [window_frame]) -``` - -- `[ { IGNORE | RESPECT } NULLS ]`: Controls how NULL values are handled within the window function. - - By default, `RESPECT NULLS` is used, meaning NULL values are included in the calculation and affect the result. - - When set to `IGNORE NULLS`, NULL values are excluded from consideration, and the function operates only on non-NULL values. - - If all values in the window frame are NULL, the function returns NULL even when `IGNORE NULLS` is specified. - -- For the syntax of window frame, see [Window Frame Syntax](index.md#window-frame-syntax). 
- -## Examples - -```sql -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR(50), - last_name VARCHAR(50), - salary DECIMAL(10,2) -); - -INSERT INTO employees (employee_id, first_name, last_name, salary) -VALUES - (1, 'John', 'Doe', 5000.00), - (2, 'Jane', 'Smith', 6000.00), - (3, 'David', 'Johnson', 5500.00), - (4, 'Mary', 'Williams', 7000.00), - (5, 'Michael', 'Brown', 4500.00); - --- Use LAST_VALUE to retrieve the first name of the employee with the lowest salary -SELECT employee_id, first_name, last_name, salary, - LAST_VALUE(first_name) OVER (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS lowest_salary_first_name -FROM employees; - -┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ employee_id │ first_name │ last_name │ salary │ lowest_salary_first_name │ -├─────────────────┼──────────────────┼──────────────────┼──────────────────────────┼──────────────────────────┤ -│ 4 │ Mary │ Williams │ 7000.00 │ Michael │ -│ 2 │ Jane │ Smith │ 6000.00 │ Michael │ -│ 3 │ David │ Johnson │ 5500.00 │ Michael │ -│ 1 │ John │ Doe │ 5000.00 │ Michael │ -│ 5 │ Michael │ Brown │ 4500.00 │ Michael │ -└─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -This example excludes the NULL values from the window frame with the `IGNORE NULLS` option: - -```sql -CREATE or replace TABLE example AS SELECT * FROM (VALUES - (0, 1, 614), - (1, 1, null), - (2, 1, null), - (3, 1, 639), - (4, 1, 2027) -) tbl(id, user_id, order_id); - - -SELECT - id, - user_id, - order_id, - LAST_VALUE (order_id) IGNORE NULLS over ( - PARTITION BY user_id - ORDER BY - id ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING - ) AS last_order_id -FROM - example - -┌───────────────────────────────────────────────────────┐ -│ id │ user_id │ order_id │ last_order_id │ -├───────┼─────────┼──────────────────┼──────────────────┤ -│ 0 │ 1 │ 614 │ 
NULL │ -│ 1 │ 1 │ NULL │ 614 │ -│ 2 │ 1 │ NULL │ 614 │ -│ 3 │ 1 │ 639 │ 614 │ -│ 4 │ 1 │ 2027 │ 639 │ -└───────────────────────────────────────────────────────┘ +--- +title: LAST_VALUE +--- + +import FunctionDescription from '@site/src/components/FunctionDescription'; + + + +Returns the last value in the window frame. + +See also: + +- [FIRST_VALUE](first-value.md) +- [NTH_VALUE](nth-value.md) + +## Syntax + +```sql +LAST_VALUE(expression) [ { IGNORE | RESPECT } NULLS ] +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] + [ window_frame ] +) +``` + +**Arguments:** +- `expression`: Required. The column or expression to return the last value from +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the ordering within the window +- `window_frame`: Optional. Defines the window frame (default: RANGE UNBOUNDED PRECEDING) + +**Notes:** +- Returns the last value in the ordered window frame +- Supports `IGNORE NULLS` and `RESPECT NULLS` options (default: `RESPECT NULLS`) +- Often requires explicit window frame to get expected results +- Useful for finding the latest/highest value in each group + +## Examples + +```sql +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + score INT +); + +INSERT INTO scores VALUES + ('Alice', 95), + ('Bob', 87), + ('Charlie', 82), + ('David', 78), + ('Eve', 92); +``` + +**Get the lowest score (last value when ordered by score DESC):** + +```sql +SELECT student, score, + LAST_VALUE(score) OVER ( + ORDER BY score DESC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS lowest_score, + LAST_VALUE(student) OVER ( + ORDER BY score DESC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS lowest_student +FROM scores +ORDER BY score DESC; +``` + +Result: +``` +student | score | lowest_score | lowest_student +--------+-------+--------------+--------------- +Alice | 95 | 78 | David +Eve | 92 | 78 | David +Bob | 87 | 78 | David +Charlie | 82 | 78 | David +David | 78 | 78 | David ``` \
No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/lead.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/lead.md index 4de7d25fb0..08b0014caf 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/lead.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/lead.md @@ -6,63 +6,90 @@ import FunctionDescription from '@site/src/components/FunctionDescription'; -LEAD allows you to access the value of a column from a subsequent row within the same result set. It is typically used to retrieve the value of a column in the next row, based on a specified ordering. +Returns the value from a subsequent row in the result set. See also: [LAG](lag.md) ## Syntax ```sql -LEAD(expression [, offset [, default]]) OVER (PARTITION BY partition_expression ORDER BY sort_expression) +LEAD( + expression + [, offset ] + [, default ] +) +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression +) ``` -- *offset*: Specifies the number of rows ahead (LEAD) or behind (LAG) the current row within the partition to retrieve the value from. Defaults to 1. -> Note that setting a negative offset has the same effect as using the [LAG](lag.md) function. +**Arguments:** +- `expression`: The column or expression to evaluate +- `offset`: Number of rows after the current row (default: 1) +- `default`: Value to return when no next row exists (default: NULL) -- *default*: Specifies a value to be returned if the LEAD or LAG function encounters a situation where there is no value available due to the offset exceeding the partition's boundaries. Defaults to NULL. 
+**Notes:** +- Negative offset values work like the LAG function +- Returns `default` (NULL when not specified) if the offset goes beyond partition boundaries ## Examples ```sql -CREATE TABLE sales ( - sale_id INT, - product_name VARCHAR(50), - sale_amount DECIMAL(10, 2) +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + test_date DATE, + score INT ); -INSERT INTO sales (sale_id, product_name, sale_amount) -VALUES (1, 'Product A', 1000.00), - (2, 'Product A', 1500.00), - (3, 'Product A', 2000.00), - (4, 'Product B', 500.00), - (5, 'Product B', 800.00), - (6, 'Product B', 1200.00); - -SELECT product_name, sale_amount, LEAD(sale_amount) OVER (PARTITION BY product_name ORDER BY sale_id) AS next_sale_amount -FROM sales; - -product_name | sale_amount | next_sale_amount ----------------------------------------------- -Product A | 1000.00 | 1500.00 -Product A | 1500.00 | 2000.00 -Product A | 2000.00 | NULL -Product B | 500.00 | 800.00 -Product B | 800.00 | 1200.00 -Product B | 1200.00 | NULL - --- The following statements return the same result.
-SELECT product_name, sale_amount, LEAD(sale_amount, -1) OVER (PARTITION BY product_name ORDER BY sale_id) AS previous_sale_amount -FROM sales; - -SELECT product_name, sale_amount, LAG(sale_amount) OVER (PARTITION BY product_name ORDER BY sale_id) AS previous_sale_amount -FROM sales; - -product_name|sale_amount|previous_sale_amount| -------------+-----------+--------------------+ -Product A | 1000.00| | -Product A | 1500.00| 1000.00| -Product A | 2000.00| 1500.00| -Product B | 500.00| | -Product B | 800.00| 500.00| -Product B | 1200.00| 800.00| +INSERT INTO scores VALUES + ('Alice', '2024-01-01', 85), + ('Alice', '2024-02-01', 90), + ('Alice', '2024-03-01', 88), + ('Bob', '2024-01-01', 78), + ('Bob', '2024-02-01', 82), + ('Bob', '2024-03-01', 85); +``` + +**Get next test score for each student:** + +```sql +SELECT student, test_date, score, + LEAD(score) OVER (PARTITION BY student ORDER BY test_date) AS next_score +FROM scores +ORDER BY student, test_date; +``` + +Result: +``` +student | test_date | score | next_score +--------+------------+-------+----------- +Alice | 2024-01-01 | 85 | 90 +Alice | 2024-02-01 | 90 | 88 +Alice | 2024-03-01 | 88 | NULL +Bob | 2024-01-01 | 78 | 82 +Bob | 2024-02-01 | 82 | 85 +Bob | 2024-03-01 | 85 | NULL +``` + +**Get score from 2 tests later:** + +```sql +SELECT student, test_date, score, + LEAD(score, 2, 0) OVER (PARTITION BY student ORDER BY test_date) AS score_2_tests_later +FROM scores +ORDER BY student, test_date; +``` + +Result: +``` +student | test_date | score | score_2_tests_later +--------+------------+-------+-------------------- +Alice | 2024-01-01 | 85 | 88 +Alice | 2024-02-01 | 90 | 0 +Alice | 2024-03-01 | 88 | 0 +Bob | 2024-01-01 | 78 | 85 +Bob | 2024-02-01 | 82 | 0 +Bob | 2024-03-01 | 85 | 0 ``` \ No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/nth-value.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/nth-value.md index 970b02170d..68af57af6c 100644 --- 
a/docs/en/sql-reference/20-sql-functions/08-window-functions/nth-value.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/nth-value.md @@ -1,96 +1,95 @@ ---- -title: NTH_VALUE ---- - -import FunctionDescription from '@site/src/components/FunctionDescription'; - - - -Returns the value at the `N`-th position within the window frame, where `N` is a specified integer that determines the exact position of the value. - -See also: - -- [FIRST_VALUE](first-value.md) -- [LAST_VALUE](last-value.md) - -## Syntax - -```sql -NTH_VALUE (expression, n) [ { IGNORE | RESPECT } NULLS ] OVER ([PARTITION BY partition_expression] ORDER BY order_expression [window_frame]) -``` - -- `[ { IGNORE | RESPECT } NULLS ]`: Controls how NULL values are handled within the window function. - - By default, `RESPECT NULLS` is used, meaning NULL values are included in the calculation and affect the result. - - When set to `IGNORE NULLS`, NULL values are excluded from consideration, and the function operates only on non-NULL values. - - If all values in the window frame are NULL, the function returns NULL even when `IGNORE NULLS` is specified. - -- For the syntax of window frame, see [Window Frame Syntax](index.md#window-frame-syntax). 
- -## Examples - -```sql -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR(50), - last_name VARCHAR(50), - salary DECIMAL(10,2) -); - -INSERT INTO employees (employee_id, first_name, last_name, salary) -VALUES - (1, 'John', 'Doe', 5000.00), - (2, 'Jane', 'Smith', 6000.00), - (3, 'David', 'Johnson', 5500.00), - (4, 'Mary', 'Williams', 7000.00), - (5, 'Michael', 'Brown', 4500.00); - --- Use NTH_VALUE to retrieve the first name of the employee with the second highest salary -SELECT employee_id, first_name, last_name, salary, - NTH_VALUE(first_name, 2) OVER (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS second_highest_salary_first_name -FROM employees; - -┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ employee_id │ first_name │ last_name │ salary │ second_highest_salary_first_name │ -├─────────────────┼──────────────────┼──────────────────┼──────────────────────────┼──────────────────────────────────┤ -│ 4 │ Mary │ Williams │ 7000.00 │ Jane │ -│ 2 │ Jane │ Smith │ 6000.00 │ Jane │ -│ 3 │ David │ Johnson │ 5500.00 │ Jane │ -│ 1 │ John │ Doe │ 5000.00 │ Jane │ -│ 5 │ Michael │ Brown │ 4500.00 │ Jane │ -└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -This example excludes the NULL values from the window frame with the `IGNORE NULLS` option: - -```sql -CREATE or replace TABLE example AS SELECT * FROM (VALUES - (0, 1, 614), - (1, 1, null), - (2, 1, null), - (3, 1, 639), - (4, 1, 2027) -) tbl(id, user_id, order_id); - - -SELECT - id, - user_id, - order_id, - NTH_VALUE (order_id, 2) IGNORE NULLS over ( - PARTITION BY user_id - ORDER BY - id ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING - ) AS last_order_id -FROM - example - -┌───────────────────────────────────────────────────────┐ -│ id │ user_id │ order_id │ last_order_id │ 
-├───────┼─────────┼──────────────────┼──────────────────┤ -│ 0 │ 1 │ 614 │ NULL │ -│ 1 │ 1 │ NULL │ NULL │ -│ 2 │ 1 │ NULL │ NULL │ -│ 3 │ 1 │ 639 │ NULL │ -│ 4 │ 1 │ 2027 │ 639 │ -└───────────────────────────────────────────────────────┘ +--- +title: NTH_VALUE +--- + +import FunctionDescription from '@site/src/components/FunctionDescription'; + + + +Returns the value at the specified position (N) within the window frame. + +See also: + +- [FIRST_VALUE](first-value.md) +- [LAST_VALUE](last-value.md) + +## Syntax + +```sql +NTH_VALUE( + expression, + n +) +[ { IGNORE | RESPECT } NULLS ] +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY order_expression + [ window_frame ] +) +``` + +**Arguments:** +- `expression`: The column or expression to evaluate +- `n`: Position number (1-based index) of the value to return +- `IGNORE NULLS`: Optional. When specified, NULL values are skipped when counting positions +- `RESPECT NULLS`: Default behavior. NULL values are included when counting positions + +**Notes:** +- Position counting starts from 1 (not 0) +- Returns NULL if the specified position doesn't exist in the window frame +- For window frame syntax, see [Window Frame Syntax](index.md#window-frame-syntax) + +## Examples + +```sql +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + score INT +); + +INSERT INTO scores VALUES + ('Alice', 85), + ('Bob', 90), + ('Charlie', 78), + ('David', 92), + ('Eve', 88); +``` + +**Get the 2nd highest score student:** + +```sql +SELECT student, score, + NTH_VALUE(student, 2) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS second_highest_student +FROM scores; +``` + +Result: +``` +student | score | second_highest_student +---------+-------+----------------------- +David | 92 | Bob +Bob | 90 | Bob +Eve | 88 | Bob +Alice | 85 | Bob +Charlie | 78 | Bob +``` + +**Get the 3rd highest score student:** + +```sql +SELECT student, score, + NTH_VALUE(student, 3) OVER (ORDER BY score 
DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS third_highest_student +FROM scores; +``` + +Result: +``` +student | score | third_highest_student +---------+-------+---------------------- +David | 92 | Eve +Bob | 90 | Eve +Eve | 88 | Eve +Alice | 85 | Eve +Charlie | 78 | Eve ``` \ No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/ntile.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/ntile.md index 24ac92fcc9..6e2944184f 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/ntile.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/ntile.md @@ -1,65 +1,99 @@ ---- -title: NTILE ---- -import FunctionDescription from '@site/src/components/FunctionDescription'; - - - -Divides the sorted result set into a specified number of buckets or groups. It evenly distributes the sorted rows into these buckets and assigns a bucket number to each row. The NTILE function is typically used with the ORDER BY clause to sort the results. - -Please note that the NTILE function evenly distributes the rows into buckets based on the sorting order of the rows and ensures that the number of rows in each bucket is as equal as possible. If the number of rows cannot be evenly distributed into the buckets, some buckets may have one extra row compared to the others. - -## Syntax - -```sql -NTILE(n) OVER ( - PARTITION BY expr, ... - ORDER BY expr [ASC | DESC], ... -) -``` - -## Examples - -This example retrieves the students' names, scores, grades, and assigns them to buckets based on their scores within each grade using the NTILE() window function. 
- -```sql -CREATE TABLE students ( - name VARCHAR(20), - score INT NOT NULL, - grade CHAR(1) NOT NULL -); - -INSERT INTO students (name, score, grade) -VALUES - ('Smith', 81, 'A'), - ('Jones', 55, 'C'), - ('Williams', 55, 'C'), - ('Taylor', 62, 'B'), - ('Brown', 62, 'B'), - ('Davies', 84, 'A'), - ('Evans', 87, 'A'), - ('Wilson', 72, 'B'), - ('Thomas', 72, 'B'), - ('Johnson', 100, 'A'); - -SELECT - name, - score, - grade, - ntile(3) OVER (PARTITION BY grade ORDER BY score DESC) AS bucket -FROM - students; - -name |score|grade|bucket| ---------+-----+-----+------+ -Johnson | 100|A | 1| -Evans | 87|A | 1| -Davies | 84|A | 2| -Smith | 81|A | 3| -Wilson | 72|B | 1| -Thomas | 72|B | 1| -Taylor | 62|B | 2| -Brown | 62|B | 3| -Jones | 55|C | 1| -Williams| 55|C | 2| -``` \ No newline at end of file +--- +title: NTILE +--- +import FunctionDescription from '@site/src/components/FunctionDescription'; + + + +Divides rows into a specified number of buckets and assigns a bucket number to each row. Rows are distributed as evenly as possible across buckets. + +## Syntax + +```sql +NTILE(bucket_count) +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] +) +``` + +**Arguments:** +- `bucket_count`: Required. Number of buckets to create (must be positive integer) +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the distribution order +- `ASC | DESC`: Optional. 
Sort direction (default: ASC) + +**Notes:** +- Bucket numbers range from 1 to `bucket_count` +- Rows are distributed as evenly as possible +- If rows don't divide evenly, earlier buckets get one extra row +- Useful for creating percentiles and equal-sized groups + +## Examples + +```sql +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + subject VARCHAR(20), + score INT +); + +INSERT INTO scores VALUES + ('Alice', 'Math', 95), + ('Alice', 'English', 87), + ('Alice', 'Science', 92), + ('Bob', 'Math', 85), + ('Bob', 'English', 85), + ('Bob', 'Science', 80), + ('Charlie', 'Math', 88), + ('Charlie', 'English', 85), + ('Charlie', 'Science', 85); +``` + +**Divide all scores into 3 buckets (tertiles):** + +```sql +SELECT student, subject, score, + NTILE(3) OVER (ORDER BY score DESC) AS score_bucket +FROM scores +ORDER BY score DESC, student, subject; +``` + +Result: +``` +student | subject | score | score_bucket +--------+---------+-------+------------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 1 +Charlie | Math | 88 | 1 +Alice | English | 87 | 2 +Bob | English | 85 | 2 +Bob | Math | 85 | 2 +Charlie | English | 85 | 3 +Charlie | Science | 85 | 3 +Bob | Science | 80 | 3 +``` + +**Divide scores into two halves within each student:** + +```sql +SELECT student, subject, score, + NTILE(2) OVER (PARTITION BY student ORDER BY score DESC) AS performance_half +FROM scores +ORDER BY student, score DESC, subject; +``` + +Result: +``` +student | subject | score | performance_half +--------+---------+-------+----------------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 1 +Alice | English | 87 | 2 +Bob | English | 85 | 1 +Bob | Math | 85 | 1 +Bob | Science | 80 | 2 +Charlie | Math | 88 | 1 +Charlie | English | 85 | 1 +Charlie | Science | 85 | 2 \ No newline at end of file diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/percent_rank.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/percent_rank.md index
bc922c7d54..31d6cf2766 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/percent_rank.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/percent_rank.md @@ -5,19 +5,32 @@ import FunctionDescription from '@site/src/components/FunctionDescription'; -Returns the relative rank of a given value within a set of values. The resulting value falls between 0 and 1, inclusive. Please note that the first row in any set has a PERCENT_RANK of 0. +Calculates the relative rank of each row as a percentage. Returns values between 0 and 1, where 0 represents the lowest rank and 1 represents the highest rank. See also: [CUME_DIST](cume-dist.md) ## Syntax ```sql -PERCENT_RANK() OVER ( - PARTITION BY expr, ... - ORDER BY expr [ASC | DESC], ... +PERCENT_RANK() +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] ) ``` +**Arguments:** +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the ranking order +- `ASC | DESC`: Optional. 
Sort direction (default: ASC) + +**Notes:** +- Returns values between 0 and 1 (inclusive) +- First row always has PERCENT_RANK of 0 +- Last row has PERCENT_RANK of 1 unless it ties with preceding rows +- Formula: (rank - 1) / (total_rows - 1) +- Multiply by 100 to get percentile values + ## Examples ```sql @@ -28,30 +41,30 @@ CREATE TABLE scores ( ); INSERT INTO scores VALUES - ('Alice', 85), - ('Bob', 92), - ('Carol', 78), - ('David', 95), - ('Eve', 88); - --- PERCENT_RANK example -SELECT - student, - score, - PERCENT_RANK() OVER (ORDER BY score) AS percent_rank, - ROUND(PERCENT_RANK() OVER (ORDER BY score) * 100, 1) AS percentile + ('Alice', 95), + ('Bob', 87), + ('Charlie', 87), + ('David', 82), + ('Eve', 78); +``` + +**Calculate percent rank (showing percentile position):** + +```sql +SELECT student, score, + PERCENT_RANK() OVER (ORDER BY score DESC) AS percent_rank, + ROUND(PERCENT_RANK() OVER (ORDER BY score DESC) * 100) AS percentile FROM scores -ORDER BY score; +ORDER BY score DESC, student; ``` Result: - ``` -student|score|percent_rank|percentile| --------+-----+------------+----------+ -Carol | 78| 0.0| 0.0| -Alice | 85| 0.25| 25.0| -Eve | 88| 0.5| 50.0| -Bob | 92| 0.75| 75.0| -David | 95| 1.0| 100.0| +student | score | percent_rank | percentile +--------+-------+--------------+----------- +Alice | 95 | 0.0 | 0 +Bob | 87 | 0.25 | 25 +Charlie | 87 | 0.25 | 25 +David | 82 | 0.75 | 75 +Eve | 78 | 1.0 | 100 ``` diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/rank.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/rank.md index f75b22b2bb..a277c18bbf 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/rank.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/rank.md @@ -2,62 +2,95 @@ title: RANK --- -The RANK() function assigns a unique rank to each value within an ordered group of values. - -The rank value starts at 1 and continues up sequentially. If two values are the same, they have the same rank.
+Assigns a rank to each row within a partition. Rows with equal values receive the same rank, with gaps in subsequent rankings. ## Syntax ```sql -RANK() OVER ( - [ PARTITION BY ] - ORDER BY [ { ASC | DESC } ] - [ ] +RANK() +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] ) ``` +**Arguments:** +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the ranking order +- `ASC | DESC`: Optional. Sort direction (default: ASC) + +**Notes:** +- Ranks start from 1 +- Equal values get the same rank +- Creates gaps in ranking sequence after ties +- Example: 1, 2, 2, 4, 5 (not 1, 2, 2, 3, 4) + ## Examples -**Create the table** ```sql -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR, - last_name VARCHAR, - department VARCHAR, - salary INT +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + subject VARCHAR(20), + score INT ); + +INSERT INTO scores VALUES + ('Alice', 'Math', 95), + ('Alice', 'English', 87), + ('Alice', 'Science', 92), + ('Bob', 'Math', 85), + ('Bob', 'English', 85), + ('Bob', 'Science', 80), + ('Charlie', 'Math', 88), + ('Charlie', 'English', 85), + ('Charlie', 'Science', 85); ``` -**Insert data** +**Rank all scores (showing tie handling with gaps):** + ```sql -INSERT INTO employees (employee_id, first_name, last_name, department, salary) VALUES - (1, 'John', 'Doe', 'IT', 90000), - (2, 'Jane', 'Smith', 'HR', 85000), - (3, 'Mike', 'Johnson', 'IT', 82000), - (4, 'Sara', 'Williams', 'Sales', 77000), - (5, 'Tom', 'Brown', 'HR', 75000); +SELECT student, subject, score, + RANK() OVER (ORDER BY score DESC) AS score_rank +FROM scores +ORDER BY score DESC, student, subject; ``` -**Ranking employees by salary** +Result: +``` +student | subject | score | score_rank +--------+---------+-------+----------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 2 +Charlie | Math | 88 | 3 +Alice | English | 87 | 4 +Bob | English | 85 | 5 +Bob | Math | 85 | 5 +Charlie | 
English | 85 | 5 +Charlie | Science | 85 | 5 +Bob | Science | 80 | 9 +``` + +**Rank scores within each student (showing ties within partitions):** + ```sql -SELECT - employee_id, - first_name, - last_name, - department, - salary, - RANK() OVER (ORDER BY salary DESC) AS rank -FROM - employees; +SELECT student, subject, score, + RANK() OVER (PARTITION BY student ORDER BY score DESC) AS subject_rank +FROM scores +ORDER BY student, score DESC, subject; ``` Result: - -| employee_id | first_name | last_name | department | salary | rank | -|-------------|------------|-----------|------------|--------|------| -| 1 | John | Doe | IT | 90000 | 1 | -| 2 | Jane | Smith | HR | 85000 | 2 | -| 3 | Mike | Johnson | IT | 82000 | 3 | -| 4 | Sara | Williams | Sales | 77000 | 4 | -| 5 | Tom | Brown | HR | 75000 | 5 | +``` +student | subject | score | subject_rank +--------+---------+-------+------------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 2 +Alice | English | 87 | 3 +Bob | English | 85 | 1 +Bob | Math | 85 | 1 +Bob | Science | 80 | 3 +Charlie | Math | 88 | 1 +Charlie | English | 85 | 2 +Charlie | Science | 85 | 2 +``` diff --git a/docs/en/sql-reference/20-sql-functions/08-window-functions/row-number.md b/docs/en/sql-reference/20-sql-functions/08-window-functions/row-number.md index cdf7f82558..dedb496180 100644 --- a/docs/en/sql-reference/20-sql-functions/08-window-functions/row-number.md +++ b/docs/en/sql-reference/20-sql-functions/08-window-functions/row-number.md @@ -1,62 +1,94 @@ ---- -title: ROW_NUMBER ---- - -Assigns a temporary sequential number to each row within a partition of a result set, starting at 1 for the first row in each partition. - -## Syntax - -```sql -ROW_NUMBER() - OVER ( [ PARTITION BY [, ... ] ] - ORDER BY [ , ... ] [ { ASC | DESC } ] ) -``` - -| Parameter | Required? 
| Description | -|--------------|-----------|------------------------------------------------------------------------------------------------------------| -| ORDER BY | Yes | Specifies the order of rows within each partition. | -| ASC / DESC | No | Specifies the sorting order within each partition. ASC (ascending) is the default. | -| QUALIFY | No | Filters rows based on conditions. | - -## Examples - -This example demonstrates the use of ROW_NUMBER() to assign sequential numbers to employees within their departments, ordered by descending salary. - -```sql --- Prepare the data -CREATE TABLE employees ( - employee_id INT, - first_name VARCHAR, - last_name VARCHAR, - department VARCHAR, - salary INT -); - -INSERT INTO employees (employee_id, first_name, last_name, department, salary) VALUES - (1, 'John', 'Doe', 'IT', 90000), - (2, 'Jane', 'Smith', 'HR', 85000), - (3, 'Mike', 'Johnson', 'IT', 82000), - (4, 'Sara', 'Williams', 'Sales', 77000), - (5, 'Tom', 'Brown', 'HR', 75000); - --- Select employee details along with the row number partitioned by department and ordered by salary in descending order. 
-SELECT - employee_id, - first_name, - last_name, - department, - salary, - ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num -FROM - employees; - -┌──────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ employee_id │ first_name │ last_name │ department │ salary │ row_num │ -├─────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────┼─────────┤ -│ 2 │ Jane │ Smith │ HR │ 85000 │ 1 │ -│ 5 │ Tom │ Brown │ HR │ 75000 │ 2 │ -│ 1 │ John │ Doe │ IT │ 90000 │ 1 │ -│ 3 │ Mike │ Johnson │ IT │ 82000 │ 2 │ -│ 4 │ Sara │ Williams │ Sales │ 77000 │ 1 │ -└──────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` \ No newline at end of file +--- +title: ROW_NUMBER +--- + +Assigns a sequential number to each row within a partition, starting from 1. + +## Syntax + +```sql +ROW_NUMBER() +OVER ( + [ PARTITION BY partition_expression ] + ORDER BY sort_expression [ ASC | DESC ] +) +``` + +**Arguments:** +- `PARTITION BY`: Optional. Divides rows into partitions +- `ORDER BY`: Required. Determines the row numbering order +- `ASC | DESC`: Optional. 
Sort direction (default: ASC) + +**Notes:** +- Returns sequential integers starting from 1 +- Each partition restarts numbering from 1 +- Commonly used for ranking and pagination + +## Examples + +```sql +-- Create sample data +CREATE TABLE scores ( + student VARCHAR(20), + subject VARCHAR(20), + score INT +); + +INSERT INTO scores VALUES + ('Alice', 'Math', 95), + ('Alice', 'English', 87), + ('Alice', 'Science', 92), + ('Bob', 'Math', 78), + ('Bob', 'English', 85), + ('Bob', 'Science', 80), + ('Charlie', 'Math', 88), + ('Charlie', 'English', 90), + ('Charlie', 'Science', 85); +``` + +**Number all rows sequentially (even with tied scores):** + +```sql +SELECT student, subject, score, + ROW_NUMBER() OVER (ORDER BY score DESC, student, subject) AS row_num +FROM scores +ORDER BY score DESC, student, subject; +``` + +Result: +``` +student | subject | score | row_num +--------+---------+-------+-------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 2 +Charlie | English | 90 | 3 +Charlie | Math | 88 | 4 +Alice | English | 87 | 5 +Bob | English | 85 | 6 +Charlie | Science | 85 | 7 +Bob | Science | 80 | 8 +Bob | Math | 78 | 9 +``` + +**Number rows within each student (for pagination/top-N):** + +```sql +SELECT student, subject, score, + ROW_NUMBER() OVER (PARTITION BY student ORDER BY score DESC) AS subject_rank +FROM scores +ORDER BY student, score DESC; +``` + +Result: +``` +student | subject | score | subject_rank +--------+---------+-------+------------- +Alice | Math | 95 | 1 +Alice | Science | 92 | 2 +Alice | English | 87 | 3 +Bob | English | 85 | 1 +Bob | Science | 80 | 2 +Bob | Math | 78 | 3 +Charlie | English | 90 | 1 +Charlie | Math | 88 | 2 +Charlie | Science | 85 | 3 \ No newline at end of file