# Извлечение, преобразование, загрузка

## Импортируем необходимые модули

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

## Настраиваем подключение к БД

In [2]:
# Connection settings are taken from environment variables so that real
# credentials never have to be committed together with the notebook.
# The fallbacks are the original local-development values, so the notebook
# behaves exactly as before when none of the variables are set.
user = os.environ.get("ENTRIES_DB_USER", "entries_user")
password = os.environ.get("ENTRIES_DB_PASSWORD", "entries_password")
host = os.environ.get("ENTRIES_DB_HOST", "localhost")
port = os.environ.get("ENTRIES_DB_PORT", "5432")
database = os.environ.get("ENTRIES_DB_NAME", "entries_db")

# User and password are URL-encoded because characters such as "@", ":" or "/"
# would otherwise be parsed as URL delimiters; for the default values the
# encoding is a no-op.
engine = create_engine(
    f"postgresql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{database}"
)

## Базовый запрос с PostgreSQL

Базовый запрос, описанный в ТЗ, с использованием диалекта PostgreSQL.

```sql
SELECT
    full_name,
    TO_CHAR(report_date, 'YYYY-MM') AS month,
    COUNT(*) AS workdays_count,
    COUNT(*) FILTER (WHERE enter_dt::time <= TIME '09:00:00') AS on_time_count,
    
    COUNT(*) FILTER (
        WHERE enter_dt::time > TIME '09:00:00'
          AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 15
    ) AS late_0_15,
    
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 15
          AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 30
    ) AS late_15_30,
    
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 30
          AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 60
    ) AS late_30_60,
    
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 60
    ) AS late_60_plus,
    
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 >= 9
    ) AS full_day_count,
    
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 < 9
    ) AS short_day_count,
    
    ROUND(AVG(EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600), 2) AS avg_worktime

FROM workdays_tgt
GROUP BY full_name, TO_CHAR(report_date, 'YYYY-MM')
ORDER BY full_name, TO_CHAR(report_date, 'YYYY-MM');
```

In [3]:
# Monthly attendance report per employee, PostgreSQL-specific variant: every
# bucket is a filtered aggregate, COUNT(*) FILTER (WHERE ...).
# Rows of workdays_tgt are grouped by full_name and by report_date formatted
# as 'YYYY-MM'. Columns produced:
#   workdays_count   - number of rows in the group
#   on_time_count    - time-of-day of enter_dt is 09:00:00 or earlier
#   late_0_15 / late_15_30 / late_30_60 / late_60_plus
#                    - minutes after 09:00:00, in the ranges (0, 15], (15, 30],
#                      (30, 60] and above 60
#   full_day_count / short_day_count
#                    - exit_dt - enter_dt is at least / less than 9 hours
#   avg_worktime     - mean of exit_dt - enter_dt in hours, rounded to 2 places
# NOTE(review): ROUND(<value>, 2) needs a numeric argument; on PostgreSQL
# versions before 14 EXTRACT yields double precision, so this may require a
# ::numeric cast there - confirm the target server version.
query_aggregated_info_postgres = """
SELECT
    full_name,
    TO_CHAR(report_date, 'YYYY-MM') AS month,
    COUNT(*) AS workdays_count,
    COUNT(*) FILTER (WHERE enter_dt::time <= TIME '09:00:00') AS on_time_count,
    COUNT(*) FILTER (
        WHERE enter_dt::time > TIME '09:00:00'
          AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 15
    ) AS late_0_15,
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 15
          AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 30
    ) AS late_15_30,
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 30
          AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 60
    ) AS late_30_60,
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 60
    ) AS late_60_plus,
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 >= 9
    ) AS full_day_count,
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 < 9
    ) AS short_day_count,
    ROUND(AVG(EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600), 2) AS avg_worktime
FROM workdays_tgt
GROUP BY full_name, TO_CHAR(report_date, 'YYYY-MM')
ORDER BY full_name, TO_CHAR(report_date, 'YYYY-MM');
"""

## Базовый запрос без PostgreSQL

Точнее, это запрос без фильтруемых агрегатов (`FILTER`): специфичные для PostgreSQL конструкции — приведение `::time`, `TO_CHAR`, `EXTRACT(EPOCH FROM ...)` — всё ещё присутствуют в запросе, но у них есть аналоги в других СУБД (в худшем случае — арифметика интервалов).

```sql
SELECT
    full_name,
    TO_CHAR(report_date, 'YYYY-MM') AS month,
    COUNT(*) AS workdays_count,

    SUM(CASE WHEN enter_dt::time <= TIME '09:00:00' THEN 1 ELSE 0 END) AS on_time_count,

    SUM(CASE
        WHEN enter_dt::time > TIME '09:00:00'
         AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 15
        THEN 1 ELSE 0
    END) AS late_0_15,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 15
         AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 30
        THEN 1 ELSE 0
    END) AS late_15_30,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 30
         AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 60
        THEN 1 ELSE 0
    END) AS late_30_60,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 60
        THEN 1 ELSE 0
    END) AS late_60_plus,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 >= 9
        THEN 1 ELSE 0
    END) AS full_day_count,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 < 9
        THEN 1 ELSE 0
    END) AS short_day_count,

    ROUND(AVG(EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600), 2) AS avg_worktime

FROM workdays_tgt
GROUP BY full_name, TO_CHAR(report_date, 'YYYY-MM')
ORDER BY full_name, TO_CHAR(report_date, 'YYYY-MM');


```

In [4]:
# Same monthly per-employee report over workdays_tgt, written without filtered
# aggregates: each bucket is SUM(CASE WHEN <condition> THEN 1 ELSE 0 END).
# Grouping (full_name, report_date as 'YYYY-MM'), the 09:00:00 start time, the
# lateness ranges (0, 15], (15, 30], (30, 60], above 60 minutes, and the
# 9-hour full-day threshold are the same literals as in the FILTER variant.
# The ::time casts, TO_CHAR and EXTRACT(EPOCH FROM ...) are still used here.
# NOTE(review): ROUND(<value>, 2) needs a numeric argument; on PostgreSQL
# versions before 14 EXTRACT yields double precision, so this may require a
# ::numeric cast there - confirm the target server version.
query_aggregated_info_clean = """
SELECT
    full_name,
    TO_CHAR(report_date, 'YYYY-MM') AS month,
    COUNT(*) AS workdays_count,

    SUM(CASE WHEN enter_dt::time <= TIME '09:00:00' THEN 1 ELSE 0 END) AS on_time_count,

    SUM(CASE
        WHEN enter_dt::time > TIME '09:00:00'
         AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 15
        THEN 1 ELSE 0
    END) AS late_0_15,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 15
         AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 30
        THEN 1 ELSE 0
    END) AS late_15_30,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 30
         AND EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 <= 60
        THEN 1 ELSE 0
    END) AS late_30_60,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (enter_dt::time - TIME '09:00:00')) / 60 > 60
        THEN 1 ELSE 0
    END) AS late_60_plus,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 >= 9
        THEN 1 ELSE 0
    END) AS full_day_count,

    SUM(CASE
        WHEN EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600 < 9
        THEN 1 ELSE 0
    END) AS short_day_count,

    ROUND(AVG(EXTRACT(EPOCH FROM (exit_dt - enter_dt)) / 3600), 2) AS avg_worktime

FROM workdays_tgt
GROUP BY full_name, TO_CHAR(report_date, 'YYYY-MM')
ORDER BY full_name, TO_CHAR(report_date, 'YYYY-MM');

"""

## Запрос с учётом департаментов

Используем `LEFT JOIN`, чтобы не потерять сотрудников без департамента; для них через `COALESCE` подставляется график по умолчанию (начало в 9:00, окончание в 18:00).

```sql
INSERT INTO aggregated_info_tgt (
    full_name,
    month,
    workdays_count,
    on_time_count,
    late_0_15,
    late_15_30,
    late_30_60,
    late_60_plus,
    left_on_time_count,
    left_early_0_15,
    left_early_15_30,
    left_early_30_60,
    left_early_60_plus,
    full_day_count,
    short_day_count,
    avg_worktime
)
SELECT
    wd.full_name,
    TO_CHAR(wd.report_date, 'YYYY-MM') AS month,
    COUNT(*) AS workdays_count,

    -- Пришёл вовремя
    COUNT(*) FILTER (
        WHERE wd.enter_dt::time <= MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0)
    ) AS on_time_count,

    -- Опоздание 0–15 минут
    COUNT(*) FILTER (
        WHERE wd.enter_dt::time > MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0)
          AND EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 <= 15
    ) AS late_0_15,

    -- Опоздание 15–30 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 > 15
          AND EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 <= 30
    ) AS late_15_30,

    -- Опоздание 30–60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 > 30
          AND EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 <= 60
    ) AS late_30_60,

    -- Опоздание > 60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 > 60
    ) AS late_60_plus,

    -- Ушёл вовремя или позже
    COUNT(*) FILTER (
        WHERE wd.exit_dt::time >= MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0)
    ) AS left_on_time_count,

    -- Ушёл раньше ≤15 минут
    COUNT(*) FILTER (
        WHERE wd.exit_dt::time < MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0)
          AND EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 <= 15
    ) AS left_early_0_15,

    -- Ушёл раньше 15–30 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 > 15
          AND EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 <= 30
    ) AS left_early_15_30,

    -- Ушёл раньше 30–60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 > 30
          AND EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 <= 60
    ) AS left_early_30_60,

    -- Ушёл раньше > 60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 > 60
    ) AS left_early_60_plus,

    -- Полный день
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.exit_dt - wd.enter_dt)) / 3600 >= (COALESCE(d.exit_hour, 18) - COALESCE(d.enter_hour, 9))
    ) AS full_day_count,

    -- Неполный день
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.exit_dt - wd.enter_dt)) / 3600 < (COALESCE(d.exit_hour, 18) - COALESCE(d.enter_hour, 9))
    ) AS short_day_count,

    -- Средняя длительность
    ROUND(AVG(EXTRACT(EPOCH FROM (wd.exit_dt - wd.enter_dt)) / 3600), 2) AS avg_worktime

FROM workdays_tgt wd
LEFT JOIN worker_department_xref xref
    ON wd.full_name = xref.full_name
LEFT JOIN departments_ref d
    ON xref.department = d.name

GROUP BY wd.full_name, TO_CHAR(wd.report_date, 'YYYY-MM')
ORDER BY wd.full_name, TO_CHAR(wd.report_date, 'YYYY-MM');
```

In [5]:
# Department-aware variant of the monthly report. Unlike the two SELECT-only
# queries, this statement writes: it INSERTs the aggregated rows into
# aggregated_info_tgt.
# The expected start/end hour comes from departments_ref (enter_hour,
# exit_hour), reached through worker_department_xref by full_name. Both joins
# are LEFT JOINs, and COALESCE falls back to 9 and 18 when no department row
# matched. Besides the lateness buckets it adds symmetric "left early" buckets
# measured against the end hour, and the full-day threshold becomes
# exit_hour - enter_hour instead of a fixed 9 hours.
# NOTE(review): the arrival/leave buckets compare only the time-of-day part
# (enter_dt::time / exit_dt::time), so the date is ignored; a shift that ends
# after midnight would be compared by its wall-clock time only - confirm this
# is intended.
# NOTE(review): if worker_department_xref can hold several rows for one
# full_name, the join would repeat workdays_tgt rows and inflate the counts -
# verify uniqueness.
query_aggregated_info_department = """
INSERT INTO aggregated_info_tgt (
    full_name,
    month,
    workdays_count,
    on_time_count,
    late_0_15,
    late_15_30,
    late_30_60,
    late_60_plus,
    left_on_time_count,
    left_early_0_15,
    left_early_15_30,
    left_early_30_60,
    left_early_60_plus,
    full_day_count,
    short_day_count,
    avg_worktime
)
SELECT
    wd.full_name,
    TO_CHAR(wd.report_date, 'YYYY-MM') AS month,
    COUNT(*) AS workdays_count,

    -- Пришёл вовремя
    COUNT(*) FILTER (
        WHERE wd.enter_dt::time <= MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0)
    ) AS on_time_count,

    -- Опоздание 0–15 минут
    COUNT(*) FILTER (
        WHERE wd.enter_dt::time > MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0)
          AND EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 <= 15
    ) AS late_0_15,

    -- Опоздание 15–30 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 > 15
          AND EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 <= 30
    ) AS late_15_30,

    -- Опоздание 30–60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 > 30
          AND EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 <= 60
    ) AS late_30_60,

    -- Опоздание > 60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.enter_dt::time - MAKE_TIME(COALESCE(d.enter_hour, 9), 0, 0))) / 60 > 60
    ) AS late_60_plus,

    -- Ушёл вовремя или позже
    COUNT(*) FILTER (
        WHERE wd.exit_dt::time >= MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0)
    ) AS left_on_time_count,

    -- Ушёл раньше ≤15 минут
    COUNT(*) FILTER (
        WHERE wd.exit_dt::time < MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0)
          AND EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 <= 15
    ) AS left_early_0_15,

    -- Ушёл раньше 15–30 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 > 15
          AND EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 <= 30
    ) AS left_early_15_30,

    -- Ушёл раньше 30–60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 > 30
          AND EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 <= 60
    ) AS left_early_30_60,

    -- Ушёл раньше > 60 минут
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (MAKE_TIME(COALESCE(d.exit_hour, 18), 0, 0) - wd.exit_dt::time)) / 60 > 60
    ) AS left_early_60_plus,

    -- Полный день
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.exit_dt - wd.enter_dt)) / 3600 >= (COALESCE(d.exit_hour, 18) - COALESCE(d.enter_hour, 9))
    ) AS full_day_count,

    -- Неполный день
    COUNT(*) FILTER (
        WHERE EXTRACT(EPOCH FROM (wd.exit_dt - wd.enter_dt)) / 3600 < (COALESCE(d.exit_hour, 18) - COALESCE(d.enter_hour, 9))
    ) AS short_day_count,

    -- Средняя длительность
    ROUND(AVG(EXTRACT(EPOCH FROM (wd.exit_dt - wd.enter_dt)) / 3600), 2) AS avg_worktime

FROM workdays_tgt wd
LEFT JOIN worker_department_xref xref
    ON wd.full_name = xref.full_name
LEFT JOIN departments_ref d
    ON xref.department = d.name

GROUP BY wd.full_name, TO_CHAR(wd.report_date, 'YYYY-MM')
ORDER BY wd.full_name, TO_CHAR(wd.report_date, 'YYYY-MM');
"""

## Исполнение запросов

In [6]:
# Run the two read-only reports and the INSERT inside a single transaction.
# engine.begin() opens a connection, commits when the block exits normally and
# rolls back if any statement raises.
with engine.begin() as conn:
    # Rows are fetched while the connection is still open. Iterating a result
    # after its connection has been released only works when the driver has
    # already buffered every row on the client.
    res_postgres = conn.execute(text(query_aggregated_info_postgres))
    rows_postgres = res_postgres.fetchall()

    res_clean = conn.execute(text(query_aggregated_info_clean))
    rows_clean = res_clean.fetchall()

    # NOTE(review): this is a plain INSERT, so every re-run of the cell adds
    # the aggregated rows again (or fails, if the target table has a unique
    # key) - confirm whether aggregated_info_tgt should be cleared first.
    conn.execute(text(query_aggregated_info_department))

# Print both result sets so the FILTER and CASE variants can be compared.
for row in rows_postgres:
    print(*row)

for row in rows_clean:
    print(*row)

Головин Сергей Вальерьевич 2006-04 20 6 3 4 0 7 6 14 8.76
Головин Сергей Вальерьевич 2006-05 23 7 4 6 0 6 15 8 9.42
Головин Сергей Вальерьевич 2006-06 20 8 4 3 0 5 9 11 9.16
Дружинин Георгий Михайлович 2006-04 20 7 4 1 0 8 11 9 8.90
Дружинин Георгий Михайлович 2006-05 23 6 3 6 0 8 16 7 9.27
Дружинин Георгий Михайлович 2006-06 20 7 3 3 1 6 10 10 9.13
Лапухов Алексей Дмитриевич 2006-04 20 6 3 0 0 11 4 16 8.57
Лапухов Алексей Дмитриевич 2006-05 23 8 1 3 0 11 12 11 8.90
Лапухов Алексей Дмитриевич 2006-06 20 8 3 1 0 8 11 9 9.45
Путилов Андрей Маркович 2006-04 20 6 2 1 0 11 11 9 9.04
Путилов Андрей Маркович 2006-05 23 5 2 5 0 11 11 12 9.00
Путилов Андрей Маркович 2006-06 20 7 5 3 0 5 11 9 9.08
Работаева Ирина Генриховна 2006-04 16 10 1 1 0 4 16 0 16.88
Работаева Ирина Генриховна 2006-05 15 8 2 3 0 2 15 0 17.06
Работаева Ирина Генриховна 2006-06 14 12 1 1 0 0 14 0 17.12
Работаев Даниил Игоревич 2006-04 16 13 0 1 0 2 16 0 17.13
Работаев Даниил Игоревич 2006-05 15 8 2 0 0 5 15 0 17.24
Работаев 

### Выгружаем таблицу в .csv-файл

In [7]:
# Read the whole aggregated table back from the database and export it as CSV.
df_agg = pd.read_sql("SELECT * FROM aggregated_info_tgt;", engine)

# On a fresh checkout the output directory may not exist yet, and to_csv does
# not create missing parent directories.
os.makedirs("data/target", exist_ok=True)

# NOTE(review): with default arguments to_csv also writes the DataFrame index
# as an unnamed first column; pass index=False if consumers do not expect it.
df_agg.to_csv("data/target/aggregated_info_tgt.csv")