# Notebook for generating

# Import

In [117]:
import os
import random
import calendar
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
from dataclasses import dataclass
from typing import Literal
from IPython.display import display

# Settings & Utils

## Constants & Helper

In [118]:
RESET_DB = False

UPDATE_TRANSACTION_TABLES_THIS_RUN = True
ADD_RECORDS_THIS_RUN = True
UPDATE_RECORDS_THIS_RUN = True
DELETE_RECORDS_THIS_RUN = True

In [119]:
SEPERATOR = "|"
CHANGE_FIELD = "change_field"
TRANSACTION_DATE_COLUMN = "transaction_date"
START_TRANSATION_DATE = datetime(2000, 1, 31)

UPDATE_MASTERDATA = False
UPDATE_TRANSACTIONAL_DATA = False

TBL_EMPLOYEE = "employee"
TBL_CLIENTS = "clients"
TBL_BUSINESSPARTNER = "businesspartner"
TBL_COSTCENTER = "costcenter"
TBL_DEPARTMENT = "department"
TBL_TASK = "task"
TBL_PROJECT = "project"

TBL_USERS = "users"
TBL_PROJECT = "project"
TBL_PROJECTTIME = "projecttime"
TBL_PAY_TYPE = "pay_type"
TBL_EMPLOYEE_PAY = "employee_pay"

DB_ATOSS = "atoss"
DB_PCT = "pct"
DB_DATEV = "datev"

SCHEMA_DBO = "dbo"
SCHEMA_MDM = "mdm"

In [120]:
def create_wide_df(nrows: int, ncolumns_each: int = 50) -> pd.DataFrame:
    row_rng = range(0, nrows)
    col_rng = range(0, ncolumns_each)
    string_values = [
        [f"string-value ({row_idx + 1}, {col_idx + 1})" for col_idx in col_rng]
        for row_idx in row_rng
    ]
    random_num_values = np.random.randint(0, 10_000_000, size=(nrows, ncolumns_each))

    date_values = [
        [datetime(2005, 5, 5, 5, 5, 5, 500) for _ in col_rng]
        for _ in row_rng
    ]

    column_names = [f"bdcolumn{i}" for i in range(1, (3 * ncolumns_each) + 1)]

    df_string_values = pd.DataFrame(string_values)
    df_num_values = pd.DataFrame(random_num_values)
    df_date_values = pd.DataFrame(date_values)

    df = pd.concat([df_string_values, df_num_values, df_date_values], axis=1)
    df.columns = column_names

    return df

def generate_wide_df(df: pd.DataFrame, ncolumns_each: int = 50) -> pd.DataFrame:
    df_bd = create_wide_df(nrows=df.shape[0], ncolumns_each=ncolumns_each)
    return pd.concat([df, df_bd], axis=1)

In [121]:
@dataclass(frozen=True)
class MdTable:
    tablename: str

    @property
    def tablepath(self) -> str:
        return f"./_md_{self.tablename}.csv"
    
    def read(self, sep: str = SEPERATOR, **kwargs) -> pd.DataFrame:
        return pd.read_csv(self.tablepath, sep=sep, **kwargs)


@dataclass(frozen=True)
class DbTable:
    db: str
    schema: str
    tablename: str

    @property
    def table_total_name(self) -> str:
        return f"{self.db}.{self.schema}.{self.tablename}"
    
    @property
    def tablepath(self) -> str:
        return f"./{self.table_total_name}.csv"
    
    @property
    def is_existing(self) -> bool:
        return os.path.exists(self.tablepath)
    
    def save(
            self,
            df: pd.DataFrame,
            sep: str = SEPERATOR,
            mode: Literal["a", "w"] = "w"
        ) -> None:
        df.to_csv(
            self.tablepath,
            sep=sep,
            index=False,
            encoding="UTF-8",
            mode=mode,
            header=(mode == "w")
        )
        print(f"[{str(datetime.now())[:19]}] TABLE successfully saved at: {self.tablepath}   SHAPE: {df.shape}")
        
    def read(self, sep: str = SEPERATOR, columns: list[str] = None, **kwargs) -> pd.DataFrame:
        return pd.read_csv(self.tablepath, sep=sep, usecols=columns, **kwargs)
    
    def delete(self) -> None:
        if not self.is_existing:
            log = f"{self.table_total_name} not existing."
            print(log)
            return
        
        os.remove(self.tablepath)
        log = f"TABLE {self.table_total_name} successfully removed."
        print(log)

def fn_update_count(row: str) -> str:
    split_str = " uc: "
    if split_str in row:
        value, uc = row.split(split_str)
        uc = int(uc) + 1
        return f"{value} uc: {uc}"
    return f"{row} uc: 1"

In [122]:
def pd_crossjoin(
        df: pd.DataFrame,
        join_table: DbTable,
        id_column_name_jt: str,
        id_new_column_name: str,
        include_columns: list[str] = None
        ) -> pd.DataFrame:
    if include_columns is None:
        include_columns = []
    include_columns = [id_column_name_jt] + include_columns
    # df_join = join_table.read()[[id_column_name_jt]] \
    #     .rename(columns={id_column_name_jt: id_new_column_name})
    df_join = join_table.read()[include_columns] \
        .rename(columns={id_column_name_jt: id_new_column_name})

    return pd.merge(df, df_join, how="cross").drop_duplicates()

In [123]:
def pd_add_rows(
        *,
        df: pd.DataFrame,
        df_md: pd.DataFrame,
        nrows: int,
        id_columns: list[str] = None,
        crossjoin_tables: DbTable = None
        ) -> pd.DataFrame:
    if id_columns is None:
        id_columns = ["id"]
    
    df_current_ids = df[id_columns]

    added_filter = (df_md[id_columns[0]].isin(df_current_ids[id_columns[0]]))
    for i in range(1, len(id_columns)):
        added_filter &= (df[id_columns[i]].isin(df_current_ids[id_columns[i]]))

    df_new_records = df_md[~added_filter].sample(nrows)
    if crossjoin_tables is not None:
        for i in range(1, len(id_columns)):
            df_new_records = pd_crossjoin(
                df=df_new_records,
                join_table=crossjoin_tables[i - 1],
                id_column_name_jt=id_columns[0],
                id_new_column_name=id_columns[i]
            )

    df_new_records = df_new_records.sample(nrows)

    print(f"New Records (nrows={nrows}):")
    display(df_new_records)

    return pd.concat([df, df_new_records], axis=0).reset_index(drop=True)


def pd_update_rows(
        *,
        df: pd.DataFrame,
        nrows: int,
        id_columns: list = None,
        cf_column: str = CHANGE_FIELD
        ) -> pd.DataFrame:
    if id_columns is None:
        id_columns = ["id"]

    df_sample_ids = df.sample(nrows)[id_columns]

    update_filter = (df[id_columns[0]].isin(df_sample_ids[id_columns[0]]))
    for i in range(1, len(id_columns)):
        update_filter &= (df[id_columns[i]].isin(df_sample_ids[id_columns[i]]))

    df.loc[update_filter, CHANGE_FIELD] = df.loc[update_filter, cf_column] \
                                            .apply(fn_update_count)
    print(f"Updated Records (nrows={nrows}):")
    display(df[update_filter])

    return df.reset_index(drop=True)


def pd_delete_rows(*, df: pd.DataFrame, nrows: int) -> pd.DataFrame:
    # TODO: implement multiple column id
    delete_ids = list(df.sample(nrows)["id"])
    delete_filter = df["id"].isin(delete_ids)

    print(f"Removed Records (nrows={nrows}):")
    display(df[delete_filter])

    return df[~delete_filter].reset_index(drop=True)

In [124]:
datetime.now().date().day

20

In [125]:
def end_of_month(dt: datetime, offset_months: int = 0):
    dt = datetime(dt.year, dt.month, 1) + timedelta(days=(offset_months * 31))
    _, last_day = calendar.monthrange(dt.year, dt.month)
    return datetime(dt.year, dt.month, last_day)

def get_next_transaction_datetime(
        table: DbTable,
        sep: str = SEPERATOR,
        date_column: str = TRANSACTION_DATE_COLUMN,
        default_date: datetime = START_TRANSATION_DATE
    ) -> datetime:
    try:
        date_str = pd.read_csv(
            table.tablepath,
            sep=sep,
            usecols=[date_column]
        ).max().values[0]
        next_transaction_dt = datetime.strptime(date_str, '%Y-%m-%d') + timedelta()
        return end_of_month(next_transaction_dt, offset_days=1)
    except:
        return default_date

## Table Setup

### MasterData Tables

In [126]:
tbl_md_employee = MdTable(TBL_EMPLOYEE)
tbl_md_clients = MdTable(TBL_CLIENTS)
tbl_md_businesspartner = MdTable(TBL_BUSINESSPARTNER)
tbl_md_costcenter = MdTable(TBL_COSTCENTER)
tbl_md_department = MdTable(TBL_DEPARTMENT)
tbl_md_project = MdTable(TBL_PROJECT)
tbl_md_task = MdTable(TBL_TASK)

display(
    tbl_md_employee,
    tbl_md_clients,
    tbl_md_businesspartner,
    tbl_md_costcenter,
    tbl_md_department,
    tbl_md_project,
    tbl_md_task
)

MdTable(tablename='employee')

MdTable(tablename='clients')

MdTable(tablename='businesspartner')

MdTable(tablename='costcenter')

MdTable(tablename='department')

MdTable(tablename='project')

MdTable(tablename='task')

In [127]:
df_md_employee = tbl_md_employee.read(parse_dates=["leave_date"])
df_md_clients = tbl_md_clients.read()
df_md_businesspartner = tbl_md_businesspartner.read()
df_md_costcenter = tbl_md_costcenter.read()
df_md_department = tbl_md_department.read()
df_md_project = tbl_md_project.read()
df_md_task = tbl_md_task.read()

### DB Tables

In [128]:
tbl_db_pct_users = DbTable(DB_PCT, SCHEMA_DBO, TBL_USERS)
tbl_db_pct_project = DbTable(DB_PCT, SCHEMA_DBO, TBL_PROJECT)
tbl_db_pct_department = DbTable(DB_PCT, SCHEMA_DBO, TBL_DEPARTMENT)
tbl_db_pct_task = DbTable(DB_PCT, SCHEMA_DBO, TBL_TASK)
tbl_db_pct_businesspartner = DbTable(DB_PCT, SCHEMA_DBO, TBL_BUSINESSPARTNER)
tbl_db_pct_projecttime = DbTable(DB_PCT, SCHEMA_DBO, TBL_PROJECTTIME)
tbl_db_pct_employee_mdm = DbTable(DB_PCT, SCHEMA_MDM, TBL_EMPLOYEE)

tbl_db_datev_employee = DbTable(DB_DATEV, SCHEMA_DBO, TBL_EMPLOYEE)
tbl_db_datev_clients = DbTable(DB_DATEV, SCHEMA_DBO, TBL_CLIENTS)
tbl_db_datev_department = DbTable(DB_DATEV, SCHEMA_DBO, TBL_DEPARTMENT)
tbl_db_datev_costcenter = DbTable(DB_DATEV, SCHEMA_DBO, TBL_COSTCENTER)
tbl_db_datev_paytype = DbTable(DB_DATEV, SCHEMA_DBO, TBL_PAY_TYPE)
tbl_db_datev_employee_pay = DbTable(DB_DATEV, SCHEMA_DBO, TBL_EMPLOYEE_PAY)

ALL_TABLES = [
    tbl_db_pct_users,
    tbl_db_pct_project,
    tbl_db_pct_department,
    tbl_db_pct_task,
    tbl_db_pct_businesspartner,
    tbl_db_pct_projecttime,
    tbl_db_pct_employee_mdm,
    tbl_db_datev_employee,
    tbl_db_datev_clients,
    tbl_db_datev_department,
    tbl_db_datev_costcenter,
    tbl_db_datev_paytype,
    tbl_db_datev_employee_pay
]

display(
    tbl_db_pct_users,
    tbl_db_pct_project,
    tbl_db_pct_department,
    tbl_db_pct_task,
    tbl_db_pct_businesspartner,
    tbl_db_pct_projecttime,
    tbl_db_pct_employee_mdm,
    tbl_db_datev_employee,
    tbl_db_datev_clients,
    tbl_db_datev_department,
    tbl_db_datev_costcenter,
    tbl_db_datev_paytype,
    tbl_db_datev_employee_pay
)

DbTable(db='pct', schema='dbo', tablename='users')

DbTable(db='pct', schema='dbo', tablename='project')

DbTable(db='pct', schema='dbo', tablename='department')

DbTable(db='pct', schema='dbo', tablename='task')

DbTable(db='pct', schema='dbo', tablename='businesspartner')

DbTable(db='pct', schema='dbo', tablename='projecttime')

DbTable(db='pct', schema='mdm', tablename='employee')

DbTable(db='datev', schema='dbo', tablename='employee')

DbTable(db='datev', schema='dbo', tablename='clients')

DbTable(db='datev', schema='dbo', tablename='department')

DbTable(db='datev', schema='dbo', tablename='costcenter')

DbTable(db='datev', schema='dbo', tablename='pay_type')

DbTable(db='datev', schema='dbo', tablename='employee_pay')

# Generate or Update databasefiles

## Reset DB if needed

In [129]:
if RESET_DB:
    table: DbTable
    for table in ALL_TABLES:
        table.delete()
else:
    print("No Reset!")

No Reset!


## ERP-System - datev

### dbo.clients

#### Create if not existing

In [130]:
INIT_CLIENTS_COUNT = 10

if not tbl_db_datev_clients.is_existing:
    df_init = df_md_clients.sample(INIT_CLIENTS_COUNT)
    tbl_db_datev_clients.save(df_init)

df_db_datev_clients = tbl_db_datev_clients.read()
df_db_datev_clients.head()

Unnamed: 0,id,name,address,change_field
0,92,Xcel Ventures GmbH,"Musterstraße 92, 10198 Berlin",cf uc: 2
1,83,Universal Solutions AG,"Musterstraße 83, 10189 Berlin",cf
2,50,Mu Enterprises AG,"Musterstraße 50, 10156 Berlin",cf uc: 1
3,63,Pinnacle Consulting SE,"Musterstraße 63, 10169 Berlin",cf uc: 1
4,69,Radiant Innovations SE,"Musterstraße 69, 10175 Berlin",cf uc: 1


#### Update Records

In [131]:
update_record_count = random.choice(range(0, 4))

df_db_datev_clients = pd_update_rows(
    df=df_db_datev_clients,
    nrows=update_record_count
)

tbl_db_datev_clients.save(df_db_datev_clients)

Updated Records (nrows=2):


Unnamed: 0,id,name,address,change_field
3,63,Pinnacle Consulting SE,"Musterstraße 63, 10169 Berlin",cf uc: 2
14,73,Sigma Innovations Ltd.,"Musterstraße 73, 10179 Berlin",cf uc: 1


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.clients.csv   SHAPE: (15, 4)


#### Add Records

In [132]:
add_record_count = random.choice(range(0, 2)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_clients = pd_add_rows(
    df=df_db_datev_clients,
    df_md=df_md_clients,
    nrows=add_record_count
)

tbl_db_datev_clients.save(df_db_datev_clients)

New Records (nrows=1):


Unnamed: 0,id,name,address,change_field
85,86,Vista Technologies SE,"Musterstraße 86, 10192 Berlin",cf


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.clients.csv   SHAPE: (16, 4)


### dbo.department

#### Create if not existing

In [133]:
INIT_DEPARTMENT_COUNT = 5

if not tbl_db_datev_department.is_existing:
    df_init = df_md_department.sample(INIT_DEPARTMENT_COUNT)

    df_init = pd_crossjoin(
        df=df_init,
        join_table=tbl_db_datev_clients,
        id_column_name_jt="id",
        id_new_column_name="client_id"
    )

    tbl_db_datev_department.save(df_init)

df_db_datev_department = tbl_db_datev_department.read()
df_db_datev_department.sample(5).head()

Unnamed: 0,id,name,change_field,client_id
9,2,Brand Management,cf,67
38,17,Environmental Affairs,cf uc: 1,27
36,17,Environmental Affairs,cf,87
28,1,Administration,cf,27
7,2,Brand Management,cf,4


#### Update Records

In [134]:
update_record_count = random.choice(range(0, 4))

df_db_datev_department = pd_update_rows(
    df=df_db_datev_department,
    nrows=update_record_count,
    id_columns=["id", "client_id"]
)

tbl_db_datev_department.save(df_db_datev_department)

Updated Records (nrows=2):


Unnamed: 0,id,name,change_field,client_id
7,2,Brand Management,cf uc: 1,4
8,2,Brand Management,cf uc: 2,27
27,1,Administration,cf uc: 1,4
28,1,Administration,cf uc: 1,27


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.department.csv   SHAPE: (54, 4)


#### Add Records

In [135]:
add_record_count = random.choice(range(0, 2)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_department = pd_add_rows(
    df=df_db_datev_department,
    df_md=df_md_department,
    nrows=add_record_count,
    id_columns=["id", "client_id"],
    crossjoin_tables=[tbl_db_datev_clients]
)

tbl_db_datev_department.save(df_db_datev_department)

New Records (nrows=0):


Unnamed: 0,id,name,change_field,client_id


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.department.csv   SHAPE: (54, 4)


### dbo.costcenter

#### Create if not existing

In [136]:
INIT_COSTCENTER_COUNT = 20

if not tbl_db_datev_costcenter.is_existing:
    df_init = df_md_costcenter.sample(INIT_COSTCENTER_COUNT)

    df_init = pd_crossjoin(
        df=df_init,
        join_table=tbl_db_datev_department,
        id_column_name_jt="id",
        id_new_column_name="department_id"
    )

    df_init = pd_crossjoin(
        df=df_init,
        join_table=tbl_db_datev_clients,
        id_column_name_jt="id",
        id_new_column_name="client_id"
    )

    tbl_db_datev_costcenter.save(df_init)


df_db_datev_costcenter = tbl_db_datev_costcenter.read()
df_db_datev_costcenter.sample(5).head()

Unnamed: 0,id,buKr,costcenter_short,name,change_field,department_id,client_id
756,704,1739,75450,KSTNAME704,cf,2,87
784,704,1739,75450,KSTNAME704,cf,17,69
309,1212,1743,33650,KSTNAME1212,cf,2,67
886,1949,1748,37500,KSTNAME1949,cf,17,87
59,144,1735,88650,KSTNAME144,cf,2,67


In [137]:
unique_together_count = \
    len(df_db_datev_costcenter["id"].unique()) * \
    len(df_db_datev_costcenter["client_id"].unique()) * \
    len(df_db_datev_costcenter["department_id"].unique())

unique_together_count, df_db_datev_costcenter.shape[0], df_db_datev_costcenter.drop_duplicates().shape[0]

(2352, 1008, 1008)

#### Update Records

In [138]:
update_record_count = random.choice(range(0, 7))

df_db_datev_costcenter = pd_update_rows(
    df=df_db_datev_costcenter,
    nrows=update_record_count,
    id_columns=["id", "client_id", "department_id"]
)

tbl_db_datev_costcenter.save(df_db_datev_costcenter)

Updated Records (nrows=0):


Unnamed: 0,id,buKr,costcenter_short,name,change_field,department_id,client_id


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.costcenter.csv   SHAPE: (1008, 7)


#### Add Records

In [139]:
add_record_count = random.choice(range(0, 3)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_costcenter = pd_add_rows(
    df=df_db_datev_costcenter,
    df_md=df_md_costcenter,
    nrows=add_record_count,
    id_columns=["id", "client_id", "department_id"],
    crossjoin_tables=[tbl_db_datev_clients, tbl_db_datev_department]
)

tbl_db_datev_costcenter.save(df_db_datev_costcenter)

New Records (nrows=0):


Unnamed: 0,id,buKr,costcenter_short,name,change_field,client_id,department_id


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.costcenter.csv   SHAPE: (1008, 7)


### dbo.employee

#### Create if not existing

In [140]:
INIT_EMPLOYEE_COUNT = 5000
INIT_EMPLOYEE_TOTAL_COUNT = INIT_EMPLOYEE_COUNT * 6

if not tbl_db_datev_employee.is_existing:
    df_init = df_md_employee.sample(INIT_EMPLOYEE_COUNT)

    df_init = pd_crossjoin(
        df=df_init,
        join_table=tbl_db_datev_clients,
        id_column_name_jt="id",
        id_new_column_name="client_id"
    )

    df_init = pd_crossjoin(
        df=df_init,
        join_table=tbl_db_datev_department,
        id_column_name_jt="id",
        id_new_column_name="costcenter_id"
    )

    df_init = df_init.sample(INIT_EMPLOYEE_TOTAL_COUNT)

    tbl_db_datev_employee.save(df_init)

df_db_datev_employee = tbl_db_datev_employee.read()
df_db_datev_employee.head()

Unnamed: 0,id,firstname,lastname,birthdate,entry_date,salary,leave_date,change_field,client_id,costcenter_id
0,3800,Kamila,Szymanska,1989-09-30,2023-07-29,5298.23,,cf,50,17
1,2091,Christopher,Robinson,2003-04-09,2004-11-23,4795.34,,cf,83,2
2,3410,Avery,Smith,1964-10-09,2015-08-24,4571.15,,cf,4,15
3,7346,Ángel,Pérez,2001-09-07,2021-12-05,4541.82,,cf,67,15
4,2019,Grace,Robinson,1954-05-07,2002-10-20,6097.44,,cf,92,15


#### Update Records

In [141]:
update_record_count = random.choice(range(0, 7))

df_db_datev_employee = pd_update_rows(
    df=df_db_datev_employee,
    nrows=update_record_count
)

tbl_db_datev_employee.save(df_db_datev_employee)

Updated Records (nrows=6):


Unnamed: 0,id,firstname,lastname,birthdate,entry_date,salary,leave_date,change_field,client_id,costcenter_id
130,7329,Zofia,Nowak,1954-11-07,2001-11-02,4757.99,,cf uc: 1,92,10
2724,7778,Mohammed,Ahmed,1960-03-18,2001-11-28,5177.48,,cf uc: 1,67,2
3263,1057,Mia,Schuster,1958-11-09,1998-12-09,6420.99,,cf uc: 1,87,17
3304,2357,Yuki,Takahashi,1985-02-21,2024-09-04,5195.86,,cf uc: 1,92,17
3530,7329,Zofia,Nowak,1954-11-07,2001-11-02,4757.99,,cf uc: 1,83,17
3631,2357,Yuki,Takahashi,1985-02-21,2024-09-04,5195.86,,cf uc: 1,4,15
4180,1057,Mia,Schuster,1958-11-09,1998-12-09,6420.99,,cf uc: 1,92,17
4493,7778,Mohammed,Ahmed,1960-03-18,2001-11-28,5177.48,,cf uc: 1,63,17
4547,7778,Mohammed,Ahmed,1960-03-18,2001-11-28,5177.48,,cf uc: 1,87,1
4941,7329,Zofia,Nowak,1954-11-07,2001-11-02,4757.99,,cf uc: 1,67,10


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.employee.csv   SHAPE: (30042, 10)


#### Add Records

In [142]:
add_record_count = random.choice(range(2, 20)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_employee = pd_add_rows(
    df=df_db_datev_employee,
    df_md=df_md_employee,
    nrows=add_record_count,
    crossjoin_tables=[tbl_db_datev_clients, tbl_db_datev_department],
    id_columns=["id", "client_id", "costcenter_id"]
)

tbl_db_datev_employee.save(df_db_datev_employee)

New Records (nrows=2):


Unnamed: 0,id,firstname,lastname,birthdate,entry_date,salary,leave_date,change_field,client_id,costcenter_id
1295,5982,Lara,Müller,1979-05-15,2014-05-11,4767.35,NaT,cf,4,14
1650,5982,Lara,Müller,1979-05-15,2014-05-11,4767.35,NaT,cf,73,17


[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.employee.csv   SHAPE: (30044, 10)


### dbo.paytype [Frozen Master Data Table]

#### Create

In [143]:
paytype_list = [
    "Basisgehalt",
    "AG-Anteil",
    "Lohnsteuer",
    "Soli-Zuschlag",
    "Krankenk.",
    "Pflegevers.",
    "Arbeitslosenvers.",
    "Rentenvers."
]

paytype_factor = [1.] + [(len(paytype_list) - i) * 0.025 for i in range(1, len(paytype_list))]

df_paytype = pd.DataFrame({
    "id": [i for i in range(1, len(paytype_list) + 1)],
    "name": paytype_list,
    "salary_factor": paytype_factor
})

tbl_db_datev_paytype.save(df_paytype)
tbl_db_datev_paytype.read()

[2023-12-20 23:39:16] TABLE successfully saved at: ./datev.dbo.pay_type.csv   SHAPE: (8, 3)


Unnamed: 0,id,name,salary_factor
0,1,Basisgehalt,1.0
1,2,AG-Anteil,0.175
2,3,Lohnsteuer,0.15
3,4,Soli-Zuschlag,0.125
4,5,Krankenk.,0.1
5,6,Pflegevers.,0.075
6,7,Arbeitslosenvers.,0.05
7,8,Rentenvers.,0.025


### dbo.employee_pay [Transaction Table]

#### Create Big Tansaction Table

In [144]:

def run_employee_pay_transations(loops: int = 1):
    transaction_date = get_next_transaction_datetime(tbl_db_datev_employee_pay)
    df_costcenter = tbl_db_datev_costcenter.read()
    df_costcenter = df_costcenter[["id", "client_id","department_id"]] \
                        .rename(columns={"id": "costcenter_id"})

    df_employee = tbl_db_datev_employee.read()

    df_employee_pay = pd.merge(
        df_employee,
        df_costcenter,
        on=["client_id", "costcenter_id"],
        how="left"
    )

    df_employee_pay = pd_crossjoin(
        df=df_employee_pay,
        join_table=tbl_db_datev_paytype,
        id_column_name_jt="id",
        id_new_column_name="paytype_id",
        include_columns=["salary_factor"]
    )


    df_employee_pay["amount"] = np.round(
        df_employee_pay["salary"] * df_employee_pay["salary_factor"],
        2
    )
    df_employee_pay[TRANSACTION_DATE_COLUMN] = transaction_date
    df_employee_pay = df_employee_pay.rename(columns={"id": "employee_id"})

    df_employee_pay = df_employee_pay[[
        "transaction_date",
        "employee_id",
        "client_id",
        "costcenter_id",
        "paytype_id",
        "amount"
    ]]

    df_employee_pay = generate_wide_df(df_employee_pay)

    for _ in range(0, loops):
        save_mode = "a" if tbl_db_datev_employee_pay.is_existing else "w"
        tbl_db_datev_employee_pay.save(df_employee_pay, mode=save_mode)
        transaction_date = end_of_month(transaction_date, offset_months=1)
        df_employee_pay[TRANSACTION_DATE_COLUMN] = transaction_date

if UPDATE_TRANSACTION_TABLES_THIS_RUN:
    run_employee_pay_transations(loops=1)

[2023-12-20 23:39:50] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:40:07] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:40:24] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:40:40] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:40:57] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:41:13] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:41:30] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:41:47] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:42:03] TABLE successfully saved at: ./datev.dbo.employee_pay.csv   SHAPE: (240352, 156)
[2023-12-20 23:42:20] TABLE successfully saved at: ./datev.dbo.employee_p

In [145]:
df = tbl_db_datev_employee_pay.read(columns=[TRANSACTION_DATE_COLUMN])
print(df.shape)
list(df[TRANSACTION_DATE_COLUMN].unique())

(15142128, 1)


['2000-01-31',
 '2000-02-29',
 '2000-03-31',
 '2000-04-30',
 '2000-05-31',
 '2000-06-30',
 '2000-07-31',
 '2000-08-31',
 '2000-09-30',
 '2000-10-31',
 '2000-11-30',
 '2000-12-31',
 '2001-01-31',
 '2001-02-28',
 '2001-03-31',
 '2001-04-30',
 '2001-05-31',
 '2001-06-30',
 '2001-07-31',
 '2001-08-31',
 '2001-09-30',
 '2001-10-31',
 '2001-11-30',
 '2001-12-31',
 '2002-01-31',
 '2002-02-28',
 '2002-03-31',
 '2002-04-30',
 '2002-05-31',
 '2002-06-30',
 '2002-07-31',
 '2002-08-31',
 '2002-09-30',
 '2002-10-31',
 '2002-11-30',
 '2002-12-31',
 '2003-01-31',
 '2003-02-28',
 '2003-03-31',
 '2003-04-30',
 '2003-05-31',
 '2003-06-30',
 '2003-07-31',
 '2003-08-31',
 '2003-09-30',
 '2003-10-31',
 '2003-11-30',
 '2003-12-31',
 '2004-01-31',
 '2004-02-29',
 '2004-03-31',
 '2004-04-30',
 '2004-05-31',
 '2004-06-30',
 '2004-07-31',
 '2004-08-31',
 '2004-09-30',
 '2004-10-31',
 '2004-11-30',
 '2004-12-31']