# Notebook for generating

# Import

In [108]:
import os
import random
import pandas as pd

from datetime import datetime
from dataclasses import dataclass

# Settings & Utils

## Constants & Helper

In [109]:
RESET_DB = False

ADD_RECORDS_THIS_RUN = True
UPDATE_RECORDS_THIS_RUN = True
DELETE_RECORDS_THIS_RUN = True

In [110]:
# DATE = get minumum date from data and use +1 Month  ||  default datetime(2015, 1, 1)
SEPERATOR = "|"
CHANGE_FIELD = "change_field"

UPDATE_MASTERDATA = False
UPDATE_TRANSACTIONAL_DATA = False

TBL_EMPLOYEE = "employee"
TBL_CLIENTS = "clients"
TBL_BUSINESSPARTNER = "businesspartner"
TBL_COSTCENTER = "costcenter"
TBL_DEPARTMENT = "department"
TBL_TASK = "task"
TBL_PROJECT = "project"

TBL_USERS = "users"
TBL_PROJECT = "project"
TBL_PROJECTTIME = "projecttime"
TBL_PAY_TYPE = "pay_type"
TBL_EMPLOYEE_PAY = "employee_pay"

DB_ATOSS = "atoss"
DB_PCT = "pct"
DB_DATEV = "datev"

SCHEMA_DBO = "dbo"
SCHEMA_MDM = "mdm"

In [111]:
@dataclass(frozen=True)
class MdTable:
    tablename: str

    @property
    def tablepath(self) -> str:
        return f"./_md_{self.tablename}.csv"
    
    def read(self, sep: str = SEPERATOR, **kwargs) -> pd.DataFrame:
        return pd.read_csv(self.tablepath, sep=sep, **kwargs)


@dataclass(frozen=True)
class DbTable:
    db: str
    schema: str
    tablename: str

    @property
    def table_total_name(self) -> str:
        return f"{self.db}.{self.schema}.{self.tablename}"
    
    @property
    def tablepath(self) -> str:
        return f"./{self.table_total_name}.csv"
    
    @property
    def is_existing(self) -> bool:
        return os.path.exists(self.tablepath)
    
    def save(self, df: pd.DataFrame, sep: str = SEPERATOR) -> None:
        df.to_csv(self.tablepath, sep=sep, index=False, encoding="UTF-8")
        print(f"[{str(datetime.now())[:19]}] TABLE successfully saved at: {self.tablepath} (SHAPE: {df.shape})")
        
    def read(self, sep: str = SEPERATOR, **kwargs) -> pd.DataFrame:
        return pd.read_csv(self.tablepath, sep=sep, **kwargs)
    
    def delete(self) -> None:
        if not self.is_existing:
            log = f"{self.table_total_name} not existing."
            print(log)
            return
        
        os.remove(self.tablepath)
        log = f"TABLE {self.table_total_name} successfully removed."
        print(log)

def fn_update_count(row: str) -> str:
    split_str = " uc: "
    if split_str in row:
        value, uc = row.split(split_str)
        uc = int(uc) + 1
        return f"{value} uc: {uc}"
    return f"{row} uc: 1"

In [112]:
def pd_add_rows(*, df: pd.DataFrame, df_md: pd.DataFrame, nrows: int) -> pd.DataFrame:
    df_new_records = df_md[~df_md["id"].isin(df["id"])].sample(nrows)

    print(f"New Records (nrows={nrows}):")
    display(df_new_records)

    return pd.concat([df, df_new_records], axis=0).reset_index(drop=True)


def pd_update_rows(*, df: pd.DataFrame, nrows: int, column: str = CHANGE_FIELD) -> pd.DataFrame:
    update_ids = list(df.sample(nrows)["id"])
    update_filter = df["id"].isin(update_ids)

    df.loc[update_filter, CHANGE_FIELD] = df.loc[update_filter, column] \
                                            .apply(fn_update_count)
    print(f"Updated Records (nrows={nrows}):")
    display(df[update_filter])

    return df


def pd_delete_rows(*, df: pd.DataFrame, nrows: int) -> pd.DataFrame:
    delete_ids = list(df.sample(nrows)["id"])
    delete_filter = df["id"].isin(delete_ids)

    print(f"Removed Records (nrows={nrows}):")
    display(df[delete_filter])

    return df[~delete_filter].reset_index(drop=True)

## Table Setup

### MasterData Tables

In [113]:
tbl_md_employee = MdTable(TBL_EMPLOYEE)
tbl_md_clients = MdTable(TBL_CLIENTS)
tbl_md_businesspartner = MdTable(TBL_BUSINESSPARTNER)
tbl_md_costcenter = MdTable(TBL_COSTCENTER)
tbl_md_department = MdTable(TBL_DEPARTMENT)
tbl_md_project = MdTable(TBL_PROJECT)
tbl_md_task = MdTable(TBL_TASK)

display(
    tbl_md_employee,
    tbl_md_clients,
    tbl_md_businesspartner,
    tbl_md_costcenter,
    tbl_md_department,
    tbl_md_project,
    tbl_md_task
)

MdTable(tablename='employee')

MdTable(tablename='clients')

MdTable(tablename='businesspartner')

MdTable(tablename='costcenter')

MdTable(tablename='department')

MdTable(tablename='project')

MdTable(tablename='task')

In [114]:
df_md_employee = tbl_md_employee.read(parse_dates=["leave_date"])
df_md_clients = tbl_md_clients.read()
df_md_businesspartner = tbl_md_businesspartner.read()
df_md_costcenter = tbl_md_costcenter.read()
df_md_department = tbl_md_department.read()
df_md_project = tbl_md_project.read()
df_md_task = tbl_md_task.read()

### DB Tables

In [115]:
tbl_db_pct_users = DbTable(DB_PCT, SCHEMA_DBO, TBL_USERS)
tbl_db_pct_project = DbTable(DB_PCT, SCHEMA_DBO, TBL_PROJECT)
tbl_db_pct_department = DbTable(DB_PCT, SCHEMA_DBO, TBL_DEPARTMENT)
tbl_db_pct_task = DbTable(DB_PCT, SCHEMA_DBO, TBL_TASK)
tbl_db_pct_businesspartner = DbTable(DB_PCT, SCHEMA_DBO, TBL_BUSINESSPARTNER)
tbl_db_pct_projecttime = DbTable(DB_PCT, SCHEMA_DBO, TBL_PROJECTTIME)
tbl_db_pct_employee_mdm = DbTable(DB_PCT, SCHEMA_MDM, TBL_EMPLOYEE)

tbl_db_datev_employee = DbTable(DB_DATEV, SCHEMA_DBO, TBL_EMPLOYEE)
tbl_db_datev_clients = DbTable(DB_DATEV, SCHEMA_DBO, TBL_CLIENTS)
tbl_db_datev_department = DbTable(DB_DATEV, SCHEMA_DBO, TBL_DEPARTMENT)
tbl_db_datev_costcenter = DbTable(DB_DATEV, SCHEMA_DBO, TBL_COSTCENTER)
tbl_db_datev_pay_type = DbTable(DB_DATEV, SCHEMA_DBO, TBL_PAY_TYPE)
tbl_db_datev_employee_pay = DbTable(DB_DATEV, SCHEMA_DBO, TBL_EMPLOYEE_PAY)

ALL_TABLES = [
    tbl_db_pct_users,
    tbl_db_pct_project,
    tbl_db_pct_department,
    tbl_db_pct_task,
    tbl_db_pct_businesspartner,
    tbl_db_pct_projecttime,
    tbl_db_pct_employee_mdm,
    tbl_db_datev_employee,
    tbl_db_datev_clients,
    tbl_db_datev_department,
    tbl_db_datev_costcenter,
    tbl_db_datev_pay_type,
    tbl_db_datev_employee_pay
]

display(
    tbl_db_pct_users,
    tbl_db_pct_project,
    tbl_db_pct_department,
    tbl_db_pct_task,
    tbl_db_pct_businesspartner,
    tbl_db_pct_projecttime,
    tbl_db_pct_employee_mdm,
    tbl_db_datev_employee,
    tbl_db_datev_clients,
    tbl_db_datev_department,
    tbl_db_datev_costcenter,
    tbl_db_datev_pay_type,
    tbl_db_datev_employee_pay
)

DbTable(db='pct', schema='dbo', tablename='users')

DbTable(db='pct', schema='dbo', tablename='project')

DbTable(db='pct', schema='dbo', tablename='department')

DbTable(db='pct', schema='dbo', tablename='task')

DbTable(db='pct', schema='dbo', tablename='businesspartner')

DbTable(db='pct', schema='dbo', tablename='projecttime')

DbTable(db='pct', schema='mdm', tablename='employee')

DbTable(db='datev', schema='dbo', tablename='employee')

DbTable(db='datev', schema='dbo', tablename='clients')

DbTable(db='datev', schema='dbo', tablename='department')

DbTable(db='datev', schema='dbo', tablename='costcenter')

DbTable(db='datev', schema='dbo', tablename='pay_type')

DbTable(db='datev', schema='dbo', tablename='employee_pay')

# Generate or Update databasefiles

## Reset DB if needed

In [116]:
if RESET_DB:
    table: DbTable
    for table in ALL_TABLES:
        table.delete()
else:
    print("No Reset!")

No Reset!


## ERP-System - datev

### dbo.clients

#### Create if not existing

In [117]:
INIT_CLIENTS_COUNT = 10

df_init = df_md_clients.sample(INIT_CLIENTS_COUNT)
if not tbl_db_datev_clients.is_existing:
    tbl_db_datev_clients.save(df_init)


df_db_datev_clients = tbl_db_datev_clients.read()
df_db_datev_clients.head()

Unnamed: 0,id,name,address,change_field
0,70,Rapid Systems GmbH,"Musterstraße 70, 10176 Berlin",cf
1,85,Utopia Consulting Ltd.,"Musterstraße 85, 10191 Berlin",cf
2,75,Stellar Corporation SE,"Musterstraße 75, 10181 Berlin",cf
3,45,Luminous Enterprises AG,"Musterstraße 45, 10151 Berlin",cf
4,61,Phi Solutions Ltd.,"Musterstraße 61, 10167 Berlin",cf


#### Update Records

In [119]:
update_record_count = random.choice(range(0, 4))

df_db_datev_clients = pd_update_rows(
    df=df_db_datev_clients,
    nrows=update_record_count
)

tbl_db_datev_clients.save(df_db_datev_clients)

Updated Records (nrows=0):


Unnamed: 0,id,name,address,change_field


[2023-12-20 11:05:30.] TABLE successfully saved at: ./datev.dbo.clients.csv (SHAPE: (11, 4))


#### Add Records

In [118]:
add_record_count = random.choice(range(0, 2)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_clients = pd_add_rows(
    df=df_db_datev_clients,
    df_md=df_md_clients,
    nrows=add_record_count
)

tbl_db_datev_clients.save(df_db_datev_clients)

New Records (nrows=0):


Unnamed: 0,id,name,address,change_field


[2023-12-20 11:05:30.] TABLE successfully saved at: ./datev.dbo.clients.csv (SHAPE: (11, 4))


### dbo.employee

#### Create if not existing

In [120]:
INIT_EMPLOYEE_COUNT = 5000

df_init = df_md_employee.sample(INIT_EMPLOYEE_COUNT)
if not tbl_db_datev_employee.is_existing:
    tbl_db_datev_employee.save(df_init)


df_db_datev_employee = tbl_db_datev_employee.read()
df_db_datev_employee.head()

Unnamed: 0,id,firstname,lastname,birthdate,entry_date,salary,leave_date,change_field
0,7805,Obi-Wan,Kenobi,1959-03-12,2004-03-31,5097.77,,cf
1,7773,Natalie,Wilson,1963-06-30,2007-08-12,6343.06,,cf
2,7137,Sara,Fernández,1970-01-06,2025-02-18,4794.82,,cf
3,2472,Miku,Watanabe,1961-03-25,2005-09-26,5927.05,,cf
4,3015,Bernard,Lowe,1956-01-20,2018-12-14,5109.09,,cf


#### Update Records

In [122]:
update_record_count = random.choice(range(0, 7))

df_db_datev_employee = pd_update_rows(
    df=df_db_datev_employee,
    nrows=update_record_count
)

tbl_db_datev_employee.save(df_db_datev_employee)

Updated Records (nrows=171):


Unnamed: 0,id,firstname,lastname,birthdate,entry_date,salary,leave_date,change_field
6,6769,Karolina,Nowak,1978-10-26,2004-01-08,6277.68,,cf uc: 1
18,2085,Sofía,Moya,1992-04-16,2022-09-29,5468.50,,cf uc: 1
26,3107,Adrián,Romero,1956-03-16,2017-06-16,6435.66,,cf uc: 1
48,3295,Camille,Lefevre,1989-01-29,2005-05-06,4927.65,,cf uc: 1
70,6131,Fleur,de,1965-03-23,2002-02-01,5636.24,,cf uc: 1
...,...,...,...,...,...,...,...,...
4771,3031,Elena,Ferrari,1979-05-17,2025-05-25,4854.06,,cf uc: 1
4854,3358,Isabella,Miller,1955-08-16,2023-10-11,5380.24,,cf uc: 1
4871,4526,Yasin,Kılıç,1973-09-03,2013-12-17,5153.50,,cf uc: 1
4928,2209,Joshua,Taylor,2000-01-14,2014-09-16,4555.27,,cf uc: 1


[2023-12-20 11:05:30.] TABLE successfully saved at: ./datev.dbo.employee.csv (SHAPE: (5003, 8))


#### Add Records

In [121]:
add_record_count = random.choice(range(0, 4)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_employee = pd_add_rows(
    df=df_db_datev_employee,
    df_md=df_md_employee,
    nrows=add_record_count
)

tbl_db_datev_employee.save(df_db_datev_employee)

New Records (nrows=1):


Unnamed: 0,id,firstname,lastname,birthdate,entry_date,salary,leave_date,change_field
5845,5846,Leila,Al-Farsi,1960-09-12,2015-08-16,5057.46,NaT,cf


[2023-12-20 11:05:30.] TABLE successfully saved at: ./datev.dbo.employee.csv (SHAPE: (5003, 8))


### dbo.department

#### Create if not existing

In [129]:
df_md_department.head()

Unnamed: 0,id,name,change_field
0,1,Administration,cf
1,2,Brand Management,cf
2,3,Business Development,cf
3,4,Community Relations,cf
4,5,Corporate Affairs,cf


In [132]:
INIT_DEPARTMENT_COUNT = 10
n_costcenter_sample = int(INIT_DEPARTMENT_COUNT / 3)
df_init = df_md_department.sample(5)
df_db_datev_clients[["id"]] \
    .rename(columns={"id": "client_id"}) \
    .sample(n_costcenter_sample) \
    .join(df_init, how="")
# CROSS JOIN!!!
# https://stackoverflow.com/questions/34161978/pandas-two-dataframe-cross-join

Unnamed: 0,client_id
9,48
1,85
7,37
6,71
10,79


In [124]:
INIT_DEPARTMENT_COUNT = 5

df_init = df_md_department.sample(INIT_DEPARTMENT_COUNT)
if not tbl_db_datev_department.is_existing:
    
    tbl_db_datev_department.save(df_init)


df_db_datev_department = tbl_db_datev_department.read()
df_db_datev_department.head()

Unnamed: 0,id,name,change_field
0,27,Internal Audit,cf
1,34,Procurement,cf
2,47,Sustainability,cf
3,26,Innovation,cf
4,36,Project Management,cf uc: 1


#### Update Records

In [126]:
update_record_count = random.choice(range(0, 4))

df_db_datev_department = pd_update_rows(
    df=df_db_datev_department,
    nrows=update_record_count
)

tbl_db_datev_department.save(df_db_datev_department)

Updated Records (nrows=2):


Unnamed: 0,id,name,change_field
0,27,Internal Audit,cf uc: 1
1,34,Procurement,cf uc: 1


[2023-12-20 11:05:30.] TABLE successfully saved at: ./datev.dbo.department.csv (SHAPE: (7, 3))


#### Add Records

In [125]:
add_record_count = random.choice(range(0, 2)) if ADD_RECORDS_THIS_RUN else 0

df_db_datev_department = pd_add_rows(
    df=df_db_datev_department,
    df_md=df_md_department,
    nrows=add_record_count
)

tbl_db_datev_department.save(df_db_datev_department)

New Records (nrows=1):


Unnamed: 0,id,name,change_field
22,23,Human Resources,cf


[2023-12-20 11:05:30.] TABLE successfully saved at: ./datev.dbo.department.csv (SHAPE: (7, 3))


### dbo.costcenter

#### Create if not existing

In [127]:
df_md_costcenter

Unnamed: 0,id,buKr,costcenter_short,name,change_field
0,1,1735,10000,KSTNAME1,cf
1,2,1735,10550,KSTNAME2,cf
2,3,1735,11100,KSTNAME3,cf
3,4,1735,11650,KSTNAME4,cf
4,5,1735,12200,KSTNAME5,cf
...,...,...,...,...,...
2185,2186,1749,87550,KSTNAME2186,cf
2186,2187,1749,88100,KSTNAME2187,cf
2187,2188,1749,88650,KSTNAME2188,cf
2188,2189,1749,89200,KSTNAME2189,cf


In [None]:
INIT_CLIENTS_COUNT = 10

df_init = df_md_clients.sample(INIT_CLIENTS_COUNT)
if not tbl_db_datev_clients.is_existing:
    tbl_db_datev_clients.save(df_init)


df_db_datev_clients = tbl_db_datev_clients.read()
df_db_datev_clients.head()

#### Update Records

#### Add Records

### dbo.paytype [Frozen Master Data Table]

#### Create if not existing

### dbo.employee_pay [Transaction Table]

#### Create if not existing

#### Update Records

#### Add Records

#### Delete Records