In [427]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell executes so edits to local .py files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [428]:
# Run the project's DBUpdater once before any data is read below.
# NOTE(review): what update() actually refreshes is defined in utils.DBUpdater,
# not in this notebook — presumably it syncs the MongoDB collections queried
# later; confirm, and note that this executes on every "Run All".
from utils.DBUpdater import DBUpdater
DBUpdater().update()


# Data Formatting

In [429]:
from urllib.parse import urljoin
from dotenv import load_dotenv
from datetime import *
from bson import ObjectId
import pandas as pd
import numpy as np
import requests
import random
import torch
import json
import os
load_dotenv()

True

In [430]:
# MongoDB credentials come from the process environment (load_dotenv() has
# already run in the imports cell). os.getenv returns None for an unset variable.
username, password = (
    os.getenv(env_name) for env_name in ("mongodb_user", "mongodb_password")
)

In [431]:
from modules.database_tables import Database
from modules.database_tables.Orders import Orders
from modules.database_tables.Products import Products
from modules.database_tables.TaskCards import TaskCards
from modules.database_tables.ProductOperations import Operations

In [432]:
class GSHubAPI:
    """Builds endpoint URLs for the GSHub "dinand" API.

    Args:
        domain: Base URL the endpoint paths are joined onto. Defaults to the
            host this notebook previously hard-coded, so ``GSHubAPI()`` keeps
            producing the same URLs.
    """

    def __init__(self, domain: str = "https://premo.gshub.nl/") -> None:
        # urljoin() replaces the last path segment of a base URL that has no
        # trailing slash, so normalise the base to end in exactly one "/".
        self.domain: str = domain.rstrip("/") + "/"

    def learn_url(self, days_in_past: int) -> str:
        """Return the "learn" endpoint URL.

        Args:
            days_in_past: Interpolated as the final path segment
                (presumably a look-back window in days — confirm against the
                API).
        """
        return urljoin(self.domain, f"api/dinand/learn/{days_in_past}")

    @property
    def play_url(self) -> str:
        """URL of the "play" endpoint."""
        return urljoin(self.domain, "api/dinand/play/")

In [433]:
# Resolve the "play" endpoint URL once from a default-constructed GSHubAPI.
play_url = GSHubAPI().play_url

In [434]:
# Build the generic Database handle and one wrapper per collection, all with the
# same credentials read from the environment above.
# NOTE(review): presumably each constructor opens its own MongoDB connection —
# confirm in modules.database_tables. `db` is not referenced in the cells
# visible here.
db = Database(username, password)
orders = Orders(username, password)
products = Products(username, password)
taskcards = TaskCards(username, password)
operations = Operations(username, password)

In [435]:
def safe_to_datetime(x):
    """Parse ``x`` as a ``%Y-%m-%dT%H:%M:%S.%fZ`` timestamp when possible.

    Returns whatever ``pd.to_datetime`` yields for ``x`` under that format. If
    the call raises ``ValueError`` or ``TypeError``, ``x`` is handed back
    untouched, which makes the function safe to apply to arbitrary cells.
    """
    iso_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    try:
        parsed = pd.to_datetime(x, format=iso_format)
    except (ValueError, TypeError):
        return x
    return parsed

In [436]:
# Fetch all orders, products, taskcards, and operations in one go
all_orders = orders.get_all()
all_products = products.find_many({})
all_taskcards = taskcards.find_many({})
all_operations = operations.find_many({})

# Convert to DataFrames, giving each collection's `_id` a distinct name so the
# identifiers stay distinguishable once the frames are merged.
ordersDF = pd.DataFrame(all_orders)
productsDF = pd.DataFrame(all_products).rename(columns={"_id": "prod_id"})
taskcardsDF = pd.DataFrame(all_taskcards).rename(columns={"_id": "taskcard_id"})
operationsDF = pd.DataFrame(all_operations).rename(columns={"_id": "operation_id"})

# Merge products with orders (inner join). Order columns whose names collide
# with product columns receive the `_order` suffix.
productsDF = productsDF.merge(ordersDF, left_on="order_id", right_on="_id", suffixes=('', '_order'))

# Replace the product's `order_id` reference (joined against the order `_id`)
# with the `order_id` value that came from ordersDF.
productsDF = productsDF.drop(columns=['order_id']).rename(columns={"order_id_order": "order_id"})

# Merge taskcards with operations on (product_id, sort_order).
# `operation_id` is carried through the merge itself. Assigning
# operationsDF['operation_id'] afterwards would align on the row index, and the
# merged frame has a fresh index, so IDs would be paired by position rather
# than by key. The operation's `task_duration` collides with the taskcard's
# and arrives as `task_duration_new`; it is kept as `task_duration_full`.
taskcardsDF = taskcardsDF.merge(
    operationsDF[['product_id', 'sort_order', 'task_duration', 'operation_id']],
    on=['product_id', 'sort_order'],
    suffixes=('', '_new')
).rename(columns={'task_duration_new': 'task_duration_full'})

# Re-number sort_order per product as a dense, zero-based integer rank.
taskcardsDF["sort_order"] = taskcardsDF.groupby("product_id")["sort_order"].rank(method="dense") - 1
taskcardsDF["sort_order"] = taskcardsDF["sort_order"].astype(int)

# Merge taskcards with products; `prod_id` duplicates `product_id` afterwards.
dataDF = pd.merge(taskcardsDF, productsDF, left_on="product_id", right_on="prod_id")
dataDF = dataDF.drop(columns=["prod_id"])

# Apply safe_to_datetime element-wise; values it cannot parse are left as-is.
dataDF = dataDF.map(safe_to_datetime)

# Summarise the resulting columns and shape.
print(f" \n| ->\t{' - '.join(dataDF.columns)} \n| ->\t{dataDF.shape}")

 
| ->	taskcard_id - task_title - task_duration - start_at - end_at - sort_order - workspace - product_id - task_duration_full - operation_id - material - color - _id - order_id - delivery_date - order_created_at 
| ->	(927, 16)


In [437]:
# Preview the first two rows of the merged frame.
dataDF.head(2)

Unnamed: 0,taskcard_id,task_title,task_duration,start_at,end_at,sort_order,workspace,product_id,task_duration_full,operation_id,material,color,_id,order_id,delivery_date,order_created_at
0,66fa6dd5f13c44fa6af4d6db,Vullen,3600,2024-07-01 09:00:00,2024-07-01 10:00:00,0,Soweco,66fa6dcdf13c44fa6af4d585,3600,66fa6dd8f13c44fa6af4da7a,Katoenen draagtas gekleurd,6,66fa6dcdf13c44fa6af4d584,1,2024-07-13 14:28:51,2024-07-08 14:28:51
1,66fa6dd5f13c44fa6af4d6dc,Tampon drukken (TB),1440,2024-07-08 09:00:00,2024-07-08 09:24:00,1,T2 1|2|3,66fa6dcdf13c44fa6af4d585,1440,66fa6dd8f13c44fa6af4da7b,Katoenen draagtas gekleurd,6,66fa6dcdf13c44fa6af4d584,1,2024-07-13 14:28:51,2024-07-08 14:28:51


In [438]:
def safe_to_timestamp(x):
    """Convert a datetime value to POSIX seconds; pass anything else through.

    Naive datetimes are interpreted as UTC. Timezone-aware datetimes keep their
    own offset, so the returned epoch value refers to the same instant (the
    zone is no longer overwritten with UTC).

    Args:
        x: Any value. Only ``datetime`` instances (including subclasses) are
            converted.

    Returns:
        ``float`` seconds since the epoch for datetimes; otherwise ``x``
        unchanged. ``x`` is also returned unchanged if the conversion raises
        ``ValueError`` or ``TypeError``.
    """
    if not isinstance(x, datetime):
        return x
    try:
        # A datetime is naive when it has no tzinfo or its tzinfo yields no
        # offset; only then is UTC attached. `timezone.utc` is used instead of
        # the `UTC` alias because the alias only exists on Python 3.11+.
        if x.tzinfo is None or x.utcoffset() is None:
            x = x.replace(tzinfo=timezone.utc)
        return x.timestamp()
    except (ValueError, TypeError):
        return x

In [444]:
# Keep only the order id and the task window, rename the window columns, and
# turn datetime cells into epoch seconds. Every step returns a new frame, so
# dataDF itself is left untouched.
outputDF = (
    dataDF[['order_id', 'start_at', 'end_at']]
    .rename(columns={"start_at": "start", "end_at": "end"})
    .map(safe_to_timestamp)
)
outputDF.head()

Unnamed: 0,order_id,start,end
0,1,1719824000.0,1719828000.0
1,1,1720429000.0,1720431000.0
2,1,1720688000.0,1720689000.0
3,1,1720775000.0,1720776000.0
4,3,1720602000.0,1720608000.0


# Model