In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE(review): the wildcard import hides which names this notebook relies on.
# `DBUpdater` (used on the next line) is presumably one of them — consider
# importing the required names explicitly.
from utils.DBUtils import *
# Run the project's database updater before any data is read below.
# What exactly gets refreshed is defined in utils.DBUtils — TODO confirm.
DBUpdater().update()


# Data Formatting

In [3]:
from urllib.parse import urljoin
from dotenv import load_dotenv
from datetime import *
from bson import ObjectId
import pandas as pd
import numpy as np
import requests
import random
import torch
import json
import os
# Load variables from a .env file into the process environment so that the
# os.getenv lookups further down can find the MongoDB credentials.
load_dotenv()

True

In [4]:
# MongoDB credentials are read from the environment; either value is None
# when the corresponding variable is not set.
username = os.environ.get("mongodb_user")
password = os.environ.get("mongodb_password")

In [5]:
from modules.database_tables import Database
from modules.database_tables.Orders import Orders
from modules.database_tables.Products import Products
from modules.database_tables.TaskCards import TaskCards
from modules.database_tables.ProductOperations import Operations

In [6]:
class GSHubAPI:
    """Build endpoint URLs for the GSHub "dinand" API.

    The class only assembles URL strings with ``urljoin``; it performs no
    network requests itself.
    """

    # Base URL used when the caller does not supply one.
    DEFAULT_DOMAIN: str = "https://premo.gshub.nl/"

    def __init__(self, domain: str = DEFAULT_DOMAIN) -> None:
        """Store the base URL that every endpoint is joined onto.

        Args:
            domain: Base URL of the API host. Defaults to ``DEFAULT_DOMAIN``.
                A trailing slash is appended when missing.
        """
        # urljoin replaces the last path segment of a base that does not end
        # in "/", so normalise here to keep any path prefix of a custom domain.
        self.domain: str = domain if domain.endswith("/") else f"{domain}/"

    def learn_url(self, days_in_past: int) -> str:
        """Return the "learn" endpoint URL with ``days_in_past`` as its last path segment."""
        return urljoin(self.domain, f"api/dinand/learn/{days_in_past}")

    @property
    def play_url(self) -> str:
        """Return the "play" endpoint URL."""
        return urljoin(self.domain, "api/dinand/play/")

In [36]:
# Resolve the "play" endpoint URL from the domain configured in GSHubAPI.
play_url = GSHubAPI().play_url

'https://premo.gshub.nl/api/dinand/play/'

In [8]:
# Instantiate the database wrapper and the per-table helper classes from
# modules.database_tables, all with the same credentials read from the
# environment. The table helpers are the inputs to json_from_database below.
db = Database(username, password)
orders = Orders(username, password)
products = Products(username, password)
taskcards = TaskCards(username, password)
operations = Operations(username, password)

In [9]:
def safe_to_datetime(x):
    """Best-effort parse of ``x`` as a ``%Y-%m-%dT%H:%M:%S.%fZ`` timestamp.

    Args:
        x: Value to convert, handed to ``pd.to_datetime`` with that fixed format.

    Returns:
        Whatever ``pd.to_datetime`` produces for ``x``; if it raises
        ``ValueError`` or ``TypeError``, ``x`` itself is returned unchanged.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    try:
        parsed = pd.to_datetime(x, format=timestamp_format)
    except (ValueError, TypeError):
        # Conversion is optional: values pandas rejects are passed through.
        return x
    else:
        return parsed

In [10]:
def safe_to_int(x):
    """Convert ``x`` to ``int`` when possible, otherwise return it unchanged.

    Args:
        x: Value to convert. ``ObjectId`` instances are converted by parsing
            their string form as a base-16 number; everything else goes
            through ``int(x)``.

    Returns:
        The integer value, or ``x`` itself when the conversion raises
        ``ValueError``, ``TypeError`` or ``OverflowError``.
    """
    try:
        if isinstance(x, ObjectId):
            # The string form of the id is parsed as hexadecimal.
            return int(str(x), 16)
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float('inf')) / int(float('-inf')), which
        # would otherwise escape a helper that is meant never to raise.
        return x

In [11]:
# Build the records that flatten_orders consumes from the four table helpers.
# NOTE(review): json_from_database is neither defined nor explicitly imported
# in the visible cells — presumably it arrives via the
# `from utils.DBUtils import *` wildcard; confirm.
data = json_from_database(orders, products, taskcards, operations)

In [12]:
def flatten_orders(orders):
    """Flatten nested order records into one DataFrame row per product operation.

    Each order is expected to be a dict-like object exposing ``order_id``,
    ``delivery_date``, ``order_created_at`` and a ``products`` list. Each
    product carries ``material``, ``color``, ``product_operations`` and
    ``task_cards``. An operation is paired with the task card that has the
    same ``sort_order``; when no such card exists, ``start_at`` and ``end_at``
    are left empty. If several task cards share a ``sort_order``, the last one
    wins.

    Args:
        orders: Iterable of order dicts. Missing keys are read as ``None``;
            missing or ``None`` nested lists are treated as empty.

    Returns:
        pandas.DataFrame with the columns ``order_id``, ``delivery_date``,
        ``order_created_at``, ``material``, ``color``, ``task_title``,
        ``task_duration``, ``sort_order``, ``workspace``, ``start_at`` and
        ``end_at``. The four date columns are converted with
        ``pd.to_datetime(errors='coerce')``, so unparseable values become NaT.
        The columns are present even when there are no rows.
    """
    columns = [
        'order_id', 'delivery_date', 'order_created_at', 'material', 'color',
        'task_title', 'task_duration', 'sort_order', 'workspace',
        'start_at', 'end_at',
    ]
    date_columns = ['delivery_date', 'order_created_at', 'start_at', 'end_at']

    flattened_data = []
    for order in orders:
        order_id = order.get('order_id')
        delivery_date = order.get('delivery_date')
        order_created_at = order.get('order_created_at')

        # `or []` also covers keys that are present but explicitly None.
        for product in order.get('products') or []:
            material = product.get('material')
            color = product.get('color')

            # Index the task cards by sort_order for direct lookup per operation.
            task_card_map = {
                tc.get('sort_order'): tc
                for tc in product.get('task_cards') or []
            }

            for operation in product.get('product_operations') or []:
                sort_order = operation.get('sort_order')
                task_card = task_card_map.get(sort_order, {})

                flattened_data.append({
                    'order_id': order_id,
                    'delivery_date': delivery_date,
                    'order_created_at': order_created_at,
                    'material': material,
                    'color': color,
                    'task_title': operation.get('task_title'),
                    'task_duration': operation.get('task_duration'),
                    'sort_order': sort_order,
                    'workspace': operation.get('workspace'),
                    'start_at': task_card.get('start_at'),
                    'end_at': task_card.get('end_at'),
                })

    # Passing the column list keeps the schema stable for empty input; without
    # it an empty frame has no columns and the date conversion raises KeyError.
    df = pd.DataFrame(flattened_data, columns=columns)

    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    return df

In [29]:
# Flatten the nested records into one row per product operation and preview
# the first rows.
df = flatten_orders(data)
df.head()

Unnamed: 0,order_id,delivery_date,order_created_at,material,color,task_title,task_duration,sort_order,workspace,start_at,end_at
0,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Expeditie,720,60,Expeditie,2024-07-12 09:00:00+00:00,2024-07-12 09:12:00+00:00
1,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Vullen,3600,20,Soweco,2024-07-01 09:00:00+00:00,2024-07-01 10:00:00+00:00
2,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Inpakken,180,50,Inpak,2024-07-11 09:00:00+00:00,2024-07-11 09:03:00+00:00
3,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Tampon drukken (TB),1440,41,T2 1|2|3,2024-07-08 09:00:00+00:00,2024-07-08 09:24:00+00:00
4,3,2024-07-24 00:00:00+00:00,2024-07-09 08:11:10+00:00,Avant-knijper,13,Vullen,6120,20,Soweco,2024-07-10 09:00:00+00:00,2024-07-10 10:42:00+00:00


In [35]:
# Hours between a task's scheduled start and the order's delivery date.
# NOTE(review): "util" in the column name presumably means "until"; the name is
# kept because later cells select the column by this exact key.
df['time_util_delivery'] = (df['delivery_date'] - df['start_at']).dt.total_seconds() / 3600
# Hours between order creation and the task's scheduled start. This is negative
# when a task card starts before the order's creation timestamp.
df['duration_since_order_created'] = (df['start_at'] - df['order_created_at']).dt.total_seconds() / 3600
df.head()

Unnamed: 0,order_id,delivery_date,order_created_at,material,color,task_title,task_duration,sort_order,workspace,start_at,end_at,time_util_delivery,duration_since_order_created
0,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Expeditie,720,60,Expeditie,2024-07-12 09:00:00+00:00,2024-07-12 09:12:00+00:00,29.480833,90.519167
1,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Vullen,3600,20,Soweco,2024-07-01 09:00:00+00:00,2024-07-01 10:00:00+00:00,293.480833,-173.480833
2,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Inpakken,180,50,Inpak,2024-07-11 09:00:00+00:00,2024-07-11 09:03:00+00:00,53.480833,66.519167
3,1,2024-07-13 14:28:51+00:00,2024-07-08 14:28:51+00:00,Katoenen draagtas gekleurd,6,Tampon drukken (TB),1440,41,T2 1|2|3,2024-07-08 09:00:00+00:00,2024-07-08 09:24:00+00:00,125.480833,-5.480833
4,3,2024-07-24 00:00:00+00:00,2024-07-09 08:11:10+00:00,Avant-knijper,13,Vullen,6120,20,Soweco,2024-07-10 09:00:00+00:00,2024-07-10 10:42:00+00:00,327.0,24.813889


In [None]:
# Feature matrix: the two hour-based columns derived from the schedule timestamps.
X = df[['time_util_delivery', 'duration_since_order_created' ]]

# Model