In [None]:
%load_ext autoreload
%autoreload 2

In [4312]:
from utils.UpdateDatabase import *

# Data Formatting

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from urllib.parse import urljoin
from dotenv import load_dotenv
from datetime import *
from bson import ObjectId
import pandas as pd
import numpy as np
import requests
import random
import torch
import json
import os
load_dotenv()

In [4314]:
from utils.FlattenData import *
from modules.PremoAPI import *
from utils.SafeDataConverters import *

In [4315]:
api = PremoAPI()

In [4316]:
# Read the MongoDB credentials from the environment; either value is None when its
# variable is unset.
username, password = (
    os.getenv(env_key) for env_key in ("mongodb_user", "mongodb_password")
)

In [4317]:
from modules.database import Database
db = Database(username, password)

In [4318]:
play_data: list[dict] = api.play_data

In [4319]:
# Flatten the database dump twice, once with flatten_output and once with flatten_input.
# NOTE(review): db.to_json() is evaluated separately for each call. If it queries the
# database it could be fetched once and reused - confirm first that the flatten_*
# helpers do not mutate their argument.
flat_out = flatten_output(db.to_json())
flat_in = flatten_input(db.to_json())

# Workspace Detection Model

In [None]:
# Build the output frame ordered by order and task position.
out_df = pd.DataFrame(flat_out).sort_values(by=['order_id', 'sort_order'])

# Re-number sort_order inside every (order, product) group as a zero-based dense rank.
sort_order_by_product = out_df.groupby(['order_id', 'product_id'])['sort_order']
out_df['sort_order'] = sort_order_by_product.transform(
    lambda positions: positions.rank(method='dense').astype(int) - 1
)
out_df.head()

In [None]:
# Parse the four timestamp columns from their ISO-8601 'Z'-suffixed string form.
timestamp_cols = ['delivery_date', 'order_created_at', 'start_at', 'end_at']
out_df[timestamp_cols] = out_df[timestamp_cols].apply(pd.to_datetime, format='%Y-%m-%dT%H:%M:%S.%fZ')

# Derived durations, both expressed in hours.
out_df['duration_since_order_created'] = (
    (out_df['start_at'] - out_df['order_created_at']).dt.total_seconds() / 3600
)
out_df['time_until_delivery'] = (
    (out_df['delivery_date'] - out_df['start_at']).dt.total_seconds() / 3600
)
out_df.head()

In [None]:

# Preview the enriched frame. `df` is only assigned in a later cell, so referencing it
# here raises NameError on a fresh kernel (Restart & Run All); `out_df` is the frame
# that exists at this point.
out_df.head()

In [4323]:
# Input column names grouped under two keys.
# NOTE(review): 'init' / 'addit' presumably stand for initial vs. additional features -
# confirm. No other cell visible in this notebook reads this dict.
X_columns = {
    'init': ['task_title'],
    'addit': ['material', 'color', 'sort_order']
}

In [4324]:
# Target column names: the workspace label plus the two hour-based durations derived on
# out_df. NOTE(review): not referenced by any other cell visible in this notebook.
y_columns = ['workspace', 'duration_since_order_created', 'time_until_delivery']

In [None]:
# Keep only the modelling columns and discard every row with a missing value in any of them.
model_columns = [
    'task_title', 'delivery_date', 'order_created_at', 'material', 'color',
    'sort_order', 'task_duration', 'start_at', 'end_at', 'workspace',
    'duration_since_order_created', 'time_until_delivery',
]
df = out_df[model_columns].dropna()
df.head()

In [4326]:
def preprocess_inputs(df):
    """Build the numeric input matrix for the scheduling model.

    The caller's frame is left untouched: every feature is computed on derived
    Series instead of being written back onto ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide datetime columns ``delivery_date`` and ``order_created_at``
        and a numeric ``task_duration`` column.

    Returns
    -------
    np.ndarray
        One row per input row with the columns, in order:
        ``delivery_offset`` (hours from order creation to delivery, divided by
        the column maximum), ``task_duration`` (divided by the column maximum),
        ``order_day_of_week`` (weekday / 6, Monday = 0) and
        ``order_time_of_day`` ((hour-of-day - 9) / 8).
    """
    def _scale_by_max(values):
        # Divide by the column maximum. A maximum of exactly 0 would turn the column
        # into NaN/inf, so the values are returned unscaled in that case.
        peak = values.max()
        if peak == 0:
            return values
        return values / peak

    created = df['order_created_at']

    # Time between order creation and delivery, in hours.
    delivery_offset = (df['delivery_date'] - created).dt.total_seconds() / 3600.0

    # Fractional hour of the day at which the order was created.
    time_of_day = created.dt.hour + created.dt.minute / 60.0

    features = pd.DataFrame(
        {
            'delivery_offset': _scale_by_max(delivery_offset),
            'task_duration': _scale_by_max(df['task_duration']),
            # Weekdays range from 0 (Monday) to 6.
            'order_day_of_week': created.dt.weekday / 6.0,
            # Maps 09:00 -> 0 and 17:00 -> 1; times outside that window fall outside
            # [0, 1]. NOTE(review): presumably a 9-to-5 working day - confirm.
            'order_time_of_day': (time_of_day - 9) / 8.0,
        },
        index=df.index,
    )
    return features.values

In [4327]:
def preprocess_outputs(df):
    """Build the target matrix ``[date_offset, start_time]`` for the scheduling model.

    The caller's frame is not modified. Rows whose ``start_at`` or
    ``order_created_at`` cannot be parsed as a timestamp are dropped, so the
    result may have fewer rows than ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide ``start_at`` and ``order_created_at``; values are parsed
        with ``pd.to_datetime(..., errors='coerce')``.

    Returns
    -------
    np.ndarray
        One row per retained input row with the columns, in order:
        ``date_offset`` (calendar days between order creation and task start,
        never less than 1) and ``start_time`` (fractional hour of the day at
        which the task starts).
    """
    # Parse into a fresh frame; unparseable values become NaT and are dropped.
    stamps = pd.DataFrame({
        'start_at': pd.to_datetime(df['start_at'], errors='coerce'),
        'order_created_at': pd.to_datetime(df['order_created_at'], errors='coerce'),
    }).dropna(subset=['start_at', 'order_created_at'])

    start_at = stamps['start_at']
    created = stamps['order_created_at']

    # Calendar-day difference. normalize() truncates to midnight while keeping a
    # datetime dtype; subtracting `.dt.date` values instead gives an object-dtype
    # Series that has no `.dt` accessor.
    date_offset = (start_at.dt.normalize() - created.dt.normalize()).dt.days
    date_offset = date_offset.clip(lower=1)  # offset is at least 1

    # Start time as a fractional hour of the day.
    start_time = start_at.dt.hour + start_at.dt.minute / 60.0

    return pd.DataFrame({'date_offset': date_offset, 'start_time': start_time}).values


In [4328]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4329]:
class ColumnTransformer:
    """Per-column one-hot encoding / min-max scaling that keeps DataFrame labels.

    Each name in ``categorical_features`` gets its own ``OneHotEncoder`` and each
    name in ``numerical_features`` its own ``MinMaxScaler``. Listed features that
    are absent from the frame are skipped; columns that are not listed are passed
    through unchanged, after the transformed ones.

    Note: this class has the same name as ``sklearn.compose.ColumnTransformer``
    and replaces it in the notebook namespace once this cell has run.
    """

    def __init__(self, categorical_features: list[str], numerical_features: list[str]):
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        # feature name -> fitted OneHotEncoder
        self.one_hot_encoders = {}
        # feature name -> fitted MinMaxScaler
        self.scalers = {}
        # Column names produced by transform() for the fitted features.
        self._feature_names_out = []
        # Names of the listed features that were present when fit() ran.
        self._feature_names_in = []

    def fit(self, X: pd.DataFrame) -> None:
        """Fit one encoder/scaler per listed feature that is present in ``X``.

        Any previously fitted state is discarded first, so calling ``fit`` again
        does not duplicate feature names or keep stale encoders.
        """
        self.reset()

        for feature in self.categorical_features:
            if feature not in X.columns:
                continue
            ohe = OneHotEncoder(sparse_output=False)
            ohe.fit(X[[feature]])
            self.one_hot_encoders[feature] = ohe
            self._feature_names_out.extend(ohe.get_feature_names_out([feature]))
            self._feature_names_in.append(feature)

        for feature in self.numerical_features:
            if feature not in X.columns:
                continue
            scaler = MinMaxScaler()
            scaler.fit(X[[feature]])
            self.scalers[feature] = scaler
            self._feature_names_in.append(feature)
            self._feature_names_out.append(feature)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Encode/scale the listed features of ``X`` and pass other columns through.

        Raises
        ------
        KeyError
            If a listed feature is present in ``X`` but was not fitted.
        ValueError
            If there is nothing to return (``X`` has no columns).
        """
        pieces = []
        for feature in self.categorical_features:
            if feature not in X.columns:
                continue
            ohe = self.one_hot_encoders[feature]
            pieces.append(pd.DataFrame(ohe.transform(X[[feature]]),
                                       columns=ohe.get_feature_names_out([feature]),
                                       index=X.index))

        for feature in self.numerical_features:
            if feature not in X.columns:
                continue
            scaled = self.scalers[feature].transform(X[[feature]])
            pieces.append(pd.DataFrame(scaled, columns=[feature], index=X.index))

        # Columns that were not fitted are appended unchanged.
        for feature in X.columns:
            if feature not in self._feature_names_in:
                pieces.append(X[[feature]])

        if not pieces:
            raise ValueError("No objects to concatenate")

        return pd.concat(pieces, axis=1)

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``X`` and return its transformed form."""
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Map encoded/scaled columns of ``X`` back to the original features.

        The encoded columns of a categorical feature are taken from its fitted
        encoder's output names (in encoder order) rather than matched by name
        prefix, so an unrelated column such as ``color_code`` is never fed to the
        ``color`` encoder. Features without a fitted encoder/scaler, or whose
        columns are absent from ``X``, are skipped; every column that is not a
        fitted output column is passed through unchanged.

        Raises
        ------
        ValueError
            If there is nothing to return.
        """
        pieces = []
        for feature in self.categorical_features:
            ohe = self.one_hot_encoders.get(feature)
            if ohe is None:
                continue
            encoded_cols = [col for col in ohe.get_feature_names_out([feature])
                            if col in X.columns]
            if not encoded_cols:
                continue
            decoded = ohe.inverse_transform(X[encoded_cols])
            pieces.append(pd.DataFrame(decoded, columns=[feature], index=X.index))

        for feature in self.numerical_features:
            scaler = self.scalers.get(feature)
            if scaler is None or feature not in X.columns:
                continue
            restored = scaler.inverse_transform(X[[feature]])
            pieces.append(pd.DataFrame(restored, columns=[feature], index=X.index))

        for feature in X.columns:
            if feature not in self._feature_names_out:
                pieces.append(X[[feature]])

        if not pieces:
            raise ValueError("No objects to concatenate")

        return pd.concat(pieces, axis=1)

    def reset(self) -> None:
        """Forget all fitted encoders, scalers and recorded feature names."""
        self.one_hot_encoders = {}
        self.scalers = {}
        self._feature_names_out = []
        self._feature_names_in = []

    def get_feature_names_out(self) -> list[str]:
        """Return the output column names of the fitted features."""
        return self._feature_names_out

    def get_feature_names_in(self) -> list[str]:
        """Return the listed features that were present at fit time."""
        return self._feature_names_in
    

In [4330]:
# Columns handed to the ColumnTransformer for one-hot encoding.
categorical_features = ['task_title', 'material', 'color', 'workspace']
# Earlier selection of columns to min-max scale, kept for reference:
# numerical_features = ['duration_since_order_created', 'time_until_delivery']
# Currently no column is scaled.
numerical_features = []

In [4331]:
ct = ColumnTransformer(categorical_features, numerical_features)

In [None]:
# Inputs: every column of df except the two derived durations and the target timestamps.
target_columns = ['start_at', 'end_at']
X = df.drop(columns=['duration_since_order_created', 'time_until_delivery'] + target_columns)

# Targets: an independent copy of the raw start/end timestamps.
y = df[target_columns].copy()
y.head()

In [4333]:
# Fit the per-column encoders on X and replace X with its encoded form.
# NOTE(review): X is overwritten with a function of itself, so re-running this cell feeds
# the already-encoded frame back into fit_transform. Re-run the cell that builds X first.
X = ct.fit_transform(X)
# y = ct.fit_transform(y)

In [4334]:
# Re-coerce the task start/end columns to datetimes; unparseable values become NaT.
for ts_col in ('start_at', 'end_at'):
    out_df[ts_col] = pd.to_datetime(out_df[ts_col], errors='coerce')

In [None]:
# Prepare the data.
# preprocess_outputs drops rows with a missing timestamp while preprocess_inputs does
# not, so feeding both the unfiltered out_df can produce X and y with different row
# counts. Filter once on the columns the two helpers read, then hand each helper its own
# copy so that neither can modify the other's input.
model_rows = out_df.dropna(subset=['delivery_date', 'order_created_at', 'task_duration', 'start_at'])
X = preprocess_inputs(model_rows.copy())
y = preprocess_outputs(model_rows.copy())

In [None]:
# Train the model on X / y and print predictions for a held-out frame.
# NOTE(review): this cell relies on names that no earlier visible cell defines:
#   - `Model` is presumably supplied by `from model import *`, which appears in a LATER cell;
#   - `df_test` is never assigned anywhere in the visible notebook.
# On a fresh kernel (Restart & Run All) the cell therefore fails with NameError.

# Extract lists for order_created_at and task_duration
# NOTE(review): these two lists are not used by any later visible cell.
order_created_at_list = df['order_created_at'].tolist()
task_duration_list = df['task_duration'].tolist()

# Initialize and train the model
model_manager = Model()
trained_model, history = model_manager.train(X, y)

# Make predictions
X_test = preprocess_inputs(df_test)  # Assuming df_test is your test DataFrame
order_created_at_test = df_test['order_created_at'].tolist()
task_duration_test = df_test['task_duration'].tolist()

# NOTE(review): predict() receives three arguments here but a single one in the final
# prediction cell of this notebook - confirm the intended signature.
predictions = model_manager.predict(X_test, order_created_at_test, task_duration_test)

# Display predictions
for start_at, end_at in predictions:
    print(f"Start At: {start_at}, End At: {end_at}")

In [None]:
X.shape

In [None]:
y.shape

In [3964]:
from model import *

In [3965]:
agent = Model()

In [3966]:
agent.num_epochs = 100
# agent.learning_rate = 0.001
# agent.batch_size = 32

In [None]:
model, history = agent.train(X, y)

In [3968]:
import matplotlib.pyplot as plt

In [None]:
# Plot total loss: training curve first, validation curve second.
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [3970]:
play_flat = flatten_input(play_data)

In [None]:
# Build the play-data frame: sort by order and task position, then let pandas pick
# nullable dtypes via convert_dtypes().
play_df = pd.DataFrame(play_flat).sort_values(by=['order_id', 'sort_order'], ascending=True).convert_dtypes()
# Re-number sort_order inside each (order_id, material) group as a zero-based dense rank.
# NOTE(review): the training frame groups by (order_id, product_id) for the same step -
# confirm that grouping by material is intended here.
play_df['sort_order'] = play_df.groupby(['order_id', 'material'])['sort_order'].transform(lambda x: x.rank(method='dense').astype(int) - 1)
# Re-sort the rows of every group by the new sort_order and discard the group index.
play_df = play_df.groupby(['order_id', 'material']).apply(lambda x: x.sort_values(by='sort_order')).reset_index(drop=True)
# Apply safe_to_datetime to every cell of the frame (element-wise DataFrame.map).
play_df = play_df.map(safe_to_datetime)
play_df.head()

In [None]:
# Select the four categorical_features plus sort_order. sort_order is not a listed
# feature, so ct.transform passes it through unchanged.
X_play = play_df[['task_title', 'material', 'color', 'sort_order', 'workspace']]
X_play.head()

In [None]:
X_play = ct.transform(X_play)
X_play.head()

In [None]:
# Predict start/end values for the play data and label them as a two-column frame.
# NOTE(review): X_play is the one-hot encoded frame, whereas the visible training cell
# passes agent.train() the matrix returned by preprocess_inputs - confirm that the
# feature layout matches what the model was trained on.
pred = agent.predict(X_play)
# 'start_at' / 'end_at' are not among ct's categorical or numerical features, so
# inverse_transform returns these columns unchanged.
pred = ct.inverse_transform(pd.DataFrame(pred, columns=['start_at', 'end_at'], index=X_play.index))
pred