In [3]:
# Database connection
# Credentials are read from environment variables so that no secret is stored
# in the notebook. ERPNEXT_DB_PASSWORD is required; the remaining settings fall
# back to the values this notebook used before.
# NOTE: a password was previously committed in this cell; it should be treated
# as compromised and rotated.
import os
from urllib.parse import quote

user = os.environ.get("ERPNEXT_DB_USER", "erpnext_sql")
password = os.environ.get("ERPNEXT_DB_PASSWORD")
if not password:
    raise RuntimeError(
        "Set the ERPNEXT_DB_PASSWORD environment variable before running this notebook."
    )
db = os.environ.get("ERPNEXT_DB_NAME", "_abde597b15b69fc8")
host = os.environ.get("ERPNEXT_DB_HOST", "erpnext.responsible-ai.net")
port = os.environ.get("ERPNEXT_DB_PORT", "3333")

# Percent-encode user and password: characters such as "@", "/" or ":" would
# otherwise be parsed as URL delimiters.
engine = create_engine(
    f"mysql+pymysql://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{db}"
)


In [4]:
# Load dataset
# One row per combination of purchase order, order item and receipt line.
# LEFT JOINs keep orders that have no items or no receipt yet (their dates come
# back as NULL).
# NOTE(review): both the item join and the receipt-item join match on the order
# name only, so an order with several items and several receipt lines yields
# every item x receipt-line combination -- confirm this is intended.
purchase_order_query = """
    SELECT po.name, po.supplier, po.total_qty, po.transaction_date as order_date,
           po_items.schedule_date, po_receipt.posting_date as delivered_date,
           po.set_warehouse, po_items.item_name
    FROM `tabPurchase Order` po
    LEFT JOIN `tabPurchase Order Item` po_items ON po.name = po_items.parent
    LEFT JOIN `tabPurchase Receipt Item` po_receipt_item ON po.name = po_receipt_item.purchase_order
    LEFT JOIN `tabPurchase Receipt` po_receipt ON po_receipt_item.parent = po_receipt.name
    """
data_sql_full = pd.read_sql(purchase_order_query, con=engine)

In [5]:
# Data preprocessing
# 1. Keep only rows that have both a scheduled and a delivered date.
# 2. Parse both date columns as datetimes.
# 3. Derive the targets: `late` (delivered strictly after schedule) and
#    `days_late` (whole days between delivery and schedule).
data_sql_full = (
    data_sql_full
    .dropna(subset=["delivered_date", "schedule_date"])
    .assign(
        delivered_date=lambda frame: pd.to_datetime(frame["delivered_date"]),
        schedule_date=lambda frame: pd.to_datetime(frame["schedule_date"]),
    )
    .assign(
        late=lambda frame: frame["delivered_date"] > frame["schedule_date"],
        days_late=lambda frame: (frame["delivered_date"] - frame["schedule_date"]).dt.days,
    )
)

In [18]:
# Preview the first five rows of the preprocessed frame.
# NOTE: the saved output below already contains `order_month`, which is only
# added by a later cell, so it was produced out of order; a fresh top-to-bottom
# run will not show that column here.
data_sql_full.head(n=5)

Unnamed: 0,name,supplier,total_qty,order_date,schedule_date,delivered_date,set_warehouse,item_name,late,days_late,order_month
1,PUR-ORD-2024-08-00002,Beans Inc.,750.0,2023-04-24,2023-04-29,2023-05-04,Naples - RR,Excelsa,True,5,4
3,PUR-ORD-2024-08-00004,Aromatico,1000.0,2022-10-26,2022-11-10,2022-11-29,Amsterdam - RR,Maragogype,True,19,10
4,PUR-ORD-2024-08-00005,Aromatico,500.0,2022-03-18,2022-04-05,2022-04-24,London - RR,Maragogype Type B,True,19,3
5,PUR-ORD-2024-08-00006,Farmers of Brazil,100.0,2023-12-01,2023-12-15,2023-12-23,Naples - RR,Robusta,True,8,12
6,PUR-ORD-2024-08-00007,Farmers of Brazil,750.0,2024-05-17,2024-05-29,2024-05-29,Hamburg - RR,Maragogype,False,0,5


In [19]:
# Check datatypes: the date columns should be datetime64, `late` bool and
# `days_late` an integer.
# NOTE: the saved output below lists `order_month`, which is created in a later
# cell, so this cell was last executed out of order.
data_sql_full.dtypes


name                      object
supplier                  object
total_qty                float64
order_date        datetime64[ns]
schedule_date     datetime64[ns]
delivered_date    datetime64[ns]
set_warehouse             object
item_name                 object
late                        bool
days_late                  int64
order_month                int32
dtype: object

In [6]:
# Parse order_date as datetime; the month feature and the date-based
# train/test split below both rely on it.
data_sql_full = data_sql_full.assign(
    order_date=pd.to_datetime(data_sql_full["order_date"])
)

In [7]:
# Seasonality feature: calendar month of the order date, stored as
# `order_month` and one-hot encoded into `d_month_<n>` columns.
data_sql_full = data_sql_full.assign(
    order_month=lambda frame: frame["order_date"].dt.month
)
d_months = pd.get_dummies(data_sql_full["order_month"], prefix="d_month")

In [8]:
# One-hot encode the categorical columns (supplier, warehouse, item name).
# Each (source column, prefix) pair produces one dummy frame, in this order.
d_suppliers, d_warehouses, d_item_names = (
    pd.get_dummies(data_sql_full[source_column], prefix=dummy_prefix)
    for source_column, dummy_prefix in (
        ("supplier", "d_supplier"),
        ("set_warehouse", "d_warehouse"),
        ("item_name", "d_item"),
    )
)

In [9]:
# Combine all features
# Append every dummy frame to the base data column-wise; the model inputs are
# the order quantity plus all dummy columns (supplier, warehouse, item, month).
dummy_frames = [d_suppliers, d_warehouses, d_item_names, d_months]
data_for_model = pd.concat([data_sql_full, *dummy_frames], axis="columns")
features = ["total_qty"] + [
    column for frame in dummy_frames for column in frame.columns
]


In [10]:
# Train/test split by time: orders placed on or before the split date are
# used for training, orders placed after it for testing.
split_date = pd.to_datetime("2023-03-01")
order_dates = data_for_model["order_date"]
train = data_for_model.loc[order_dates.le(split_date)]
test = data_for_model.loc[order_dates.gt(split_date)]


In [11]:
# Classifier model with hyperparameter tuning
# Predicts the boolean `late` target from `features`.
rf_classifier = RandomForestClassifier(random_state=1234, n_jobs=-1)

# Hyperparameter search space (36 combinations); the regressor cell further
# down reuses this dictionary.
param_dist = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# random_state fixes which 10 candidates are sampled from the search space;
# without it the chosen model, the exported pickle and the reported metrics
# change from run to run.
random_search = RandomizedSearchCV(
    rf_classifier,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=1234,
)
random_search.fit(train[features], train["late"])
best_rf_classifier = random_search.best_estimator_

# Evaluate the refitted best model on the held-out (later) orders.
predictions = best_rf_classifier.predict(test[features])
accuracy = accuracy_score(test["late"], predictions)
# Rows are actual classes, columns are predicted classes, both ordered
# [True, False] (late first).
# NOTE(review): in the recorded run about 95% of test rows are late
# (4431 of 4676), so accuracy sits close to the majority-class baseline;
# consider also reporting a metric that accounts for class imbalance.
cm = confusion_matrix(test["late"], predictions, labels=[True, False])


In [16]:
# Report the classifier's test accuracy and confusion matrix, one item per line.
print(f"Classifier Accuracy: {accuracy}", "Confusion Matrix:", cm, sep="\n")


Classifier Accuracy: 0.9634302822925578
Confusion Matrix:
[[4412   19]
 [ 152   93]]


In [12]:
# Export classifier model
# Serialises the tuned classifier into the current working directory.
# Pickle files can execute code when loaded: only load this file from a
# trusted location.
export_name = "best_rf_classifier.pkl"
with open(export_name, mode="wb") as model_file:
    pickle.dump(best_rf_classifier, model_file)

In [13]:
# Regressor model for predicting days late with hyperparameter tuning
# Uses the same `features` and the `param_dist` search space defined in the
# classifier cell, so that cell must have been run first.
rf_regressor = RandomForestRegressor(random_state=1234, n_jobs=-1)

# random_state fixes which 10 candidates are sampled from the search space;
# without it the chosen model, the exported pickle and the reported errors
# change from run to run.
random_search = RandomizedSearchCV(
    rf_regressor,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=1234,
)
random_search.fit(train[features], train["days_late"])
best_rf_regressor = random_search.best_estimator_

# Evaluate the refitted best model on the held-out (later) orders.
predictions_rf = best_rf_regressor.predict(test[features])
mse_rf = mean_squared_error(test["days_late"], predictions_rf)
rmse_rf = np.sqrt(mse_rf)  # same unit as the target: days
mae_rf = mean_absolute_error(test["days_late"], predictions_rf)



In [17]:
# Report the regressor's test errors (in days), one metric per line.
print(f"Regressor RMSE: {rmse_rf}", f"Regressor MAE: {mae_rf}", sep="\n")

Regressor RMSE: 1.9800719756956784
Regressor MAE: 1.571402978224983


In [14]:
# Export regressor model
# Serialises the tuned regressor into the current working directory.
# Pickle files can execute code when loaded: only load this file from a
# trusted location.
export_name = "best_rf_regressor.pkl"
with open(export_name, mode="wb") as model_file:
    pickle.dump(best_rf_regressor, model_file)

In [15]:
# Print metrics
# Summary of both models: classifier accuracy and confusion matrix, then the
# regressor RMSE.
print(
    f"Classifier Accuracy: {accuracy}",
    f"Confusion Matrix:\n{cm}",
    f"Regressor RMSE: {rmse_rf}",
    sep="\n",
)

Classifier Accuracy: 0.9634302822925578
Confusion Matrix:
[[4412   19]
 [ 152   93]]
Regressor RMSE: 1.9800719756956784
