In [1]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [None]:
# Database connection settings.
# Values are read from environment variables so real credentials never have to
# be typed into (or committed with) the notebook. When a variable is unset the
# notebook's original literal is used, so existing behavior is unchanged.
#   ERPNEXT_DB_USER, ERPNEXT_DB_PASSWORD, ERPNEXT_DB_NAME,
#   ERPNEXT_DB_HOST, ERPNEXT_DB_PORT
user = os.environ.get("ERPNEXT_DB_USER", "erpnext_sql")
password = os.environ.get("ERPNEXT_DB_PASSWORD", "")
db = os.environ.get("ERPNEXT_DB_NAME", "")
host = os.environ.get("ERPNEXT_DB_HOST", "")
port = os.environ.get("ERPNEXT_DB_PORT", "3333")  # kept as a string, as before

In [3]:
# Build the connection URL from its parts instead of interpolating an f-string:
# URL.create escapes characters such as "@", ":", "/" or "%" in the user name
# and password, which would otherwise corrupt the connection string.
# Pass the raw (not pre-escaped) password.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=user,
        password=password,
        host=host,
        port=int(port) if port else None,  # URL.create expects an integer port
        database=db,
    )
)

In [4]:

# Load the purchase order headers (one row per row of `tabPurchase Order`).
# In this notebook the frame is only used to check the row count via `.shape`.
purchase_orders = pd.read_sql("""
    Select name, 
            supplier, 
            schedule_date,                 
            total_qty, 
            transaction_date, 
            set_warehouse
    FROM `tabPurchase Order`
    """, engine)



In [5]:
# (rows, columns) of the purchase order extract
purchase_orders.shape

(17181, 6)

### Dataset

In [6]:
# Build the modelling dataset: purchase order headers joined to their order
# items, and to purchase receipt items / receipts that reference the order.
# - All joins are LEFT JOINs, so orders without a receipt are kept with NULL
#   receipt columns (they are dropped later when delivered_date is required).
# - `delivered_date` is the posting date of the purchase receipt.
# NOTE(review): receipt items are matched on the purchase order only
# (po.name = po_receipt_item.purchase_order), not on the individual order item.
# An order with several items and/or several receipts would therefore yield
# one row per item x receipt-item combination. Presumably the receipt item also
# carries a reference to the order item that could tighten this join — confirm
# against the schema before changing the query.
data_sql_full = pd.read_sql("""
     Select 
        po.name, 
        po.supplier,            
        po.total_qty, 
        po.transaction_date, 
        po.set_warehouse,
        po_items.item_name, 
        po_items.schedule_date,
        po_items.rate,         
        po_items.qty as quantity_ordered, 
        po_items.warehouse,
        po_receipt_item.parent as receipt_item_parent,
        po_receipt.posting_date as delivered_date
FROM `tabPurchase Order` po
LEFT JOIN `tabPurchase Order Item` po_items  
	ON po.name = po_items.parent
LEFT JOIN `tabPurchase Receipt Item` po_receipt_item
	ON po.name = po_receipt_item.purchase_order
LEFT JOIN `tabPurchase Receipt` po_receipt
	ON po_receipt_item.parent =  po_receipt.name 
;
""",
engine)

In [7]:
# Preview the first rows of the joined dataset
data_sql_full.head()

Unnamed: 0,name,supplier,total_qty,transaction_date,set_warehouse,item_name,schedule_date,rate,quantity_ordered,warehouse,receipt_item_parent,delivered_date
0,PUR-ORD-2024-08-00001,Beans Inc.,750.0,2020-06-14,Naples - RR,Excelsa,2020-06-25,30.0,750.0,Naples - RR,MAT-PRE-2024-00001,2020-06-25
1,PUR-ORD-2024-08-00002,Beans Inc.,750.0,2023-04-24,Naples - RR,Excelsa,2023-04-29,30.0,750.0,Naples - RR,MAT-PRE-2024-00002,2023-05-04
2,PUR-ORD-2024-08-00003,Fair Trade AG,750.0,2024-09-08,Nairobi - RR,Arabica,2024-09-21,24.0,750.0,Nairobi - RR,,
3,PUR-ORD-2024-08-00004,Aromatico,1000.0,2022-10-26,Amsterdam - RR,Maragogype,2022-11-10,40.0,1000.0,Amsterdam - RR,MAT-PRE-2024-00003,2022-11-29
4,PUR-ORD-2024-08-00005,Aromatico,500.0,2022-03-18,London - RR,Maragogype Type B,2022-04-05,45.0,500.0,London - RR,MAT-PRE-2024-00004,2022-04-24


In [8]:
# Size of the joined dataset as (rows, columns), before any rows are dropped
data_sql_full.shape


(17181, 12)

In [9]:
# Check the column dtypes as loaded from SQL; the date columns are converted
# to datetimes further down before any date arithmetic is done
data_sql_full.dtypes


name                    object
supplier                object
total_qty              float64
transaction_date        object
set_warehouse           object
item_name               object
schedule_date           object
rate                   float64
quantity_ordered       float64
warehouse               object
receipt_item_parent     object
delivered_date          object
dtype: object

In [10]:
# Keep only rows where both the delivery date and the scheduled date are known;
# the lateness target cannot be computed without them.
data_sql_full = data_sql_full.dropna(subset=["delivered_date", "schedule_date"])

In [11]:
# Parse the two date columns into pandas datetimes so they can be subtracted
data_sql_full = data_sql_full.assign(
    delivered_date=lambda frame: pd.to_datetime(frame["delivered_date"]),
    schedule_date=lambda frame: pd.to_datetime(frame["schedule_date"]),
)

In [12]:

# Target variable: whole days between scheduled and actual delivery
# (positive = delivered after the scheduled date, negative = delivered early)
data_sql_full["days_late"] = (
    data_sql_full["delivered_date"]
    .sub(data_sql_full["schedule_date"])
    .dt.days
)


In [13]:

# Average number of days late over all rows that have a delivery date
data_sql_full["days_late"].mean()


np.float64(10.406653394130114)

In [14]:


# Average days late per supplier, listed from the latest to the most punctual
data_sql_full.groupby('supplier')["days_late"].mean().sort_values(ascending=False)


In [None]:

# One-hot encode the supplier: one indicator column per supplier value,
# with column names prefixed by "d"
d_suppliers = pd.get_dummies(data_sql_full['supplier'], prefix="d")


In [None]:

# Attach the supplier indicator columns to the right of the cleaned dataset
data_for_model = pd.concat(
    objs=[data_sql_full, d_suppliers],
    axis="columns",
)


In [None]:

# Model inputs: the ordered quantity followed by every supplier indicator column
features = ["quantity_ordered", *d_suppliers.columns]


In [None]:

# Init Random Forest
rf = RandomForestRegressor(n_estimators=1000, random_state=1234)


In [None]:

# Train model
rf.fit(data_for_model[features], data_for_model["days_late"])


In [None]:

# Predict
predictions = rf.predict(data_for_model[features])


In [None]:

# Get mean squared error
mean_squared_error(data_for_model["days_late"], predictions)


In [None]:

# Serialize the fitted model to disk with pickle.
# NOTE(review): the file name says "gbm_500" although the object written is the
# random forest held in `rf` — confirm the intended name before renaming, since
# consumers may load the model by this path.
export_name = "gbm_500.pkl"
with open(export_name, mode="wb") as model_file:
    pickle.dump(rf, model_file)
